### Process all of the .csv files provided, combining all of the individual datasets into a single dataset for analysis

clearly demonstrate how you have chosen to deal with any data wrangling issues (e.g. missing data, unrecognised activity codes, incompatible date/time formats).

Ensure that different date/time formats are resolved, returning error messages or handling exceptions if incompatible .csv files are provided)

In [1]:
import numpy as np
import pandas as pd
import datetime

In [2]:
def read_and_merge_csvs(folder_name):
    from os import listdir
    from os.path import isfile, join

    try:
        csv_filenames = [f for f in listdir(folder_name) if '.csv' in f]
    
    except Exception as e:
        print("Error during reading csvs:", e)
    
    try:
        df = pd.concat([pd.read_csv(join(folder_name, f)) for f in csv_filenames])
        
    except Exception as e:
        print("Error during processing csvs:", e)
        
    return df

In [3]:
df = read_and_merge_csvs('data')

In [4]:
print(np.array(df.columns).reshape(-1, 3)) # finding wrongly named column names

[['PrimaryActivityCode' 'SecondaryActivityCode' 'DescriptiveLabel']
 ['StartDate' 'UniBathWeekNo' 'DayOfWeek']
 ['StartTime' 'EndTime' 'DurationMins']
 ['EnjoymentScore' 'DistractionScore' 'MultiTaskingScore']
 ['AnxietyStressLevel' 'Outside' 'Unnamed: 0']
 ['AdditionalData1' 'AdditionalData2' 'SocialEngagement']
 ['TimeExpectations' 'peace of mind score' 'productivity']
 ['DayofWeek' 'Snack' 'Healthiness']
 ['SocialMediaPlatformUsed' 'SleepQuality' 'TirednessScore']
 ['TaskCompletionScore' 'QualityofSleep' 'EnergyRating']
 ['AnxietyLevel' 'FocusLevel' 'DaydreamingScore']
 ['SocialMediaUsage' 'ImportanceLevel' 'pressure_score']
 ['quality_score' 'StressLevel' 'efficiency index of the activity']
 ['number of times we pick up phone ' 'Indoors' 'Screentime']
 ['Interesting' 'ConcentrationLevel' 'DayOfWeeks']
 ['Calorie' 'Degree of concentration'
  'Did you take rest/study/screen break during productive hours?']
 ['If you took break, what was the duration?' 'Timestamp'
  'Do you approve to

In [5]:
df = df.loc[:, :"EnjoymentScore"] # removing further columns

In [6]:
df

Unnamed: 0,PrimaryActivityCode,SecondaryActivityCode,DescriptiveLabel,StartDate,UniBathWeekNo,DayOfWeek,StartTime,EndTime,DurationMins,EnjoymentScore
0,CW982,,Reading a Book,2021-03-02,23.0,Tuesday,00:00,00:30,30.0,0.0
1,S801,,Sleeping,2021-03-02,23.0,Tuesday,00:45,07:30,405.0,2.0
2,H179,O733,,2021-03-02,23.0,Tuesday,07:45,08:30,45.0,0.0
3,ED152,L418,Breakfast,2021-03-02,23.0,Tuesday,08:45,09:15,30.0,2.0
4,CW982,,,2021-03-02,23.0,Tuesday,09:15,10:15,60.0,0.0
...,...,...,...,...,...,...,...,...,...,...
39,L418,UD415,playing switch game,2020-03-09,24.0,Tuesday,20:00:00,21:15:00,75.0,1.0
40,L418,UD415,using moblie,2020-03-09,24.0,Tuesday,21:15:00,22:15:00,60.0,1.0
41,O733,,preparing for sleep,2020-03-09,24.0,Tuesday,22:15:00,22:45:00,30.0,0.0
42,L418,UD415,watching videos,2020-03-09,24.0,Tuesday,22:45:00,23:45:00,60.0,1.0


#### finding missing/unrecognised data in the wrangled dataset

In [7]:
# Check missing/unrecognised Primary/SecondaryActivityCode
# capitalise and remove spaces in letters of activity codes
df["PrimaryActivityCode"] = df["PrimaryActivityCode"].str.upper()
df["PrimaryActivityCode"] = df["PrimaryActivityCode"].str.strip()
df["SecondaryActivityCode"] = df["SecondaryActivityCode"].str.upper()
df["SecondaryActivityCode"] = df["SecondaryActivityCode"].str.strip()

print("Converted Primary and Secondary activity code strings into upper letter")

# drop rows that has NaN primary codes
df = df.drop(pd.isnull(df["PrimaryActivityCode"]))

print("Dropped nan primary codes")

# check if primary code is missing
# Change secondary code to nan if Primary code == secondary code in same row
df['SecondaryActivityCode'].loc[df['PrimaryActivityCode'] == df['SecondaryActivityCode']] = np.nan

print("Converted secondary activity codes to NaN where it is the same to primary")

# change empty string ' ' in secondary code to np.nan
index = np.where(df['SecondaryActivityCode'].apply(lambda x: (x == ' ') or (x == '')))
df['SecondaryActivityCode'].iloc[index] = np.nan

print("Converted empty strings to NaN")


Converted Primary and Secondary activity code strings into upper letter
Dropped nan primary codes
Converted secondary activity codes to NaN where it is the same to primary
Converted empty strings to NaN


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
# verify
print(np.where(df['SecondaryActivityCode'].apply(lambda x: x == ' ')))
print(df[df['PrimaryActivityCode'] == df['SecondaryActivityCode']])

(array([], dtype=int64),)
Empty DataFrame
Columns: [PrimaryActivityCode, SecondaryActivityCode, DescriptiveLabel, StartDate, UniBathWeekNo, DayOfWeek, StartTime, EndTime, DurationMins, EnjoymentScore]
Index: []


In [9]:
# check if any invalid codes exist (not in the code list)
lst_ActivityCode = ['ED145', 'EL642', 'X893', 'H179', 'L418', 'R523', 'S801',
                          'T695', 'UD415', 'PW101', 'CW982', 'O733']

lst_invalid_PrimaryActivityCode = np.unique([i for i in df["PrimaryActivityCode"] if (i not in lst_ActivityCode)])
# don't drop rows with nan values in secondaryactivitycode using i==i (return False if nan)
lst_invalid_SecondaryActivityCode = np.unique([i for i in df["SecondaryActivityCode"] if (i not in lst_ActivityCode) and (i==i)])


print("invalid PrimaryActivityCode:\n", lst_invalid_PrimaryActivityCode)
print("invalid SecondaryActivityCode:\n", lst_invalid_SecondaryActivityCode)

# drop rows that contain invalid codes
primary_row_labels = [(i in lst_invalid_PrimaryActivityCode) or (i!=i) for i in df["PrimaryActivityCode"]]
secondary_row_labels = [i in lst_invalid_SecondaryActivityCode for i in df["SecondaryActivityCode"]]
row_labels = np.logical_or(primary_row_labels, secondary_row_labels)
row_labels = [not i for i in row_labels]  # valid list
df = df.iloc[row_labels]

print("Dropped rows with invalid activity codes")


invalid PrimaryActivityCode:
 ['CW892' 'DI297' 'ED152' 'EL162' 'LE452' 'R253' 'SHOPPING' 'UB415' 'nan']
invalid SecondaryActivityCode:
 ['CHR' 'ED152' 'ENT' 'ESS' 'OTH' 'PRO' 'SA114' 'SA151']
Dropped rows with invalid activity codes


  
  from ipykernel import kernelapp as app


In [10]:
# verify if I haven't deleted rows with NaN in secondary activity code
print("NaN in Primary:", any(pd.isnull(df["PrimaryActivityCode"])))
print("NaN in Secondary:", any(pd.isnull(df["SecondaryActivityCode"])))


lst_invalid_PrimaryActivityCode = np.unique([i for i in df["PrimaryActivityCode"] if (i not in lst_ActivityCode)])
# don't drop rows with nan values in secondaryactivitycode using i==i (return False if nan)
lst_invalid_SecondaryActivityCode = np.unique([i for i in df["SecondaryActivityCode"] if (i not in lst_ActivityCode) and (i==i)])


print("invalid PrimaryActivityCode:", lst_invalid_PrimaryActivityCode)
print("invalid SecondaryActivityCode:", lst_invalid_SecondaryActivityCode)



NaN in Primary: False
NaN in Secondary: True
invalid PrimaryActivityCode: []
invalid SecondaryActivityCode: []


In [11]:
df.reset_index(drop=True)

Unnamed: 0,PrimaryActivityCode,SecondaryActivityCode,DescriptiveLabel,StartDate,UniBathWeekNo,DayOfWeek,StartTime,EndTime,DurationMins,EnjoymentScore
0,H179,O733,,2021-03-02,23.0,Tuesday,07:45,08:30,45.0,0.0
1,CW982,,,2021-03-02,23.0,Tuesday,09:15,10:15,60.0,0.0
2,CW982,,,2021-03-02,23.0,Tuesday,10:15,11:30,75.0,0.0
3,L418,,Watching Television,2021-03-02,23.0,Tuesday,11:45,12:15,30.0,1.0
4,EL642,UD415,Online lecture,2021-03-02,23.0,Tuesday,12:15,13:15,60.0,1.0
...,...,...,...,...,...,...,...,...,...,...
2700,L418,UD415,playing switch game,2020-03-09,24.0,Tuesday,20:00:00,21:15:00,75.0,1.0
2701,L418,UD415,using moblie,2020-03-09,24.0,Tuesday,21:15:00,22:15:00,60.0,1.0
2702,O733,,preparing for sleep,2020-03-09,24.0,Tuesday,22:15:00,22:45:00,30.0,0.0
2703,L418,UD415,watching videos,2020-03-09,24.0,Tuesday,22:45:00,23:45:00,60.0,1.0


In [12]:
# Check StartDate
# Check the date format is correct
# Check the date is in correct time frame (Feb~March 2021)
print('Added StartDateTime and EndDateTime columns that merge date and time values')

df['StartDate'] = pd.to_datetime(df['StartDate'], errors='coerce')
print("Converted StartTime string to datetime format")

# drop rows where date is nan
df = df.loc[df['StartDate'] == df['StartDate']]
# df['StartDate'] = pd.to_datetime(df['StartDate'], errors='coerce')
print("Dropped rows with NaN dates")

Added StartDateTime and EndDateTime columns that merge date and time values
Converted StartTime string to datetime format
Dropped rows with NaN dates


In [13]:
df['StartDate'].unique()

array(['2021-03-02T00:00:00.000000000', '2021-03-03T00:00:00.000000000',
       '2021-03-04T00:00:00.000000000', '2021-08-03T00:00:00.000000000',
       '2021-09-03T00:00:00.000000000', '2021-10-03T00:00:00.000000000',
       '2021-03-08T00:00:00.000000000', '2021-03-09T00:00:00.000000000',
       '2021-03-10T00:00:00.000000000', '2021-03-07T00:00:00.000000000',
       '2020-09-03T00:00:00.000000000', '2020-10-03T00:00:00.000000000',
       '2020-11-03T00:00:00.000000000', '2021-06-03T00:00:00.000000000',
       '2021-07-03T00:00:00.000000000', '2021-11-03T00:00:00.000000000',
       '2021-12-03T00:00:00.000000000', '2021-01-03T00:00:00.000000000',
       '2020-01-03T00:00:00.000000000', '2021-02-03T00:00:00.000000000',
       '2021-04-03T00:00:00.000000000', '2021-05-03T00:00:00.000000000',
       '2021-03-01T00:00:00.000000000', '2021-02-04T00:00:00.000000000',
       '2021-03-11T00:00:00.000000000', '2021-03-06T00:00:00.000000000',
       '2021-02-25T00:00:00.000000000', '2021-02-26

There are some StartDate values with swapped month and date. It is likely that 2021-06-03 is actually 2021-03-06. This issue needs to be addressed as dropping these rows will reduce the amount of data significantly.

In [14]:
def month_date_swapper(StartDate):
    
    import datetime as dt
    
    months = np.array(StartDate.dt.month)
    dates = np.array(StartDate.dt.day)
    
    for i, date in enumerate(StartDate):
#         print(date, months[i], dates[i], ((months[i] != 2) and (months[i] != 3)), ((dates[i] == 2) or (dates[i] == 3)))
        if ((months[i] != 2) and (months[i] != 3)) and ((dates[i] == 2) or (dates[i] == 3)):
            StartDate.iloc[i] = dt.datetime.strftime(date, "%Y-%d-%m %H:%M:%S")
    
    StartDate = pd.to_datetime(StartDate, errors='coerce')        
    return StartDate

In [15]:
df['StartDate'] = month_date_swapper(df['StartDate'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [16]:
df['StartDate'].unique()

array(['2021-03-02T00:00:00.000000000', '2021-03-03T00:00:00.000000000',
       '2021-03-04T00:00:00.000000000', '2021-03-08T00:00:00.000000000',
       '2021-03-09T00:00:00.000000000', '2021-03-10T00:00:00.000000000',
       '2021-03-07T00:00:00.000000000', '2020-03-09T00:00:00.000000000',
       '2020-03-10T00:00:00.000000000', '2020-03-11T00:00:00.000000000',
       '2021-03-06T00:00:00.000000000', '2021-03-11T00:00:00.000000000',
       '2021-03-12T00:00:00.000000000', '2021-03-01T00:00:00.000000000',
       '2020-03-01T00:00:00.000000000', '2021-02-03T00:00:00.000000000',
       '2021-03-05T00:00:00.000000000', '2021-02-04T00:00:00.000000000',
       '2021-02-25T00:00:00.000000000', '2021-02-26T00:00:00.000000000',
       '2021-02-27T00:00:00.000000000', '2021-02-23T00:00:00.000000000',
       '2021-02-24T00:00:00.000000000', '2021-02-02T00:00:00.000000000',
       '2019-02-08T00:00:00.000000000', '2021-02-28T00:00:00.000000000',
       '2020-03-07T00:00:00.000000000', '2020-03-08

In [17]:
mask = ((df['StartDate'] >= '2021-02-19') & (df['StartDate'] <= '2021-03-12')) # date that this coursework was set and due
df = df.loc[mask]
print("Dropped rows where date is outside the range of coursework dates")

Dropped rows where date is outside the range of coursework dates


In [18]:
df.reset_index(drop=True)

Unnamed: 0,PrimaryActivityCode,SecondaryActivityCode,DescriptiveLabel,StartDate,UniBathWeekNo,DayOfWeek,StartTime,EndTime,DurationMins,EnjoymentScore
0,H179,O733,,2021-03-02,23.0,Tuesday,07:45,08:30,45.0,0.0
1,CW982,,,2021-03-02,23.0,Tuesday,09:15,10:15,60.0,0.0
2,CW982,,,2021-03-02,23.0,Tuesday,10:15,11:30,75.0,0.0
3,L418,,Watching Television,2021-03-02,23.0,Tuesday,11:45,12:15,30.0,1.0
4,EL642,UD415,Online lecture,2021-03-02,23.0,Tuesday,12:15,13:15,60.0,1.0
...,...,...,...,...,...,...,...,...,...,...
2379,H179,,Cleaning House,2021-03-09,24.0,Tuesday,10:00:00,11:15:00,75.0,-2.0
2380,ED145,,Cooking and Eating,2021-03-09,24.0,Tuesday,11:30:00,13:00:00,90.0,0.0
2381,L418,,Games,2021-03-09,24.0,Tuesday,13:15:00,15:15:00,120.0,2.0
2382,S801,,missing,2021-03-09,24.0,Tuesday,15:30:00,17:00:00,90.0,2.0


In [19]:
# Check UniBathWeekNo
# Check the week range is between 20~25 and integer
df['UniBathWeekNo'].unique()

array([23., 24., 22., 10., 25., nan])

In [20]:
# drop NaN and values not between 20-25
mask = (df['UniBathWeekNo'] >= 20) & (df['UniBathWeekNo'] <= 25)
df = df[mask]

In [21]:
df.reset_index(drop=True)

Unnamed: 0,PrimaryActivityCode,SecondaryActivityCode,DescriptiveLabel,StartDate,UniBathWeekNo,DayOfWeek,StartTime,EndTime,DurationMins,EnjoymentScore
0,H179,O733,,2021-03-02,23.0,Tuesday,07:45,08:30,45.0,0.0
1,CW982,,,2021-03-02,23.0,Tuesday,09:15,10:15,60.0,0.0
2,CW982,,,2021-03-02,23.0,Tuesday,10:15,11:30,75.0,0.0
3,L418,,Watching Television,2021-03-02,23.0,Tuesday,11:45,12:15,30.0,1.0
4,EL642,UD415,Online lecture,2021-03-02,23.0,Tuesday,12:15,13:15,60.0,1.0
...,...,...,...,...,...,...,...,...,...,...
2306,H179,,Cleaning House,2021-03-09,24.0,Tuesday,10:00:00,11:15:00,75.0,-2.0
2307,ED145,,Cooking and Eating,2021-03-09,24.0,Tuesday,11:30:00,13:00:00,90.0,0.0
2308,L418,,Games,2021-03-09,24.0,Tuesday,13:15:00,15:15:00,120.0,2.0
2309,S801,,missing,2021-03-09,24.0,Tuesday,15:30:00,17:00:00,90.0,2.0


In [22]:
# Add StartDateTime and EndDataTime columns
df['StartDateTime'] = df['StartDate'].dt.strftime("%Y-%m-%d") + ' ' + df['StartTime']
df['EndDateTime'] = df['StartDate'].dt.strftime("%Y-%m-%d") + ' ' + df['EndTime']
df['StartDateTime'] = pd.to_datetime(df['StartDateTime'], errors='coerce')
df['EndDateTime'] = pd.to_datetime(df['EndDateTime'], errors='coerce')

print("Added StartDateTime and EndDataTime columns")
print("NaN values in StartDateTime:", any(df['StartDateTime'] != df['StartDateTime']))
print("NaN values in EndDateTime:", any(df['EndDateTime'] != df['EndDateTime']))

Added StartDateTime and EndDataTime columns
NaN values in StartDateTime: True
NaN values in EndDateTime: True


In [23]:
df

Unnamed: 0,PrimaryActivityCode,SecondaryActivityCode,DescriptiveLabel,StartDate,UniBathWeekNo,DayOfWeek,StartTime,EndTime,DurationMins,EnjoymentScore,StartDateTime,EndDateTime
2,H179,O733,,2021-03-02,23.0,Tuesday,07:45,08:30,45.0,0.0,2021-03-02 07:45:00,2021-03-02 08:30:00
4,CW982,,,2021-03-02,23.0,Tuesday,09:15,10:15,60.0,0.0,2021-03-02 09:15:00,2021-03-02 10:15:00
5,CW982,,,2021-03-02,23.0,Tuesday,10:15,11:30,75.0,0.0,2021-03-02 10:15:00,2021-03-02 11:30:00
6,L418,,Watching Television,2021-03-02,23.0,Tuesday,11:45,12:15,30.0,1.0,2021-03-02 11:45:00,2021-03-02 12:15:00
7,EL642,UD415,Online lecture,2021-03-02,23.0,Tuesday,12:15,13:15,60.0,1.0,2021-03-02 12:15:00,2021-03-02 13:15:00
...,...,...,...,...,...,...,...,...,...,...,...,...
14,H179,,Cleaning House,2021-03-09,24.0,Tuesday,10:00:00,11:15:00,75.0,-2.0,2021-03-09 10:00:00,2021-03-09 11:15:00
15,ED145,,Cooking and Eating,2021-03-09,24.0,Tuesday,11:30:00,13:00:00,90.0,0.0,2021-03-09 11:30:00,2021-03-09 13:00:00
16,L418,,Games,2021-03-09,24.0,Tuesday,13:15:00,15:15:00,120.0,2.0,2021-03-09 13:15:00,2021-03-09 15:15:00
17,S801,,missing,2021-03-09,24.0,Tuesday,15:30:00,17:00:00,90.0,2.0,2021-03-09 15:30:00,2021-03-09 17:00:00


In [24]:
mask = (df['StartDateTime'] == df['StartDateTime']) | (df['EndDateTime'] == df['EndDateTime'])
df = df[mask]
print("Dropped rows with invalid datetime values")

Dropped rows with invalid datetime values


In [25]:
df.reset_index(drop=True)

Unnamed: 0,PrimaryActivityCode,SecondaryActivityCode,DescriptiveLabel,StartDate,UniBathWeekNo,DayOfWeek,StartTime,EndTime,DurationMins,EnjoymentScore,StartDateTime,EndDateTime
0,H179,O733,,2021-03-02,23.0,Tuesday,07:45,08:30,45.0,0.0,2021-03-02 07:45:00,2021-03-02 08:30:00
1,CW982,,,2021-03-02,23.0,Tuesday,09:15,10:15,60.0,0.0,2021-03-02 09:15:00,2021-03-02 10:15:00
2,CW982,,,2021-03-02,23.0,Tuesday,10:15,11:30,75.0,0.0,2021-03-02 10:15:00,2021-03-02 11:30:00
3,L418,,Watching Television,2021-03-02,23.0,Tuesday,11:45,12:15,30.0,1.0,2021-03-02 11:45:00,2021-03-02 12:15:00
4,EL642,UD415,Online lecture,2021-03-02,23.0,Tuesday,12:15,13:15,60.0,1.0,2021-03-02 12:15:00,2021-03-02 13:15:00
...,...,...,...,...,...,...,...,...,...,...,...,...
2294,H179,,Cleaning House,2021-03-09,24.0,Tuesday,10:00:00,11:15:00,75.0,-2.0,2021-03-09 10:00:00,2021-03-09 11:15:00
2295,ED145,,Cooking and Eating,2021-03-09,24.0,Tuesday,11:30:00,13:00:00,90.0,0.0,2021-03-09 11:30:00,2021-03-09 13:00:00
2296,L418,,Games,2021-03-09,24.0,Tuesday,13:15:00,15:15:00,120.0,2.0,2021-03-09 13:15:00,2021-03-09 15:15:00
2297,S801,,missing,2021-03-09,24.0,Tuesday,15:30:00,17:00:00,90.0,2.0,2021-03-09 15:30:00,2021-03-09 17:00:00


In [26]:
df[df.PrimaryActivityCode=='CW982']

Unnamed: 0,PrimaryActivityCode,SecondaryActivityCode,DescriptiveLabel,StartDate,UniBathWeekNo,DayOfWeek,StartTime,EndTime,DurationMins,EnjoymentScore,StartDateTime,EndDateTime
4,CW982,,,2021-03-02,23.0,Tuesday,09:15,10:15,60.0,0.0,2021-03-02 09:15:00,2021-03-02 10:15:00
5,CW982,,,2021-03-02,23.0,Tuesday,10:15,11:30,75.0,0.0,2021-03-02 10:15:00,2021-03-02 11:30:00
14,CW982,,,2021-03-02,23.0,Tuesday,19:15,20:00,45.0,0.0,2021-03-02 19:15:00,2021-03-02 20:00:00
16,CW982,,,2021-03-02,23.0,Tuesday,21:00,23:45,165.0,0.0,2021-03-02 21:00:00,2021-03-02 23:45:00
17,CW982,,,2021-03-03,23.0,Wednesday,00:00,00:30,30.0,0.0,2021-03-03 00:00:00,2021-03-03 00:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...
29,CW982,UD415,,2021-03-03,23.0,Wednesday,18:15,22:15,240.0,-1.0,2021-03-03 18:15:00,2021-03-03 22:15:00
31,CW982,UD415,,2021-03-03,23.0,Wednesday,22:30,23:45,75.0,-2.0,2021-03-03 22:30:00,2021-03-03 23:45:00
6,CW982,UD415,ML Coursework,2021-03-09,24.0,Tuesday,00:00:00,23:45:00,1425.0,1.0,2021-03-09 00:00:00,2021-03-09 23:45:00
12,CW982,UD415,ML Coursework,2021-03-09,24.0,Tuesday,00:00:00,23:45:00,1425.0,1.0,2021-03-09 00:00:00,2021-03-09 23:45:00


In [27]:
# check EnjoymentScore
df.EnjoymentScore.unique()

array([ 0.,  1., nan,  2., -2., -1.])

In [28]:
print(df['StartDateTime'].min())
print(df['EndDateTime'].max())

2021-02-23 08:30:00
2021-03-12 23:45:00


In [29]:
df.to_csv('processed_data.csv', index=False)