In [1]:
import time

import pandas as pd
# Turn off user warnings
import warnings
warnings.filterwarnings('ignore')

# Merge a sleep row with the sleep row that follows less than 60 minutes of interruptions
def checkSleep(df):
    """Run one merge pass over interrupted sleep episodes.

    For every ``'Sleep'`` row the following rows are scanned while their
    cumulative ``'Duration'`` stays below 60.  The first row that brings the
    cumulative duration to 60 or more is the merge candidate: if that row is
    itself ``'Sleep'``, both sleep rows are combined into one and the rows in
    between are counted as interruptions.  In every other case the row is
    copied through unchanged.

    Fixes compared with the previous implementation:

    * rows are collected in a list and the frame is built once
      (``DataFrame.append`` no longer exists in pandas 2.x and was quadratic);
    * the scan stops at the end of the frame instead of raising
      ``IndexError`` when the remaining rows add up to less than 60;
    * the final row is no longer emitted a second time after it has been
      merged into a preceding sleep row;
    * an empty frame yields an empty frame instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per activity, with at least the columns ``'Activity'``,
        ``'Start time'``, ``'Duration'``, ``'Number of interruptions'``,
        ``'Duration of interruptions'``, ``'Day of activity'`` and
        ``'Weekend or weekday'``.  Rows are processed by position; the index
        is ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and a fresh 0..n-1 index.
        Merged rows only carry the seven columns listed above; any other
        column of ``df`` is left missing on those rows.  ``df`` is not
        modified.
    """
    rows = df.to_dict(orient='records')
    n_rows = len(rows)
    merged = []

    i = 0
    while i < n_rows - 1:
        # Calculate progress
        print('{}/{}'.format(i, n_rows - 1), end='\r')

        current = rows[i]
        if current['Activity'] != 'Sleep':
            # Not a sleep activity
            merged.append(current)
            i += 1
            continue

        # Advance j to the first row at which the cumulative duration of rows
        # i+1..j reaches 60; interuption_duration holds the total of the rows
        # strictly between i and j.
        j = i + 1
        interuption_duration = 0
        while j < n_rows and interuption_duration + rows[j]['Duration'] < 60:
            interuption_duration += rows[j]['Duration']
            j += 1

        # j == n_rows means the data ended before the 60 threshold was
        # reached, so there is no merge candidate.
        if j < n_rows and rows[j]['Activity'] == 'Sleep':
            resumed = rows[j]
            merged.append({
                'Activity': 'Sleep',
                'Start time': current['Start time'],
                'Duration': current['Duration'] + resumed['Duration'],
                'Number of interruptions': (
                    current['Number of interruptions']
                    + resumed['Number of interruptions']
                    + (j - i - 1)
                ),
                'Duration of interruptions': (
                    current['Duration of interruptions']
                    + resumed['Duration of interruptions']
                    + interuption_duration
                ),
                'Day of activity': current['Day of activity'],
                'Weekend or weekday': current['Weekend or weekday'],
            })
            i = j + 1
        else:
            # Sleep row that is not followed by a mergeable sleep row
            merged.append(current)
            i += 1

    # The loop never visits the final row on its own; emit it only when it
    # was not already consumed by a merge (i == n_rows in that case).
    if i == n_rows - 1:
        merged.append(rows[i])

    return pd.DataFrame(merged, columns=df.columns)


def find_sleep_interruptions(df_activity, verbose=False):
    """Apply ``checkSleep`` repeatedly until another pass changes nothing.

    The previous version seeded the loop with the constant ``10``: the loop
    was skipped altogether when the initial interruption total happened to be
    10, and it stopped after a single pass when that pass produced a total of
    10.  The loop now always runs at least once and compares each pass with
    the one before it.  Both the row count and the interruption total are
    compared, because a merge of two adjacent sleep rows adds no
    interruptions and would otherwise look like "no change".

    Parameters
    ----------
    df_activity : pandas.DataFrame
        Activity table in the format accepted by ``checkSleep``; must contain
        ``'Number of interruptions'`` (and, when ``verbose`` is set,
        ``'Duration'`` and ``'Duration of interruptions'``).
    verbose : bool, default False
        Print summary totals after every pass.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last ``checkSleep`` pass.
    """
    def snapshot(frame):
        # Cheap signature of a pass result, used to detect a fixed point.
        return len(frame), frame['Number of interruptions'].sum()

    previous = snapshot(df_activity)
    while True:
        df_activity = checkSleep(df_activity)
        current = snapshot(df_activity)

        if verbose:
            print('Sum of durations: ', df_activity['Duration'].sum())
            print('Sum of interruptions: ', df_activity['Number of interruptions'].sum())
            print('Sum of duration of interruptions: ', df_activity['Duration of interruptions'].sum())
            print('Sum of total duration: ', df_activity['Duration'].sum() + df_activity['Duration of interruptions'].sum())
            print('Size of df_activity: ', len(df_activity))

        if current == previous:
            break
        previous = current

    return df_activity


def split_data(path='Annotations.csv', noise_threshold=1, verbose=False):
    """Collapse a per-reading annotation log into one row per activity run.

    The CSV at ``path`` is read row by row.  Whenever the ``'Activity'``
    value differs from the activity of the current run, the run is closed and
    recorded with its start time, its duration in minutes (time from the
    run's first row to the first row of the next run), zeroed interruption
    counters, and the ``'Weekday'`` / ``'Weekend'`` values of its first row.
    Runs whose duration is not greater than ``noise_threshold`` are dropped,
    interrupted sleep runs are merged with ``find_sleep_interruptions``, and
    a ``'Start time short'`` column (the part of ``'Start time'`` after the
    space) is added.

    Durations are computed with pandas timestamps.  The previous version
    went through ``time.mktime``, which interprets the naive timestamps in
    the machine's local time zone, so a run spanning a daylight-saving
    change came out an hour too long or too short.

    Note: the last run in the file is never emitted, because no later row
    closes it.  That behaviour is unchanged.

    Parameters
    ----------
    path : str, default 'Annotations.csv'
        CSV file with the columns ``'DateTime'`` (``YYYY-MM-DD HH:MM:SS`` with
        an optional fractional part, which is discarded), ``'Activity'``,
        ``'Weekday'`` and ``'Weekend'``.
    noise_threshold : number, default 1
        Runs with a duration (minutes) less than or equal to this value are
        removed.
    verbose : bool, default False
        Print summary totals before and during the sleep-merging step.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Activity'``, ``'Start time'``, ``'Duration'``,
        ``'Number of interruptions'``, ``'Duration of interruptions'``,
        ``'Day of activity'``, ``'Weekend or weekday'`` and
        ``'Start time short'``.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    columns = [
        'Activity',
        'Start time',
        'Duration',                   # in minutes
        'Number of interruptions',    # filled in later by the sleep merge
        'Duration of interruptions',  # in minutes
        'Day of activity',
        'Weekend or weekday',
    ]

    def whole_seconds(raw_stamp):
        # Discard the fractional-second suffix, if any.
        return raw_stamp.split('.')[0]

    # Open the csv file
    df = pd.read_csv(path)

    # State of the run that is currently open
    activity = df['Activity'][0]
    start_time = whole_seconds(df['DateTime'][0])
    started_at = pd.to_datetime(start_time, format=timestamp_format)
    weekday = df['Weekday'][0]
    weekend = df['Weekend'][0]

    raw_stamps = df['DateTime'].tolist()
    activities = df['Activity'].tolist()
    weekdays = df['Weekday'].tolist()
    weekends = df['Weekend'].tolist()

    # A run can only be recorded once the next run starts, because its
    # duration is measured up to that next start.
    records = []
    for i in range(len(activities)):
        if activity != activities[i]:
            next_start = whole_seconds(raw_stamps[i])
            next_started_at = pd.to_datetime(next_start, format=timestamp_format)

            records.append({
                'Activity': activity,
                'Start time': start_time,
                'Duration': (next_started_at - started_at).total_seconds() / 60,
                'Number of interruptions': 0,
                'Duration of interruptions': 0,
                'Day of activity': weekday,
                'Weekend or weekday': weekend,
            })

            # Open the next run
            activity = activities[i]
            start_time = next_start
            started_at = next_started_at
            weekday = weekdays[i]
            weekend = weekends[i]

    df_activity = pd.DataFrame(records, columns=columns)

    # Remove noise: runs that are not longer than the threshold
    df_activity = df_activity[df_activity['Duration'] > noise_threshold]

    if verbose:
        print('Sum of durations: ', df_activity['Duration'].sum())
        print('Size of df_activity: ', len(df_activity))

    df_activity = find_sleep_interruptions(df_activity, verbose=verbose)

    # Change time to HH:MM:SS
    df_activity['Start time short'] = df_activity['Start time'].apply(
        lambda stamp: stamp.split(' ')[1]
    )

    return df_activity

In [37]:
df

Unnamed: 0,Activity,Start time,Duration,Number of interruptions,Duration of interruptions,Day of activity,Weekend or weekday
0,Toilet,6.453056,3.633333,0,0.0,Friday,0
1,Other_Activity,6.522500,1.383333,0,0.0,Friday,0
2,Other_Activity,6.550000,4.850000,0,0.0,Friday,0
3,Bathe,6.630833,7.816667,0,0.0,Friday,0
4,Groom,6.761111,4.683333,0,0.0,Friday,0
...,...,...,...,...,...,...,...
1149,Work_On_Computer,-3.272500,27.850000,0,0.0,Sunday,1
1150,Wash_Dinner_Dishes,-3.739167,1.216667,0,0.0,Sunday,1
1151,Work_On_Computer,-3.761944,53.966667,0,0.0,Sunday,1
1152,Personal_Hygiene,-2.668056,2.233333,0,0.0,Sunday,1


In [36]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import time

import pandas as pd

def clean_df(df_activity_clean):
    """Replace ``'Start time'`` with a signed hour-of-day and drop the helper column.

    ``'Start time short'`` (``HH:MM:SS``) is converted to fractional hours
    relative to midnight: times from 00:00:00 to 11:59:59 map to ``[0, 12)``
    and times from 12:00:00 to 23:59:59 map to ``[-12, 0)`` (hours before
    the coming midnight), so late-evening and early-morning times end up
    next to each other.

    The previous formula, ``(hour - 24) + (60 - minute)/60 + (60 - second)/3600``,
    flipped the sign of the minute and second terms: 23:30:00 became -0.483
    instead of -0.5, and within any afternoon hour later times produced
    smaller values.  The value is now simply ``hours - 24``.

    A value that cannot be parsed raises ``ValueError`` naming the offending
    value; previously the error was printed and the function then failed on
    an unbound local variable.

    Parameters
    ----------
    df_activity_clean : pandas.DataFrame
        Must contain ``'Start time short'``.  The frame is modified in place:
        ``'Start time'`` is overwritten (or added) and ``'Start time short'``
        is removed.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    def signed_hours(clock):
        try:
            hour, minute, second = (int(part) for part in clock.split(':')[:3])
        except (AttributeError, ValueError) as exc:
            raise ValueError(
                'Cannot parse start time {!r}; expected HH:MM:SS'.format(clock)
            ) from exc

        hours = hour + minute / 60 + second / 3600
        if hour > 11:
            # Afternoon/evening: express as negative hours before midnight.
            hours -= 24
        return hours

    # Convert everything first so a parse error leaves the frame untouched.
    start_hours = df_activity_clean['Start time short'].apply(signed_hours)

    df_activity_clean['Start time'] = start_hours
    df_activity_clean.drop(columns=['Start time short'], inplace=True)

    return df_activity_clean

def train_test_split(df_activity, split_date='2011-07-15'):
    """Split the activity table chronologically into training and test rows.

    Rows whose ``'Start time'`` is strictly before ``split_date`` form the
    training set; all remaining rows form the test set.  The previous
    version used ``<`` for training and ``>`` for test, so a row stamped
    exactly at the cutoff (midnight of the split date) belonged to neither
    set and was silently lost; it now goes to the test set.

    Note: this function has the same name as
    ``sklearn.model_selection.train_test_split``; importing that name into
    the notebook would replace this function.

    Parameters
    ----------
    df_activity : pandas.DataFrame
        Must contain ``'Start time'`` as ``YYYY-MM-DD HH:MM:SS`` strings.
    split_date : str, default '2011-07-15'
        First instant of the test period; anything ``pandas.Timestamp``
        accepts.  The default reproduces the previously hard-coded cutoff.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``.  ``df_train`` keeps the original index
        labels; ``df_test`` is re-indexed from 0.  Both are independent
        copies, so later in-place edits do not touch ``df_activity``.
    """
    start_times = pd.to_datetime(df_activity['Start time'], format='%Y-%m-%d %H:%M:%S')
    is_train = start_times < pd.Timestamp(split_date)

    df_train = df_activity[is_train].copy()
    df_test = df_activity[~is_train].reset_index(drop=True)

    return df_train, df_test

# Split train/test into sleep and non-sleep subsets and save the unscaled CSVs
def sleep_split(df_train, df_test, output_dir='Unscaled'):
    """Separate sleep rows from all other activities and write unscaled CSVs.

    Six files are written to ``output_dir`` (created if it does not exist;
    previously a missing directory made the first ``to_csv`` call fail):

    * ``df_train_combined.csv`` / ``df_test_combined.csv`` – the inputs as given;
    * ``df_train_other.csv`` / ``df_test_other.csv`` – non-sleep rows without
      the two interruption columns;
    * ``df_train_sleep.csv`` / ``df_test_sleep.csv`` – sleep rows without the
      ``'Activity'`` column.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must contain ``'Activity'``, ``'Number of interruptions'`` and
        ``'Duration of interruptions'``.  Neither frame is modified.
    output_dir : str, default 'Unscaled'
        Directory that receives the CSV files.

    Returns
    -------
    list of pandas.DataFrame
        ``[df_train_sleep, df_test_sleep, df_train_other, df_test_other]``.
        The sleep frames have ``'Activity'`` removed.  The returned "other"
        frames still contain the interruption columns; only their saved
        copies have them removed.
    """
    import os  # local import: the notebook's import cells do not provide os

    os.makedirs(output_dir, exist_ok=True)

    def save(frame, file_name):
        frame.to_csv(os.path.join(output_dir, file_name), index=False)

    interruption_columns = ['Number of interruptions', 'Duration of interruptions']

    save(df_train, 'df_train_combined.csv')
    save(df_test, 'df_test_combined.csv')

    train_is_sleep = df_train['Activity'] == 'Sleep'
    test_is_sleep = df_test['Activity'] == 'Sleep'

    # Sleep rows: the activity label is constant, so it is dropped
    df_train_sleep = df_train[train_is_sleep].drop(columns=['Activity'])
    df_test_sleep = df_test[test_is_sleep].drop(columns=['Activity'])

    # Everything else
    df_train_other = df_train[~train_is_sleep]
    df_test_other = df_test[~test_is_sleep]

    # Saved copies of the non-sleep rows omit the interruption columns
    save(df_train_other.drop(columns=interruption_columns), 'df_train_other.csv')
    save(df_test_other.drop(columns=interruption_columns), 'df_test_other.csv')

    save(df_train_sleep, 'df_train_sleep.csv')
    save(df_test_sleep, 'df_test_sleep.csv')

    return [df_train_sleep, df_test_sleep, df_train_other, df_test_other]


def enumerate_activities(df_train, df_test):
    """Replace activity names with integer codes and drop the interruption columns.

    Codes are assigned in order of first appearance in ``df_train``
    (0, 1, 2, ...).  The same codes are applied to ``df_test``; an activity
    that does not occur in ``df_train`` maps to ``None``.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must contain ``'Activity'``, ``'Number of interruptions'`` and
        ``'Duration of interruptions'``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The same two frame objects, ``(df_train, df_test)``.
    """
    # Position in this list is the numeric code of the activity
    known_activities = list(df_train['Activity'].unique())

    def to_code(name):
        # First matching label wins; unknown names fall through to None
        return next(
            (code for code, label in enumerate(known_activities) if name == label),
            None,
        )

    frames = (df_train, df_test)

    # Replace activity with number in training and test data
    for frame in frames:
        frame['Activity'] = frame['Activity'].apply(to_code)

    # Drop interruptions
    for frame in frames:
        frame.drop(
            columns=['Number of interruptions', 'Duration of interruptions'],
            inplace=True,
        )

    return df_train, df_test



def standardise(df_train, df_test, type):
    """Encode, clean and z-score a train/test pair, then write both to CSV.

    Steps, applied to both frames (which are modified in place):

    1. ``'Start time'`` (``'<date> HH:MM:SS'``) is converted to fractional hours.
    2. ``'Day of activity'`` names are mapped to 0 (Monday) .. 6 (Sunday);
       any other value becomes ``None``.
    3. Unless ``type`` is ``'_sleep'``, ``enumerate_activities`` encodes
       ``'Activity'`` and removes the interruption columns.
    4. ``clean_df`` is applied.
    5. A ``StandardScaler`` is fitted on the training frame only and applied
       to both frames.
    6. The scaled arrays are saved as ``df_train{type}.csv`` and
       ``df_test{type}.csv`` in the working directory, with the training
       frame's column names as the header line.

    NOTE(review): ``clean_df`` assigns ``'Start time'`` again from
    ``'Start time short'``, so the value computed in step 1 does not appear
    to reach the output - confirm whether step 1 is still needed.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames with ``'Start time'``, ``'Day of activity'`` and the columns
        required by ``enumerate_activities`` / ``clean_df``.
    type : str
        Suffix used in the output file names; ``'_sleep'`` skips step 3.

    Returns
    -------
    None
    """
    day_numbers = {
        'Monday': 0,
        'Tuesday': 1,
        'Wednesday': 2,
        'Thursday': 3,
        'Friday': 4,
        'Saturday': 5,
        'Sunday': 6,
    }

    # Convert time to float
    def clock_to_hours(stamp):
        pieces = stamp.split(' ')[1].split(':')
        return int(pieces[0]) + int(pieces[1]) / 60 + int(pieces[2]) / 3600

    # Transform day of week to number
    def day_to_number(day):
        return day_numbers.get(day)

    column_encoders = (
        ('Start time', clock_to_hours),
        ('Day of activity', day_to_number),
    )
    for column, encoder in column_encoders:
        for frame in (df_train, df_test):
            frame[column] = frame[column].apply(encoder)

    if not type == '_sleep':
        df_train, df_test = enumerate_activities(df_train, df_test)

    df_train = clean_df(df_train)
    df_test = clean_df(df_test)

    # Column names are captured before scaling turns the frames into arrays
    header = ','.join(df_train.columns)

    # Fit on the training data only; the test data reuses those statistics
    scaler = StandardScaler()
    scaler.fit(df_train)
    scaled_train = scaler.transform(df_train)
    scaled_test = scaler.transform(df_test)

    np.savetxt(f'df_train{type}.csv', scaled_train, delimiter=',', header=header, comments='')
    np.savetxt(f'df_test{type}.csv', scaled_test, delimiter=',', header=header, comments='')


In [39]:
# Driver cell: load the activity table, split it chronologically, and write the
# scaled combined / sleep / other datasets.
# StandardScaler is also imported in the cell that defines standardise();
# the import is repeated here.
from sklearn.preprocessing import StandardScaler

# Load the activity table (one row per activity)
df_activity = pd.read_csv('Activity.csv')
# Untouched copy of the loaded table (not referenced again in this cell)
df_activity_clean = df_activity.copy()

# Separate training and test data
df_train, df_test = train_test_split(df_activity)

# standardise() edits its arguments in place, so copies are passed here to keep
# df_train / df_test intact for sleep_split() below.
# Writes df_train_combined.csv and df_test_combined.csv.
standardise(df_train.copy(), df_test.copy(), '_combined')

# Split into sleep and non-sleep rows (also writes the unscaled CSVs under Unscaled/)
df_train_sleep, df_test_sleep, df_train_other, df_test_other = sleep_split(df_train, df_test)

# Writes df_train_sleep.csv / df_test_sleep.csv and df_train_other.csv / df_test_other.csv
standardise(df_train_sleep, df_test_sleep, '_sleep')
standardise(df_train_other, df_test_other, '_other')

