In [145]:
# Import standardisation libraries
from sklearn.preprocessing import StandardScaler#
import numpy as np
from sklearn.preprocessing import StandardScaler
import time
import pandas as pd


def train_test_split(df_freq):
    day31 = '2011-07-15'
    converted_day31 = pd.to_datetime(day31).date()

    # All activities split into training and test data
    df_train = df_freq[df_freq['Date'] < converted_day31]
    df_test = df_freq[df_freq['Date'] >= converted_day31]

    # Reset index
    df_test.reset_index(drop=True, inplace=True)

    # Save to csv
    df_train.to_csv('Unscaled/df_train_frequency.csv', index=False)
    df_test.to_csv('Unscaled/df_test_frequency.csv', index=False)

    return df_train, df_test



def standardise(df_train, df_test, type):
    # Drop date column
    df_train.drop('Date', axis=1, inplace=True)
    df_test.drop('Date', axis=1, inplace=True)

    # Transform day of week to number
    def dayOfWeek(day):
        if day == 'Monday':
            return 0
        elif day == 'Tuesday':
            return 1
        elif day == 'Wednesday':
            return 2
        elif day == 'Thursday':
            return 3
        elif day == 'Friday':
            return 4
        elif day == 'Saturday':
            return 5
        elif day == 'Sunday':
            return 6
        
    def weekend_or_weekday(day):
        if day == 'Weekend':
            return 1
        elif day == 'Weekday':
            return 0

    df_train['Day of activity'] = df_train['Day of activity'].apply(dayOfWeek)
    df_train['Weekend or weekday'] = df_train['Weekend or weekday'].apply(weekend_or_weekday)
    df_test['Day of activity'] = df_test['Day of activity'].apply(dayOfWeek)
    df_test['Weekend or weekday'] = df_test['Weekend or weekday'].apply(weekend_or_weekday)

    header = ','.join(df_train.columns)

    scaler = StandardScaler()
    scaler.fit(df_train)

    df_train = scaler.transform(df_train)
    df_test = scaler.transform(df_test)

    np.savetxt(f'df_train_{type}.csv', df_train, delimiter=',', header=header, comments='')
    np.savetxt(f'df_test_{type}.csv', df_test, delimiter=',', header=header, comments='')


def create_freq_df(df_activity):
    # Get all unique activities
    activites = df_activity['Activity'].unique()

    df_columns = [
        'Date',
        'Day of activity',
        'Weekend or weekday',
    ]

    for activity in activites:
        df_columns.append(f'Total {activity} duration')
        df_columns.append(f'Total {activity} frequency')

    # Create new dataframe
    df_freq = pd.DataFrame(columns=df_columns)

    # Convert str to datetime
    df_activity['Start time'] = pd.to_datetime(df_activity['Start time'])
    start_day = df_activity['Start time'].iloc[0].date()
    end_day = df_activity['Start time'].iloc[-1].date()

    # Iterate through each day
    for day in pd.date_range(start_day, end_day):
        date = day.date()
        day_of_week = day.day_name()
        weekend_or_weekday = 'Weekend' if day_of_week == 'Saturday' or day_of_week == 'Sunday' else 'Weekday'

        # Create new row
        new_row = {
            'Date': date,
            'Day of activity': day_of_week,
            'Weekend or weekday': weekend_or_weekday,
        }
        
        # Check if day is in dataframe
        if date not in df_activity['Start time'].dt.date.unique():
            # Add to new row
            for activity in activites:
                new_row[f'Total {activity} duration'] = 0
                new_row[f'Total {activity} frequency'] = 0

            # Concatenate new row to dataframe
            df_freq = pd.concat([df_freq, pd.DataFrame(new_row, index=[0])], ignore_index=True)
            continue

        # Iterate through each activity
        for activity in activites:
            # Filter dataframe by day and activity
            df_activity_day = df_activity[df_activity['Start time'].dt.date == date]
            df_activity_day = df_activity_day[df_activity_day['Activity'] == activity]

            # Calculate total duration and frequency
            total_duration = df_activity_day['Duration'].sum()
            total_frequency = len(df_activity_day.index)

            # Add to new row
            new_row[f'Total {activity} duration'] = total_duration
            new_row[f'Total {activity} frequency'] = total_frequency

        # Concatenate new row to dataframe
        df_freq = pd.concat([df_freq, pd.DataFrame(new_row, index=[0])], ignore_index=True)

    # Calculate average duration and frequency
    df_average = df_freq.copy()
    df_average.drop(['Date', 'Day of activity', 'Weekend or weekday'], axis=1, inplace=True)
    df_average = df_average.mean(axis=0)

    # If average frequency is < 0.25, drop activity
    for activity in activites:
        if df_average[f'Total {activity} frequency'] < 0.25:
            df_freq.drop([f'Total {activity} duration', f'Total {activity} frequency'], axis=1, inplace=True)


    # Save dataframe to csv
    df_freq.to_csv('Activity_frequency.csv', index=False)

    # Save average training day to csv
    dff_freq = df_freq[df_freq['Date'] <= pd.to_datetime('2011-07-15').date()]
    average_day = dff_freq.drop(['Date', 'Day of activity', 'Weekend or weekday'], axis=1).mean(axis=0)
    average_day = pd.DataFrame(average_day).T
    average_day.to_csv('Unscaled/average_day.csv', index=False)

    # Save average weekend day to csv
    dff_freq = df_freq[df_freq['Weekend or weekday'] == 'Weekend']
    average_day = dff_freq.drop(['Date', 'Day of activity', 'Weekend or weekday'], axis=1).mean(axis=0)
    average_day = pd.DataFrame(average_day).T
    average_day.to_csv('Unscaled/average_weekend_day.csv', index=False)

    # Save average weekday day to csv
    dff_freq = df_freq[df_freq['Weekend or weekday'] == 'Weekday']
    average_day = dff_freq.drop(['Date', 'Day of activity', 'Weekend or weekday'], axis=1).mean(axis=0)
    average_day = pd.DataFrame(average_day).T
    average_day.to_csv('Unscaled/average_weekday_day.csv', index=False)

    # Save average day of week to csv
    for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
        dff_freq = df_freq[df_freq['Day of activity'] == day]
        average_day = dff_freq.drop(['Date', 'Day of activity', 'Weekend or weekday'], axis=1).mean(axis=0)
        average_day = pd.DataFrame(average_day).T
        average_day.to_csv(f'Unscaled/average_{day}_day.csv', index=False)

    return df_freq

In [146]:
# Import data
df_activity = pd.read_csv('Activity.csv')
df_activity = df_activity[df_activity['Activity'] != 'Sleep']
df_activity = df_activity[df_activity['Activity'] != 'Other_Activity']

df_freq = create_freq_df(df_activity)

df_train, df_test = train_test_split(df_freq)

standardise(df_train, df_test, 'frequency')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice

In [140]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_day(example_day, title):
    average_day = pd.read_csv('Unscaled/average_day.csv')

    columns = example_day.columns[3:]
    activites = set([column.split(' ')[1] for column in columns])

    # Plot frequency on one axis and duration on another
    fig = make_subplots(cols=2, rows=1)

    # Iterate through each activity
    for activity in activites:
        # Get total frequency and duration
        total_frequency = example_day[f'Total {activity} frequency'].iloc[0]
        total_duration = example_day[f'Total {activity} duration'].iloc[0]

        # Get average frequency and duration
        average_frequency = average_day[f'Total {activity} frequency'].iloc[0]
        average_duration = average_day[f'Total {activity} duration'].iloc[0]

        diff_frequency = total_frequency - average_frequency
        diff_duration = total_duration - average_duration

        # Plot
        fig.add_trace(
            go.Bar(
                x=[activity],
                y=[diff_frequency],
                name=f'Total {activity} frequency',
                offsetgroup=0,
                marker_color='rgb(55, 83, 109)',
                showlegend=False,
            ),
            col=1, row=1,
        )

        fig.add_trace(
            go.Bar(
                x=[activity],
                y=[diff_duration],
                name=f'Total {activity} duration',
                offsetgroup=1,
                marker_color='rgb(26, 118, 255)',
                showlegend=False,
            ),
            col=2, row=1,
        )

    # Set layout
    fig.update_layout( barmode='group', xaxis_tickangle=-45, xaxis_title='Activity')
    # Xticks for col2
    fig.update_xaxes(tickangle=-45, col=2, row=1)

    fig.update_xaxes(title_text='Activity', col=1, row=1)
    fig.update_yaxes(title_text='Frequency', col=1, row=1)
    fig.update_yaxes(title_text='Duration', col=2, row=1)

    fig.show()

print(len(df_freq.columns))

# August 2nd
date = pd.to_datetime('2011-08-02').date()
df_freq_day = df_freq[df_freq['Date'] == date]
plot_day(df_freq_day, date)

# June 27th 2011
date = pd.to_datetime('2011-06-27').date()
df_freq_day = df_freq[df_freq['Date'] == date]
plot_day(df_freq_day, date)


51
