In [26]:
import os
from datetime import datetime
from statistics import mean, variance

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [27]:
def aggregate_attribute(attr: str, values: list) -> float:
    """Reduce the readings of one attribute inside a window to a single number.

    NaN readings are discarded first (NaN is the only value not equal to itself).
    For 'call' and 'sms' the result is the count of remaining readings; any other
    attribute is averaged, and None is returned when nothing is left to average.
    """
    present = list(filter(lambda reading: reading == reading, values))
    if attr in ('call', 'sms'):
        return len(present)
    return mean(present) if present else None

In [29]:
def get_avg_time(id: str, attr: str, offset: int, df: object) -> list:
    """Mean time of day, per sliding window, at which user `id` logged `attr` events.

    Windows are runs of `offset` consecutive entries of the notebook-level `dates`
    list: window `i` covers `dates[i] .. dates[i + offset - 1]`, and there are
    `len(dates) - offset` windows in total.

    Args:
        id: user identifier matched against the `id` column.
        attr: variable name matched against the `variable` column (e.g. 'call', 'sms').
        offset: number of dates per window.
        df: long-format frame with `id`, `variable`, `date` and `time` columns; `time`
            is decoded as integer nanoseconds and only its time-of-day part is used.

    Returns:
        A list with one entry per window: the mean event time as minutes since
        midnight (`hour * 60 + minute`, seconds are dropped), or None for a window
        without events. Returns None instead of a list when the user has no `attr`
        rows at all (this includes users that do not occur in `df`).
    """
    # A single boolean mask replaces two groupby passes and also covers unknown users.
    subframe = df.loc[(df['id'] == id) & (df['variable'] == attr)]
    if subframe.empty:
        return None
    # Look up each date's timestamps once; overlapping windows then reuse the lists
    # instead of rescanning the frame for every (window, day) pair.
    times_by_day = [subframe.loc[subframe['date'] == day, 'time'].to_list() for day in dates]
    avg_time = []
    for i in range(len(dates) - offset):
        ts_in_window = []
        for j in range(i, i + offset):
            ts_in_window.extend(times_by_day[j])
        if ts_in_window:
            # NOTE: averaging raw timestamps yields the mean time of day only while all
            # `time` values share the same date component.
            t = pd.to_datetime(mean(ts_in_window), unit='ns').time()
            # stored as minutes since midnight
            avg_time.append(t.hour * 60 + t.minute)
        else:
            avg_time.append(None)
    return avg_time

In [30]:
def create_dataframe(file_path: str) -> object:
    """Read the raw CSV and split its `time` timestamp into `date` and `time` columns.

    Args:
        file_path: path of a CSV file with a `time` column of timestamp strings
            (date and time of day, e.g. '2014-02-26 13:00:00.000').

    Returns:
        The loaded frame with two rewritten/added columns:
        `date` - the calendar day as a datetime at midnight;
        `time` - the time of day as int64 nanoseconds since midnight.
    """
    data_frame = pd.read_csv(file_path, parse_dates=False)
    # Parse the full timestamp once. Parsing the bare time-of-day string on its own
    # would attach a calendar date picked by the parser, so the stored integers would
    # not depend on the CSV content alone.
    stamps = pd.to_datetime(data_frame['time'])
    midnight = stamps.dt.normalize()
    data_frame['date'] = midnight
    # Integer nanoseconds since midnight, ready for arithmetic such as averaging.
    since_midnight = (stamps - midnight).values.astype('timedelta64[ns]')
    data_frame['time'] = since_midnight.astype(np.int64)
    return data_frame
    
def get_unique_column_values(column: str, dataframe: object) -> object:
    """Return the distinct values of `dataframe[column]`, as produced by `Series.unique()`."""
    selected = dataframe[column]
    return selected.unique()

def get_aggregated_attr_in_window(usr: str, attrib: str, offset: int, dataframe: object) -> list:
    aggregated = []
    # get subtable for user+attribute
    if attrib in list(dataframe.groupby(['id']).get_group(usr).variable):
        subframe = dataframe.groupby(['id', 'variable']).get_group((usr, attrib))
    else:
        return
    # using dates from the whole dataframe
    # dates = list(set(subframe.date.tolist()))
    for i in range(len(dates) - offset):
        window_values = []
        for j in range(i, i + offset):
            # selecting attribute values for corresponding dates in window
            window_values.extend(subframe.loc[subframe['date'] == dates[j], 'value'].tolist())
        # aggregate attribute values
        aggregated.append(aggregate_attribute(attrib, window_values))
    return aggregated

def get_targets(usr: str, offset: int, dataframe: object) -> (list, list):
    """Collect the prediction target and its weekday for every window of one user.

    For window `i` the target day is `dates[i + offset]`, i.e. the entry of the
    notebook-level `dates` list right after the window. The target is the aggregated
    "mood" of that day (via `aggregate_attribute`), the weekday comes from the
    date's `weekday()`.

    Returns:
        A pair `(targets, weekdays)`, each with one entry per window.
    """
    # all "mood" rows of this user
    mood_rows = dataframe.groupby(['id', 'variable']).get_group((usr, "mood"))
    targets, weekdays = [], []
    for target_index in range(offset, len(dates)):
        target_day = dates[target_index]
        # mood readings recorded on the day following the window
        day_moods = mood_rows.loc[mood_rows['date'] == target_day, 'value'].tolist()
        targets.append(aggregate_attribute("mood", day_moods))
        weekdays.append(target_day.weekday())
    return targets, weekdays


In [31]:
# Load the raw measurements and list the variables and users they contain.
dataframe = create_dataframe("dataset_mood_smartphone.csv")
attributes = get_unique_column_values("variable", dataframe)
users = get_unique_column_values("id", dataframe)
# Number of consecutive dates aggregated into one feature row.
window_size = 3
# Sorted distinct dates of the whole dataset; the window helpers iterate over them.
dates = sorted(set(dataframe.date.tolist()))
# Per-(user, attribute) feature series, plus per-user targets and target weekdays.
aggregated, targets, weekdays = {}, {}, {}

# Extend the attribute list with the derived columns:
# the target, the mean sms/call times and a one-hot encoded weekday.
attributes = np.append(
    attributes,
    ['target', 'smstime', 'calltime'] + ['weekday_' + str(day) for day in range(7)],
)

In [32]:
# Build every feature series per user. This is the slow part of the notebook.
# Raw event attributes that additionally get a "mean time of day" column.
time_columns = {'call': 'calltime', 'sms': 'smstime'}
for user in users:
    for attr in attributes:
        aggr = get_aggregated_attr_in_window(user, attr, window_size, dataframe)
        # None means the user has no rows for this attribute (always the case for the
        # derived columns), so nothing is stored.
        if aggr is not None:
            aggregated[(user, attr)] = aggr
        # add mean time of calls and sms to attributes
        if attr in time_columns:
            avg_time = get_avg_time(user, attr, window_size, dataframe)
            # Same guard as above: storing None here would break the later length
            # check and table fill for users without any call/sms rows.
            if avg_time is not None:
                aggregated[(user, time_columns[attr])] = avg_time
    targets[user], weekdays[user] = get_targets(user, window_size, dataframe)

# Add the target and the one-hot encoded weekday of the target day.
for user in users:
    aggregated[(user, 'target')] = targets[user]
    for day in range(7):
        aggregated[(user, 'weekday_' + str(day))] = [int(wd == day) for wd in weekdays[user]]

In [33]:
# Sanity check: every aggregated series must cover the same number of windows,
# i.e. the variance of their lengths has to be zero.
lens = [len(series) for series in aggregated.values()]
print(variance(lens), 'expected 0')

0 expected 0


In [35]:
# Build the output table (one row per user and window, one column per attribute)
# and write it to CSV.
# All series in `aggregated` are expected to have the same length, so the first one
# gives the number of windows per user.
p = len(next(iter(aggregated.values())))
# Column 0 holds the user id; attribute cells start as None so that a series missing
# for a user simply stays empty in the CSV.
df_collection = [[user] + [None] * len(attributes) for user in users for _ in range(p)]

attr_indices = {attr: i for (i, attr) in enumerate(attributes)}
user_indices = {user: i for (i, user) in enumerate(users)}

for (user, attr), values in aggregated.items():
    # rows of one user are contiguous: user_index * p .. user_index * p + p - 1
    first_row = user_indices[user] * p
    column = attr_indices[attr] + 1
    for i, value in enumerate(values):
        df_collection[first_row + i][column] = value

df = pd.DataFrame(df_collection)
output_path = "processed_data/aggregated_" + str(window_size) + "timefix.csv"
# Create the target folder on a fresh checkout; to_csv does not create missing directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, header=['id'] + attributes.tolist(), index=False)