In [265]:
%matplotlib inline

from datetime import datetime
from datetime import timedelta

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

le = preprocessing.LabelEncoder()

def convert_to_seconds_since_midnight(t):
    return (t.hour * 3600) + (t.minute * 60) + t.second + (t.microsecond / 1000000.0)

def add_time_features(df, le):
    
    df["start_time"] = df.apply(lambda x: convert_to_seconds_since_midnight(x["start_datetime"]), axis=1)
    df["end_time"] = df.apply(lambda x: convert_to_seconds_since_midnight(x["end_datetime"]), axis=1)
    df["start_day_of_week"] = df["start_datetime"].dt.dayofweek
    df["end_day_of_week"] = df["end_datetime"].dt.dayofweek
    df["duration"] = df["end_datetime"].astype(np.int64) - df["start_datetime"].astype(np.int64)
    
    return df

def add_features(df, le):
    df = add_time_features(df, le)
    
    for label in ["place", "type", "location", "activity"]:
        if label in df.columns:
            df["{label}_encoded".format(label=label)] = le.fit_transform(df[label])

    return df

def load_data(filename, le, user=None):
    data = pd.read_table(filename, header=0)
    data = data.rename(columns=lambda x: x.strip().lower().replace(" ", "_"))
    
    print(data)
    
    data["start_datetime"] = pd.to_datetime(data["start_time"], errors='coerce')
    data["end_datetime"] = pd.to_datetime(data["end_time"], errors='coerce')
    data["user"] = user
    
    bad_rows = data[data.start_datetime > data.end_datetime]
    if not bad_rows.empty:
        data.loc[bad_rows.index,['start_datetime','end_datetime']] = data.loc[bad_rows.index,['end_datetime','start_datetime']].values
    
    data = add_features(data, le)

    return data

sensors_training = pd.concat([load_data('UserA_Sensors_training.csv', le, 'A'), load_data('UserB_Sensors_training.csv', le, 'B')])
adls_training = pd.concat([load_data('UserA_ADLs_training.csv', le, 'A'), load_data('UserB_ADLs_training.csv', le, 'B')])

sensors_test = load_data('Classify.csv', le)
adls_test = load_data('ADLS_classifying.csv', le)

def convert_to_scikit_format(df):
    X = df
    for label in ["user", "location", "type", "place", "activity", "start_datetime", "end_datetime"]:
        if label in X.columns:
            X = X.drop(label, axis=1)
    y = df["user"]
    
    return X, y

sensor_X_train, sensor_y_train = convert_to_scikit_format(sensors_training)
sensor_X_test, _ = convert_to_scikit_format(sensors_test)

adls_X_train, adls_y_train = convert_to_scikit_format(adls_training)
adls_X_test, _ = convert_to_scikit_format(adls_test)

               start_time             end_time   location      type     place
0     2011-11-28 02:27:59  2011-11-28 10:18:11        Bed  Pressure   Bedroom
1     2011-11-28 10:21:24  2011-11-28 10:21:31    Cabinet  Magnetic  Bathroom
2     2011-11-28 10:21:44  2011-11-28 10:23:31      Basin       PIR  Bathroom
3     2011-11-28 10:23:02  2011-11-28 10:23:36     Toilet     Flush  Bathroom
4     2011-11-28 10:25:44  2011-11-28 10:32:06     Shower       PIR  Bathroom
5     2011-11-28 10:34:23  2011-11-28 10:34:41     Fridge  Magnetic   Kitchen
6     2011-11-28 10:34:44  2011-11-28 10:37:17   Cupboard  Magnetic   Kitchen
7     2011-11-28 10:38:00  2011-11-28 10:42:41    Toaster  Electric   Kitchen
8     2011-11-28 10:38:33  2011-11-28 10:38:40     Fridge  Magnetic   Kitchen
9     2011-11-28 10:41:29  2011-11-28 10:41:36   Cupboard  Magnetic   Kitchen
10    2011-11-28 10:41:43  2011-11-28 10:41:59    Cooktop       PIR   Kitchen
11    2011-11-28 10:41:59  2011-11-28 10:42:55  Microwave  Elect

In [266]:
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

sensor_classifier = RandomForestClassifier()
sensor_classifier.fit(sensor_X_train, sensor_y_train)

sensors_test['predicted_user'] = sensor_classifier.predict(sensor_X_test)
pd.options.display.max_rows = 300

adls_classifier = RandomForestClassifier()
adls_classifier.fit(adls_X_train, adls_y_train)
adls_test['predicted_user'] = adls_classifier.predict(adls_X_test)

In [267]:
sensors_test_a = sensors_test[sensors_test['predicted_user'] == 'A']
adls_test_a = adls_test[adls_test['predicted_user'] == 'A']

In [268]:
sensors_test_a = sensors_test_a.sort_values("start_datetime", ascending=True)
adls_test_a = adls_test_a.sort_values("start_datetime", ascending=True)

In [269]:
sensors_test_a['time_gap'] = np.where(sensors_test_a['end_datetime'].shift(1) > sensors_test_a['start_datetime'], 0, (sensors_test_a['start_datetime'] - sensors_test_a['end_datetime'].shift(1)).dt.seconds)
adls_test_a['time_gap'] = np.where(adls_test_a['end_datetime'].shift(1) > adls_test_a['start_datetime'], 0, (adls_test_a['start_datetime'] - adls_test_a['end_datetime'].shift(1)).dt.seconds)

In [270]:
sensors_test_a.to_csv("imputed_sensors_test_a_copy.csv")
adls_test_a.to_csv("adls_test_a_copy.csv")

In [271]:
def load_augmented_data(filename, le, user=None):
    data = pd.read_csv(filename, header=0)
    data = data.rename(columns=lambda x: x.strip().lower().replace(" ", "_"))
    
    data["start_datetime"] = pd.to_datetime(data["start_time"], errors='coerce')
    data["end_datetime"] = pd.to_datetime(data["end_time"], errors='coerce')
    data["user"] = user
    
    bad_rows = data[data.start_datetime > data.end_datetime]
    if not bad_rows.empty:
        data.loc[bad_rows.index,['start_datetime','end_datetime']] = data.loc[bad_rows.index,['end_datetime','start_datetime']].values
    
    data = add_time_features(data, le)

    return data

imputed_sensors_test_a = load_augmented_data('imputed_sensors_test_a_copy.csv', le, 'A')
imputed_adls_test_a = load_augmented_data('adls_test_a_copy.csv', le, 'A')

In [272]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing

def get_knn_imputer(df, label):
    le = preprocessing.LabelEncoder()
    knn_training = df
    
    predicted_labels = ["place", "type", "location", "activity"]
    
    for l in predicted_labels:
        if l in df.columns:
            knn_training = knn_training.dropna(subset=[l])
    
    knn_X = drop_irrelevant_columns(knn_training)
    knn_y = le.fit_transform(knn_training[label])
    nbrs = KNeighborsRegressor().fit(knn_X, knn_y)
    return nbrs, le

def drop_irrelevant_columns(row):
    predicted_labels = ["place", "type", "location", "activity"]
    for l in predicted_labels:
        if l in row.columns:
            row = row.drop([l], axis=1)
            
    row = row.drop(["predicted_user", "start_datetime", "end_datetime", "user"], axis=1)

    return row

def impute_value(x, le):
    imputation = knn_imputer.predict(x[["start_time", "end_time", "start_day_of_week", "end_day_of_week", "duration"]])
    return le.inverse_transform(imputation.astype(int))[0]
    
for label in ["location", "type", "place"]:
    knn_imputer, le = get_knn_imputer(imputed_sensors_test_a, label)  
    imputed_sensors_test_a[label] = imputed_sensors_test_a.apply(lambda x: impute_value(x, le), axis=1)

for label in ["activity"]:
    knn_imputer, le = get_knn_imputer(imputed_adls_test_a, label)  
    imputed_adls_test_a[label] = imputed_adls_test_a.apply(lambda x: impute_value(x, le), axis=1)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [263]:
imputed_sensors_test_a.to_csv("imputed_sensors_test_a_filled.csv")
imputed_adls_test_a.to_csv("adls_test_a_filled.csv")

In [239]:
imputed_sensors_test_a.iloc[[8]]

Unnamed: 0,location,type,place,start_time,end_time,predicted_user,start_datetime,end_datetime,user,start_day_of_week,end_day_of_week,duration
8,Seat,Pressure,Living,53220.0,55440.0,,2011-12-02 14:47:00,2011-12-02 15:24:00,A,4,4,2220000000000
