# Classification 10/20/22

Goal: Start classifying tidydata_1020222 using the sktime module, specifically classifying based on differences in temp / RH.
Update 11/10/22: introduce functions and start classification with sklearn, because it has clearer error messages and better documentation.



In [133]:
import json
import datetime
import pandas as pd
import numpy as np
import sys
import os
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import plotly.graph_objects as go

In [2]:
df = pd.read_csv("../constants/td_ambient_102022.csv", )
df

Unnamed: 0,DateTime,Temp C,RH %,Room,Ambient Temp,Ambient RH,Window Open
0,2022-07-20 07:20:00,23.189,51.268,0,14.333333,30.555556,1
1,2022-07-20 07:20:30,23.189,51.362,0,14.333333,30.555556,1
2,2022-07-20 07:21:00,23.189,51.488,0,14.333333,30.555556,1
3,2022-07-20 07:21:30,23.189,51.457,0,14.333333,30.555556,1
4,2022-07-20 07:22:00,23.165,51.517,0,14.333333,30.555556,1
...,...,...,...,...,...,...,...
123601,2022-09-11 07:58:00,29.196,50.861,1,19.611111,30.000000,0
123602,2022-09-11 07:58:30,29.196,50.861,1,19.611111,30.000000,0
123603,2022-09-11 07:59:00,29.196,50.796,1,19.611111,30.000000,0
123604,2022-09-11 07:59:30,29.196,50.796,1,19.611111,30.000000,0


In [3]:
# add temperature and RH deltas (sensor reading minus ambient reading) for each observation
df["T_Delta"] =  df["Temp C"] - df["Ambient Temp"]
df["RH_Delta"] =  df["RH %"]- df["Ambient RH"]


In [4]:
df.head()

Unnamed: 0,DateTime,Temp C,RH %,Room,Ambient Temp,Ambient RH,Window Open,T_Delta,RH_Delta
0,2022-07-20 07:20:00,23.189,51.268,0,14.333333,30.555556,1,8.855667,20.712444
1,2022-07-20 07:20:30,23.189,51.362,0,14.333333,30.555556,1,8.855667,20.806444
2,2022-07-20 07:21:00,23.189,51.488,0,14.333333,30.555556,1,8.855667,20.932444
3,2022-07-20 07:21:30,23.189,51.457,0,14.333333,30.555556,1,8.855667,20.901444
4,2022-07-20 07:22:00,23.165,51.517,0,14.333333,30.555556,1,8.831667,20.961444


# Grouping and Statistics 

In [5]:
# summary statistics per group

# group by window open vs window closed -> in which state are the deltas greater?
# DateTime is dropped first so only numeric columns are aggregated; mean and std of the two delta columns are shown
df.drop("DateTime", axis=1).groupby(["Window Open"]).agg(["mean", "std"]).loc[:,["T_Delta", "RH_Delta"]]

Unnamed: 0_level_0,T_Delta,T_Delta,RH_Delta,RH_Delta
Unnamed: 0_level_1,mean,std,mean,std
Window Open,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5.752171,4.206669,30.640394,8.547424
1,6.257008,3.594757,28.279236,7.846737


In [6]:
# group by room -> in which room are the deltas greater? (mean and std of the two delta columns)
df.drop("DateTime", axis=1).groupby(["Room"]).agg(["mean", "std"]).loc[:,["T_Delta", "RH_Delta"]]

Unnamed: 0_level_0,T_Delta,T_Delta,RH_Delta,RH_Delta
Unnamed: 0_level_1,mean,std,mean,std
Room,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5.729435,3.780207,29.178837,8.143704
1,6.301897,4.010265,29.637181,8.395411


In [7]:
# group by every (window state, room) combination -> where are the deltas greater? (mean and std of the two delta columns)
df.drop("DateTime", axis=1).groupby(["Window Open", "Room"]).agg(["mean", "std"]).loc[:,["T_Delta", "RH_Delta"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,T_Delta,T_Delta,RH_Delta,RH_Delta
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
Window Open,Room,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,0,1.603624,4.786734,37.193811,7.443892
0,1,6.059046,3.992906,30.155626,8.423356
1,0,6.020292,3.521054,28.613806,7.889349
1,1,8.272099,3.584829,25.431145,6.841461


# Group data at various time intervals 

## hourly 

In [8]:
times = pd.to_datetime(df.DateTime)

In [9]:
# aggregate data by calendar date, hour of day and room (mean of every numeric column)
# numeric_only=True: df still carries the raw string "DateTime" column, which cannot be
# averaged; pandas >= 2.0 raises a TypeError instead of silently dropping such columns
df_hour = df.groupby([times.dt.date, times.dt.hour, "Room"]).mean(numeric_only=True)
df_hour.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Temp C,RH %,Ambient Temp,Ambient RH,Window Open,T_Delta,RH_Delta
DateTime,DateTime,Room,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-07-20,7,0,23.3264,51.622238,14.333333,30.555556,1.0,8.993067,21.066682
2022-07-20,7,1,23.4221,52.867775,14.333333,30.555556,0.2375,9.088767,22.312219
2022-07-20,8,0,23.736167,50.783575,14.666667,30.555556,1.0,9.0695,20.228019
2022-07-20,8,1,23.740725,52.640442,14.666667,30.555556,1.0,9.074058,22.084886
2022-07-20,9,0,24.307792,50.038683,15.388889,28.888889,1.0,8.918903,21.149794


## half-hourly

In [None]:
def make_df_with_freq(df, freq):
    """Average the readings of each room over fixed-width time bins.

    Args:
        df: DataFrame with a "DateTime" column that ``pd.to_datetime`` can
            parse, a "Room" column and a "Window Open" column. The input
            frame is copied, not modified.
        freq: resampling frequency, passed straight to ``resample``.

    Returns:
        DataFrame of per-bin means, grouped by "Room" and then by the
        resampled "DateTime". Rows containing any NaN are removed and
        "Window Open" is rounded to a whole number.
    """
    # work on a copy, with the parsed DateTime column as the index so it can be resampled
    indexed = df.copy()
    indexed["DateTime"] = pd.to_datetime(indexed["DateTime"])
    indexed = indexed.set_index("DateTime")

    # mean of every column per room and time bin; bins holding a NaN are discarded
    binned = indexed.groupby("Room").resample(freq).mean().dropna()

    # the averaged window state is fractional; round it back to a whole-number label
    binned["Window Open"] = binned["Window Open"].round()
    return binned


In [147]:
dftime = df.copy()

In [148]:
dftime["DateTime"] = pd.to_datetime(dftime["DateTime"])

In [110]:
dftime

Unnamed: 0,DateTime,Temp C,RH %,Room,Ambient Temp,Ambient RH,Window Open,T_Delta,RH_Delta
0,2022-07-20 07:20:00,23.189,51.268,0,14.333333,30.555556,1,8.855667,20.712444
1,2022-07-20 07:20:30,23.189,51.362,0,14.333333,30.555556,1,8.855667,20.806444
2,2022-07-20 07:21:00,23.189,51.488,0,14.333333,30.555556,1,8.855667,20.932444
3,2022-07-20 07:21:30,23.189,51.457,0,14.333333,30.555556,1,8.855667,20.901444
4,2022-07-20 07:22:00,23.165,51.517,0,14.333333,30.555556,1,8.831667,20.961444
...,...,...,...,...,...,...,...,...,...
123601,2022-09-11 07:58:00,29.196,50.861,1,19.611111,30.000000,0,9.584889,20.861000
123602,2022-09-11 07:58:30,29.196,50.861,1,19.611111,30.000000,0,9.584889,20.861000
123603,2022-09-11 07:59:00,29.196,50.796,1,19.611111,30.000000,0,9.584889,20.796000
123604,2022-09-11 07:59:30,29.196,50.796,1,19.611111,30.000000,0,9.584889,20.796000


In [149]:
dftime.set_index("DateTime", inplace=True)

In [150]:
# average each room's readings over 30-minute bins, then drop bins that contain NaN
# "30min" replaces the "30T" alias, which is deprecated in recent pandas releases
dftime_30 = dftime.groupby("Room").resample("30min").mean()
dftime_30.dropna(inplace=True)

In [None]:
# observe frequency of window openings that are in between 0 and 1
dftime_30["Window Open"]

In [151]:
px.histogram(dftime_30, x="Window Open")

In [152]:
# map the averaged window state back to 0 or 1... => this could be a source of error,
# since a bin that was only partly open is forced into one of the two classes
# NOTE(review): .round() follows NumPy rounding, so a mean of exactly 0.5 becomes 0 here,
# whereas make_univ_classif_data labels 0.5 as 1 -- confirm which convention is intended
dftime_30["Window Open"] = dftime_30["Window Open"].round()

In [153]:
px.histogram(dftime_30, x="Window Open")

In [154]:
dftime_30[dftime_30["Window Open"].isna()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Temp C,RH %,Room,Ambient Temp,Ambient RH,Window Open,T_Delta,RH_Delta
Room,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [155]:
px.scatter(dftime_30, x=dftime_30.index.get_level_values(1), y="Temp C")

## create data for classification 

In [67]:
def make_univ_classif_data(df, x_var, y_var, test_size=0.2, random_state=42, threshold=0.5):
    """Build a train/test split for a one-feature binary classification.

    Args:
        df: DataFrame holding both the feature and the target column.
        x_var: name of the single feature column.
        y_var: name of the target column; its values are binarised.
        test_size: share of the observations held out for testing
            (forwarded to ``train_test_split``).
        random_state: seed for the shuffle, so the split is reproducible.
        threshold: target values strictly below this become class 0,
            everything else becomes class 1.

    Returns:
        dict with the keys "x_train", "x_test", "y_train" and "y_test".
        The x arrays have shape (n, 1), the y arrays have shape (n,).
    """
    # estimators expect a 2-D feature matrix, so keep the feature as a single column
    xnp = df.loc[:, [x_var]].to_numpy().reshape(-1, 1)
    ynp = df.loc[:, y_var].to_numpy()

    # binarise the target in one vectorised step; NaN compares False against the
    # threshold and therefore lands in class 1, exactly as an element-wise `<` test would
    ynp = np.where(ynp < threshold, 0, 1)

    # sanity-check the shapes needed for processing
    assert xnp.shape == (len(xnp), 1)
    assert ynp.shape == (len(ynp),)

    # split into test and training groups
    x_train, x_test, y_train, y_test = train_test_split(
        xnp, ynp, test_size=test_size, random_state=random_state
    )

    return {
        "x_train": x_train,
        "x_test": x_test,
        "y_train": y_train,
        "y_test": y_test,
    }
    

### sklearn logistic regression 

In [69]:
def run_log_reg(data):
    """Fit a logistic regression on the training split and score it on the test split.

    Args:
        data: dict providing "x_train", "y_train", "x_test" and "y_test".

    Returns:
        The value of the fitted model's ``score`` on the test split.
    """
    # TODO -> assertions that data is in correct format
    classifier = LogisticRegression()
    classifier.fit(data["x_train"], data["y_train"])
    return classifier.score(data["x_test"], data["y_test"])

In [78]:
# candidate predictors; each one is tried on its own as the single input feature
features = ['Temp C', 'RH %', 'Ambient Temp', 'Ambient RH', 'T_Delta', 'RH_Delta']

# feature name -> test-split score of its univariate logistic regression
feature_scores = {}

for feature in features:
    # split the hourly data for this feature against the window state, then fit and score
    d = make_univ_classif_data(df_hour, feature, "Window Open")
    score = run_log_reg(d)
    feature_scores[feature] = np.round(score,3)

# display the scores as a table with one row per feature
pd.DataFrame(feature_scores, index=["scores"]).T

{'Temp C': 0.668,
 'RH %': 0.611,
 'Ambient Temp': 0.572,
 'Ambient RH': 0.567,
 'T_Delta': 0.553,
 'RH_Delta': 0.562}