# Classification 10/20/22

* Goal: tried classifying td_ambient_102022 using the sktime module, specifically classifying based on differences in temp / RH
* Update 11/10/22: introduce functions, start classification with sklearn because it has clearer error messages and better documentation
* Update 11/15/22: plotting performance of classifiers



In [1]:
import json
import datetime
import pandas as pd
import numpy as np
import sys
import os

import plotly.express as px
import plotly.graph_objects as go


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from scipy.stats import sem




In [2]:
# Load the combined sensor log. Per the preview below, each row carries a
# DateTime, the measured Temp C / RH %, a Room id, the Ambient Temp / Ambient RH
# and a 0/1 Window Open flag.
df = pd.read_csv("../constants/td_ambient_102022.csv", )
df

Unnamed: 0,DateTime,Temp C,RH %,Room,Ambient Temp,Ambient RH,Window Open
0,2022-07-20 07:20:00,23.189,51.268,0,14.333333,30.555556,1
1,2022-07-20 07:20:30,23.189,51.362,0,14.333333,30.555556,1
2,2022-07-20 07:21:00,23.189,51.488,0,14.333333,30.555556,1
3,2022-07-20 07:21:30,23.189,51.457,0,14.333333,30.555556,1
4,2022-07-20 07:22:00,23.165,51.517,0,14.333333,30.555556,1
...,...,...,...,...,...,...,...
123601,2022-09-11 07:58:00,29.196,50.861,1,19.611111,30.000000,0
123602,2022-09-11 07:58:30,29.196,50.861,1,19.611111,30.000000,0
123603,2022-09-11 07:59:00,29.196,50.796,1,19.611111,30.000000,0
123604,2022-09-11 07:59:30,29.196,50.796,1,19.611111,30.000000,0


In [3]:
# add temperature and rh deltas for each observation:
# measured value minus the ambient value recorded on the same row
df["T_Delta"] =  df["Temp C"] - df["Ambient Temp"]
df["RH_Delta"] =  df["RH %"]- df["Ambient RH"]


In [4]:
df.head()

Unnamed: 0,DateTime,Temp C,RH %,Room,Ambient Temp,Ambient RH,Window Open,T_Delta,RH_Delta
0,2022-07-20 07:20:00,23.189,51.268,0,14.333333,30.555556,1,8.855667,20.712444
1,2022-07-20 07:20:30,23.189,51.362,0,14.333333,30.555556,1,8.855667,20.806444
2,2022-07-20 07:21:00,23.189,51.488,0,14.333333,30.555556,1,8.855667,20.932444
3,2022-07-20 07:21:30,23.189,51.457,0,14.333333,30.555556,1,8.855667,20.901444
4,2022-07-20 07:22:00,23.165,51.517,0,14.333333,30.555556,1,8.831667,20.961444


# Grouping and Statistics 

In [5]:
# Compare window-open vs window-closed rows: mean and standard deviation of
# the temperature and humidity deltas -> where are the deltas greater?
df.groupby(["Window Open"])[["T_Delta", "RH_Delta"]].agg(["mean", "std"])

Unnamed: 0_level_0,T_Delta,T_Delta,RH_Delta,RH_Delta
Unnamed: 0_level_1,mean,std,mean,std
Window Open,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5.752171,4.206669,30.640394,8.547424
1,6.257008,3.594757,28.279236,7.846737


In [6]:
# Compare the rooms: mean and standard deviation of the temperature and
# humidity deltas -> where are the deltas greater?
df.groupby(["Room"])[["T_Delta", "RH_Delta"]].agg(["mean", "std"])

Unnamed: 0_level_0,T_Delta,T_Delta,RH_Delta,RH_Delta
Unnamed: 0_level_1,mean,std,mean,std
Room,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5.729435,3.780207,29.178837,8.143704
1,6.301897,4.010265,29.637181,8.395411


In [7]:
# Compare every (window state, room) combination: mean and standard deviation
# of the temperature and humidity deltas -> where are the deltas greater?
df.groupby(["Window Open", "Room"])[["T_Delta", "RH_Delta"]].agg(["mean", "std"])

Unnamed: 0_level_0,Unnamed: 1_level_0,T_Delta,T_Delta,RH_Delta,RH_Delta
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
Window Open,Room,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,0,1.603624,4.786734,37.193811,7.443892
0,1,6.059046,3.992906,30.155626,8.423356
1,0,6.020292,3.521054,28.613806,7.889349
1,1,8.272099,3.584829,25.431145,6.841461


# Group data at various time intervals 

## hourly 

In [8]:
# parse the DateTime column once so the .dt accessors can be used for grouping
times = pd.to_datetime(df.DateTime)

In [9]:
# aggregate data based on date, hour and room
# numeric_only=True: the raw DateTime column holds strings and cannot be
# averaged; pandas >= 2.0 raises a TypeError instead of silently dropping it.
df_hour = df.groupby([times.dt.date, times.dt.hour, "Room"]).mean(numeric_only=True)
df_hour.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Temp C,RH %,Ambient Temp,Ambient RH,Window Open,T_Delta,RH_Delta
DateTime,DateTime,Room,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-07-20,7,0,23.3264,51.622238,14.333333,30.555556,1.0,8.993067,21.066682
2022-07-20,7,1,23.4221,52.867775,14.333333,30.555556,0.2375,9.088767,22.312219
2022-07-20,8,0,23.736167,50.783575,14.666667,30.555556,1.0,9.0695,20.228019
2022-07-20,8,1,23.740725,52.640442,14.666667,30.555556,1.0,9.074058,22.084886
2022-07-20,9,0,24.307792,50.038683,15.388889,28.888889,1.0,8.918903,21.149794


## half-hourly

In [10]:
def make_df_with_freq(df, freq):
    """Average the readings per room over fixed time bins of width ``freq``.

    A copy of ``df`` is indexed by its parsed ``DateTime`` column, grouped by
    ``Room`` and resampled at ``freq``; the columns are averaged within each
    bin. Rows containing NaN are dropped and the averaged ``Window Open``
    value is rounded back to a whole number. The input frame is not modified.
    """
    # work on a copy indexed by real timestamps so resample() can bin the rows
    indexed = df.copy()
    indexed["DateTime"] = pd.to_datetime(indexed["DateTime"])
    indexed = indexed.set_index("DateTime")

    # per-room mean of every bin, discarding rows with missing values
    binned = indexed.groupby("Room").resample(freq).mean().dropna()

    # the mean of a 0/1 flag is a fraction -> round it to a whole number
    binned["Window Open"] = binned["Window Open"].round()
    return binned


In [11]:
# per-room averages at 30- and 15-minute intervals; display the 30-minute frame
df_30min = make_df_with_freq(df, "30T")
df_15min = make_df_with_freq(df, "15T")
df_30min

Unnamed: 0_level_0,Unnamed: 1_level_0,Temp C,RH %,Room,Ambient Temp,Ambient RH,Window Open,T_Delta,RH_Delta
Room,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2022-07-20 07:00:00,23.180600,51.421100,0.0,14.333333,30.555556,1.0,8.847267,20.865544
0,2022-07-20 07:30:00,23.375000,51.689283,0.0,14.333333,30.555556,1.0,9.041667,21.133728
0,2022-07-20 08:00:00,23.581400,51.084233,0.0,14.666667,30.555556,1.0,8.914733,20.528678
0,2022-07-20 08:30:00,23.890933,50.482917,0.0,14.666667,30.555556,1.0,9.224267,19.927361
0,2022-07-20 09:00:00,24.194967,50.202333,0.0,15.388889,28.888889,1.0,8.806078,21.313444
...,...,...,...,...,...,...,...,...,...
1,2022-09-11 06:00:00,29.271000,50.757817,1.0,19.500000,30.000000,0.0,9.771000,20.757817
1,2022-09-11 06:30:00,29.255583,50.776800,1.0,19.500000,30.000000,0.0,9.755583,20.776800
1,2022-09-11 07:00:00,29.241833,50.811517,1.0,19.611111,30.000000,0.0,9.630722,20.811517
1,2022-09-11 07:30:00,29.219333,50.808100,1.0,19.611111,30.000000,0.0,9.608222,20.808100


In [12]:
# resample the data once per interval and keep each result keyed by its
# pandas frequency string, from 60-minute bins down to 30-second bins
# NOTE(review): newer pandas releases deprecate the "T" minute alias in favour
# of "min" -- confirm against the installed version
freqs = ["60T", "30T", "15T", "10T", "5T", "1T", "30s"]
df_freq_dict = {}
for freq in freqs:
    df_freq_dict[freq] = make_df_with_freq(df, freq)

## create data for classification 

In [27]:
def make_univ_classif_data(df, x_var, y_var, seed=42):
    """Build univariate classification data and split it into train and test sets.

    Args:
        df: DataFrame holding the feature and target columns.
        x_var: name of the single feature column.
        y_var: name of the target column; values below 0.5 become class 0,
            everything else becomes class 1.
        seed: ``random_state`` passed to ``train_test_split``.

    Returns:
        dict with the keys ``x_train``, ``x_test``, ``x_all``, ``y_train``,
        ``y_test`` and ``y_all``. The ``x_*`` arrays have a single column,
        the ``y_*`` arrays are 1-D arrays of 0/1 labels. 20% of the rows go
        to the test split.
    """
    # feature as an (n, 1) column matrix, the layout sklearn estimators expect
    xnp = df.loc[:, [x_var]].to_numpy().reshape(-1, 1)

    # binarise the target with a vectorised mask instead of a Python loop
    ynp = np.where(df.loc[:, y_var].to_numpy() < 0.5, 0, 1)

    # assert that shapes are as needed for processing
    assert xnp.shape == (len(xnp), 1)
    assert ynp.shape == (len(ynp),)

    # split into test and training groups
    x_train, x_test, y_train, y_test = train_test_split(
        xnp, ynp, test_size=0.2, random_state=seed
    )

    return {
        "x_train": x_train,
        "x_test": x_test,
        "x_all": xnp,
        "y_train": y_train,
        "y_test": y_test,
        "y_all": ynp,
    }



### sklearn logistic regression 

In [33]:
def run_log_reg(data):
    """Fit a logistic regression on the training split and return its test score.

    ``data`` is a dictionary providing ``x_train``, ``y_train``, ``x_test``
    and ``y_test``.
    """
    # TODO -> assertions that data is in correct format
    classifier = LogisticRegression()
    classifier.fit(data["x_train"], data["y_train"])
    return classifier.score(data["x_test"], data["y_test"])

def run_cv_log_reg(data):
    """Return the 5-fold cross-validated score of a logistic regression.

    ``data`` is a dictionary providing ``x_all`` and ``y_all``.

    Returns a tuple ``(mean, half_width)``, both rounded to 3 decimals:
    the mean of the fold scores and 1.96 times their standard error
    (a 95% interval half-width under a normal approximation).
    """
    # TODO -> assertions that data is in correct format
    # cross_val_score clones the estimator and fits it on each training fold
    # itself, so fitting on the full data set beforehand has no effect on the
    # scores and is left out.
    scores = cross_val_score(LogisticRegression(), data["x_all"], data["y_all"], cv=5)

    # NOTE(review): with only 5 fold scores a t-quantile (about 2.78 for 4
    # degrees of freedom) would give a wider interval than 1.96 -- confirm
    # which one is wanted.
    return (np.round(np.mean(scores), 3), np.round(1.96 * sem(scores), 3))

In [29]:
# check a single configuration: 30-minute data, "Temp C" as the only
# feature, "Window Open" as the target
d = make_univ_classif_data(df_freq_dict["30T"], "Temp C", "Window Open")
scores = run_cv_log_reg(d)
scores

(0.591, 0.0588)

In [30]:
def see_scores(seed=42):
    """Tabulate single-split logistic-regression scores per feature and frequency.

    For every resampled frame in ``df_freq_dict`` and every candidate feature,
    a univariate data set predicting "Window Open" is built with ``seed`` and
    scored with ``run_log_reg``. Returns a DataFrame with one row per feature
    and one column per frequency key; scores are rounded to 3 decimals.
    """
    features = ['Temp C', 'RH %', 'Ambient Temp', 'Ambient RH', 'T_Delta', 'RH_Delta']
    table = {}
    for freq_key, frame in df_freq_dict.items():
        table[freq_key] = {
            feature: np.round(
                run_log_reg(make_univ_classif_data(frame, feature, "Window Open", seed)),
                3,
            )
            for feature in features
        }
    return pd.DataFrame(table)

## cross validated scores of models trained with different features and at various frequency intervals 

In [31]:
def see_cv_scores():
    """Tabulate cross-validated logistic-regression scores per feature and frequency.

    For every resampled frame in ``df_freq_dict`` and every candidate feature,
    a univariate data set predicting "Window Open" is built and passed to
    ``run_cv_log_reg``. Returns a DataFrame with one row per feature and one
    column per frequency key; each cell holds the tuple returned by
    ``run_cv_log_reg``.
    """
    features = ['Temp C', 'RH %', 'Ambient Temp', 'Ambient RH', 'T_Delta', 'RH_Delta']

    def cv_stats(frame, feature):
        # cross-validated statistics for one feature at one frequency
        return run_cv_log_reg(make_univ_classif_data(frame, feature, "Window Open"))

    return pd.DataFrame({
        freq_key: {feature: cv_stats(frame, feature) for feature in features}
        for freq_key, frame in df_freq_dict.items()
    })

In [34]:
# one row per feature, one column per frequency; each cell is the tuple
# returned by run_cv_log_reg
a = see_cv_scores() 
a

Unnamed: 0,60T,30T,15T,10T,5T,1T,30s
Temp C,"(0.591, 0.065)","(0.591, 0.06)","(0.593, 0.059)","(0.592, 0.059)","(0.592, 0.058)","(0.592, 0.059)","(0.592, 0.06)"
RH %,"(0.509, 0.016)","(0.504, 0.017)","(0.503, 0.021)","(0.506, 0.015)","(0.506, 0.015)","(0.507, 0.014)","(0.507, 0.014)"
Ambient Temp,"(0.555, 0.049)","(0.554, 0.046)","(0.554, 0.046)","(0.554, 0.045)","(0.554, 0.046)","(0.554, 0.046)","(0.554, 0.046)"
Ambient RH,"(0.544, 0.033)","(0.545, 0.035)","(0.544, 0.032)","(0.544, 0.031)","(0.544, 0.031)","(0.544, 0.031)","(0.543, 0.032)"
T_Delta,"(0.528, 0.033)","(0.523, 0.031)","(0.522, 0.033)","(0.522, 0.032)","(0.523, 0.033)","(0.523, 0.033)","(0.524, 0.032)"
RH_Delta,"(0.547, 0.039)","(0.547, 0.04)","(0.546, 0.04)","(0.547, 0.04)","(0.546, 0.039)","(0.547, 0.04)","(0.547, 0.039)"


## plot performance of temperature feature across frequencies 

In [19]:
def see_freq_perf():
    """Plot the cross-validated "Temp C" score for each resampling frequency.

    Takes the "Temp C" row of ``see_cv_scores()``, splits each cell into its
    mean and error component and draws the mean as a line with a shaded band
    from ``mean - error`` to ``mean + error``. The figure is shown, nothing
    is returned.
    """
    temp_scores = see_cv_scores().loc["Temp C", :]
    means = [cell[0] for cell in temp_scores]
    errors = [cell[1] for cell in temp_scores]
    upper = [mean + err for mean, err in zip(means, errors)]
    lower = [mean - err for mean, err in zip(means, errors)]

    # axis labels, listed in the same order as the frequency columns
    labels = ["1hr", "30mins", "15mins", "10mins", "5mins", "1min", "30sec"]

    mean_trace = go.Scatter(
        name='Average Score - 5 Fold Cross Validation',
        x=labels,
        y=means,
        mode='lines',
        line={"color": 'rgb(31, 119, 180)'},
    )
    # invisible upper edge of the band; the lower trace fills up to it
    upper_trace = go.Scatter(
        name='Upper Bound',
        x=labels,
        y=upper,
        mode='lines',
        marker={"color": "#444"},
        line={"width": 0},
        showlegend=False,
    )
    lower_trace = go.Scatter(
        name='Lower Bound',
        x=labels,
        y=lower,
        mode='lines',
        marker={"color": "#444"},
        line={"width": 0},
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty',
        showlegend=False,
    )

    fig = go.Figure([mean_trace, upper_trace, lower_trace])
    fig.update_layout(
        yaxis_title='Model Accuracy',
        title='Logistic Regression Performance for Various Time Intervals',
        hovermode="x",
    )
    fig.show()

In [35]:
see_freq_perf()

# See performance of model

In [36]:
df.head()

Unnamed: 0,DateTime,Temp C,RH %,Room,Ambient Temp,Ambient RH,Window Open,T_Delta,RH_Delta
0,2022-07-20 07:20:00,23.189,51.268,0,14.333333,30.555556,1,8.855667,20.712444
1,2022-07-20 07:20:30,23.189,51.362,0,14.333333,30.555556,1,8.855667,20.806444
2,2022-07-20 07:21:00,23.189,51.488,0,14.333333,30.555556,1,8.855667,20.932444
3,2022-07-20 07:21:30,23.189,51.457,0,14.333333,30.555556,1,8.855667,20.901444
4,2022-07-20 07:22:00,23.165,51.517,0,14.333333,30.555556,1,8.831667,20.961444


In [None]:
# for all time, plot temperature, then plot window open or closed (in a separate trace); separate graphs for room 0 vs room 1