In [175]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from joblib import dump

import numpy as np



df1 = pd.read_parquet("validated_data/cleaned1.parquet")
df2 = pd.read_parquet("validated_data/cleaned2.parquet")
df3 = pd.read_parquet("validated_data/cleaned3.parquet")



In [176]:
# Data Cleanings
drop_columns_common = ["building_no", "Fan_on_group", "Cumulative_fan_on_mins", "Date", "Time", "Year"]
drop_columns_df1 = drop_columns_common + ["Slab_temp", "Dew_temp", "Slab_temp_diff", "Dew_temp_diff", "Damper_open_group"]
drop_columns_df2 = drop_columns_common  # Adjusted for columns only in df2
drop_columns_df3 = drop_columns_common + ["Damper_open_group", "Louver_open_group", "Cumulative_louver_open_mins"]

#Conditional Dropping = creates a list of columns to drop
df1 = df1.drop(columns=[col for col in drop_columns_df1 if col in df1.columns]) # Checks to see if the column is actually there first.
df2 = df2.drop(columns=[col for col in drop_columns_df2 if col in df2.columns]) 
df3 = df3.drop(columns=[col for col in drop_columns_df3 if col in df3.columns])

dfs = [df1, df2, df3]

In [177]:
print("Columns in df1:", df1.columns)
print("Columns in df2:", df2.columns)
print("Columns in df3:", df3.columns)



Columns in df1: Index(['Zone_name', 'Datetime', 'Zone_temp', 'Ambient_temp', 'Damper_status',
       'Fan_status', 'Datetime_diff_mins', 'Month', 'Day', 'DOW', 'Season',
       'Zone_temp_diff', 'Ambient_temp_diff', 'Cumulative_damper_open_mins',
       'Faulty'],
      dtype='object')
Columns in df2: Index(['Zone_name', 'Datetime', 'Zone_temp', 'Slab_temp', 'Dew_temp',
       'Ambient_temp', 'Zone_c02', 'Fan_status', 'Datetime_diff_mins', 'Month',
       'Day', 'DOW', 'Season', 'Zone_temp_diff', 'Slab_temp_diff',
       'Dew_temp_diff', 'Ambient_temp_diff', 'Faulty'],
      dtype='object')
Columns in df3: Index(['Zone_name', 'Datetime', 'Zone_temp', 'Slab_temp', 'Dew_temp',
       'Ambient_temp', 'Zone_c02', 'Damper_status', 'Louver_status',
       'Fan_status', 'Datetime_diff_mins', 'Month', 'Day', 'DOW', 'Season',
       'Zone_temp_diff', 'Slab_temp_diff', 'Dew_temp_diff',
       'Ambient_temp_diff', 'Cumulative_damper_open_mins', 'Faulty'],
      dtype='object')


In [178]:
print("Columns in df1 after dropping:")
print(df1.columns)

print("\nColumns in df2 after dropping:")
print(df2.columns)

print("\nColumns in df3 after dropping:")
print(df3.columns)

Columns in df1 after dropping:
Index(['Zone_name', 'Datetime', 'Zone_temp', 'Ambient_temp', 'Damper_status',
       'Fan_status', 'Datetime_diff_mins', 'Month', 'Day', 'DOW', 'Season',
       'Zone_temp_diff', 'Ambient_temp_diff', 'Cumulative_damper_open_mins',
       'Faulty'],
      dtype='object')

Columns in df2 after dropping:
Index(['Zone_name', 'Datetime', 'Zone_temp', 'Slab_temp', 'Dew_temp',
       'Ambient_temp', 'Zone_c02', 'Fan_status', 'Datetime_diff_mins', 'Month',
       'Day', 'DOW', 'Season', 'Zone_temp_diff', 'Slab_temp_diff',
       'Dew_temp_diff', 'Ambient_temp_diff', 'Faulty'],
      dtype='object')

Columns in df3 after dropping:
Index(['Zone_name', 'Datetime', 'Zone_temp', 'Slab_temp', 'Dew_temp',
       'Ambient_temp', 'Zone_c02', 'Damper_status', 'Louver_status',
       'Fan_status', 'Datetime_diff_mins', 'Month', 'Day', 'DOW', 'Season',
       'Zone_temp_diff', 'Slab_temp_diff', 'Dew_temp_diff',
       'Ambient_temp_diff', 'Cumulative_damper_open_mins', 'Faul

In [270]:
# One-Hot Encoding 
df3 = pd.get_dummies(df3, columns=['Zone_name', 'Damper_status', 'DOW', 'Louver_status', 'Season']) # Converts columns into one-hot encoded columns
df1 = pd.get_dummies(df1, columns=['Zone_name', 'DOW', 'Damper_status', 'Season']) # Zone_name becomes Zone_name_Zone1, Zone_name_Zone2  from the three categories
df2 = pd.get_dummies(df2, columns=['Zone_name', 'DOW', 'Season']) 

KeyError: "None of [Index(['Zone_name', 'Damper_status', 'DOW', 'Louver_status', 'Season'], dtype='object')] are in the [columns]"

In [180]:
print("Columns in df3 after encoding:")
print(df3.columns)



Columns in df3 after encoding:
Index(['Datetime', 'Zone_temp', 'Slab_temp', 'Dew_temp', 'Ambient_temp',
       'Zone_c02', 'Fan_status', 'Datetime_diff_mins', 'Month', 'Day',
       ...
       'DOW_3', 'DOW_4', 'DOW_5', 'DOW_6', 'Louver_status_Close',
       'Louver_status_Open', 'Season_1', 'Season_2', 'Season_3', 'Season_4'],
      dtype='object', length=706)


In [181]:
print("\nFirst few rows of df3 after encoding:")
print(df3.head())



First few rows of df3 after encoding:
             Datetime  Zone_temp  Slab_temp  Dew_temp  Ambient_temp  Zone_c02  \
0 2023-01-01 00:02:41        NaN        NaN       NaN          21.4       NaN   
1 2023-01-01 00:13:52        NaN        NaN       NaN          21.0       NaN   
2 2023-01-01 00:25:03        NaN        NaN       NaN          20.7       NaN   
3 2023-01-01 00:36:14        NaN        NaN       NaN          20.4       NaN   
4 2023-01-01 00:47:20        NaN        NaN       NaN          20.1       NaN   

  Fan_status  Datetime_diff_mins  Month  Day  ...  DOW_3  DOW_4  DOW_5  DOW_6  \
0       None                 NaN      1    1  ...  False  False  False   True   
1       None                11.0      1    1  ...  False  False  False   True   
2       None                11.0      1    1  ...  False  False  False   True   
3       None                11.0      1    1  ...  False  False  False   True   
4       None                11.0      1    1  ...  False  False  Fals

In [182]:
print("\nShape of df3 after encoding:", df3.shape)


Shape of df3 after encoding: (1942515, 706)


In [183]:
# Check for columns related to Zone_name to check Hot Encoding
zone_name_columns = [col for col in df3.columns if col.startswith('Zone_name_')]
print("Columns related to Zone_name after encoding:", zone_name_columns)


Columns related to Zone_name after encoding: ['Zone_name_Ahu-G-01', 'Zone_name_G-01', 'Zone_name_G-02', 'Zone_name_G-03', 'Zone_name_G-04', 'Zone_name_G-05', 'Zone_name_G-06', 'Zone_name_G-07', 'Zone_name_G-08', 'Zone_name_L1-01', 'Zone_name_L1-02', 'Zone_name_L1-03', 'Zone_name_L1-04', 'Zone_name_L1-05', 'Zone_name_L1-06', 'Zone_name_L1-07', 'Zone_name_R-01', 'Zone_name_R-02', 'Zone_name_R-03', 'Zone_name_Zone-1', 'Zone_name_Zone-10', 'Zone_name_Zone-11', 'Zone_name_Zone-12', 'Zone_name_Zone-13', 'Zone_name_Zone-14', 'Zone_name_Zone-15', 'Zone_name_Zone-16', 'Zone_name_Zone-17', 'Zone_name_Zone-18', 'Zone_name_Zone-19', 'Zone_name_Zone-2', 'Zone_name_Zone-20', 'Zone_name_Zone-21', 'Zone_name_Zone-22', 'Zone_name_Zone-23', 'Zone_name_Zone-24', 'Zone_name_Zone-25', 'Zone_name_Zone-26', 'Zone_name_Zone-27', 'Zone_name_Zone-28', 'Zone_name_Zone-3', 'Zone_name_Zone-4', 'Zone_name_Zone-5', 'Zone_name_Zone-6', 'Zone_name_Zone-7', 'Zone_name_Zone-8', 'Zone_name_Zone-9']


In [184]:
# Converting Datetime into datetime objects using pandas

df1['Datetime'] = pd.to_datetime(df1['Datetime'])
df2['Datetime'] = pd.to_datetime(df2['Datetime'])
df3['Datetime'] = pd.to_datetime(df3['Datetime'])


In [185]:
# Extracting DOY from the Datetime Column; creates new column Day_of_year in each DF

df1['Day_of_Year'] = df1['Datetime'].dt.dayofyear
df2['Day_of_Year'] = df2['Datetime'].dt.dayofyear
df3['Day_of_Year'] = df3['Datetime'].dt.dayofyear

In [186]:
# creates new col Minutes_Past_midnight in all df that represents number of minutes have passed since midnight

df1['Minutes_Past_Midnight'] = df1['Datetime'].dt.hour * 60 + df1['Datetime'].dt.minute
df2['Minutes_Past_Midnight'] = df2['Datetime'].dt.hour * 60 + df2['Datetime'].dt.minute
df3['Minutes_Past_Midnight'] = df3['Datetime'].dt.hour * 60 + df3['Datetime'].dt.minute

In [187]:
print(df1.head())

             Datetime  Zone_temp  Ambient_temp Fan_status  Datetime_diff_mins  \
0 2023-03-23 14:21:08        NaN           NaN       None                 NaN   
1 2023-03-23 14:31:08        NaN           NaN       None                10.0   
2 2023-03-23 14:41:09        NaN           NaN       None                10.0   
3 2023-03-23 14:51:12        NaN           NaN       None                10.0   
4 2023-03-23 15:01:08        NaN           NaN       None                 9.0   

   Month  Day  Zone_temp_diff  Ambient_temp_diff  Cumulative_damper_open_mins  \
0      3   23             NaN                NaN                          NaN   
1      3   23             NaN                NaN                          NaN   
2      3   23             NaN                NaN                          NaN   
3      3   23             NaN                NaN                          NaN   
4      3   23             NaN                NaN                          NaN   

   ...  Damper_status_93.0

In [188]:
print(df2.head())

             Datetime  Zone_temp  Slab_temp  Dew_temp  Ambient_temp  Zone_c02  \
0 2023-03-27 21:34:59      19.97        NaN       NaN           NaN       NaN   
1 2023-03-27 21:41:31      19.94        NaN       NaN           NaN       NaN   
2 2023-03-27 21:43:07      19.94        NaN       NaN           NaN       NaN   
3 2023-03-27 21:46:11      19.93        NaN       NaN           NaN       NaN   
4 2023-03-27 21:49:07      19.93        NaN       NaN           NaN       NaN   

  Fan_status  Datetime_diff_mins  Month  Day  ...  DOW_3  DOW_4  DOW_5  DOW_6  \
0        Off                 NaN      3   27  ...  False  False  False  False   
1        Off                 6.0      3   27  ...  False  False  False  False   
2        Off                 1.0      3   27  ...  False  False  False  False   
3        Off                 3.0      3   27  ...  False  False  False  False   
4        Off                 2.0      3   27  ...  False  False  False  False   

   Season_1  Season_2  Sea

In [189]:
print(df3.head())

             Datetime  Zone_temp  Slab_temp  Dew_temp  Ambient_temp  Zone_c02  \
0 2023-01-01 00:02:41        NaN        NaN       NaN          21.4       NaN   
1 2023-01-01 00:13:52        NaN        NaN       NaN          21.0       NaN   
2 2023-01-01 00:25:03        NaN        NaN       NaN          20.7       NaN   
3 2023-01-01 00:36:14        NaN        NaN       NaN          20.4       NaN   
4 2023-01-01 00:47:20        NaN        NaN       NaN          20.1       NaN   

  Fan_status  Datetime_diff_mins  Month  Day  ...  DOW_5  DOW_6  \
0       None                 NaN      1    1  ...  False   True   
1       None                11.0      1    1  ...  False   True   
2       None                11.0      1    1  ...  False   True   
3       None                11.0      1    1  ...  False   True   
4       None                11.0      1    1  ...  False   True   

   Louver_status_Close  Louver_status_Open  Season_1  Season_2  Season_3  \
0                False            

In [190]:
# Removing the datetime column from each data
del df1["Datetime"]
del df2["Datetime"]
del df3["Datetime"]


In [191]:
X_df1 = df1.drop(columns=['Fan_status']) # Featuring
y_df1 = df1['Fan_status'] # Target

In [192]:
X_train_df1, X_temp_df1, y_train_df1, y_temp_df1 = train_test_split(X_df1, y_df1, test_size=0.2, random_state=42)


In [193]:
# Re-Split
X_val_df1, X_test_df1, y_val_df1, y_test_df1 = train_test_split(X_temp_df1, y_temp_df1, test_size=0.5, random_state=42)

In [194]:
X_df2 = df2.drop(columns=['Fan_status'])
y_df2 = df2['Fan_status']

In [195]:
X_train_df2, X_temp_df2, y_train_df2, y_temp_df2 = train_test_split(X_df2, y_df2, test_size=0.2, random_state=42)


In [196]:
# Resplit
X_val_df2, X_test_df2, y_val_df2, y_test_df2 = train_test_split(X_temp_df2, y_temp_df2, test_size=0.5, random_state=42)

In [197]:
X_df3 = df3.drop(columns=['Fan_status'])
y_df3 = df3['Fan_status']

In [198]:
X_train_df3, X_temp_df3, y_train_df3, y_temp_df3 = train_test_split(X_df3, y_df3, test_size=0.2, random_state=42)

In [199]:
# Resplit
X_val_df3, X_test_df3, y_val_df3, y_test_df3 = train_test_split(X_temp_df3, y_temp_df3, test_size=0.5, random_state=42)

In [200]:
# df1
print("df1:")
print(f"Training set size: {X_train_df1.shape[0]} rows, {y_train_df1.shape[0]} target rows")
print(f"Validation set size: {X_val_df1.shape[0]} rows, {y_val_df1.shape[0]} target rows")
print(f"Testing set size: {X_test_df1.shape[0]} rows, {y_test_df1.shape[0]} target rows")
print(f"Total rows: {X_train_df1.shape[0] + X_val_df1.shape[0] + X_test_df1.shape[0]} rows")

# df2
print("\ndf2:")
print(f"Training set size: {X_train_df2.shape[0]} rows, {y_train_df2.shape[0]} target rows")
print(f"Validation set size: {X_val_df2.shape[0]} rows, {y_val_df2.shape[0]} target rows")
print(f"Testing set size: {X_test_df2.shape[0]} rows, {y_test_df2.shape[0]} target rows")
print(f"Total rows: {X_train_df2.shape[0] + X_val_df2.shape[0] + X_test_df2.shape[0]} rows")

# df3
print("\ndf3:")
print(f"Training set size: {X_train_df3.shape[0]} rows, {y_train_df3.shape[0]} target rows")
print(f"Validation set size: {X_val_df3.shape[0]} rows, {y_val_df3.shape[0]} target rows")
print(f"Testing set size: {X_test_df3.shape[0]} rows, {y_test_df3.shape[0]} target rows")
print(f"Total rows: {X_train_df3.shape[0] + X_val_df3.shape[0] + X_test_df3.shape[0]} rows")


df1:
Training set size: 1677720 rows, 1677720 target rows
Validation set size: 209715 rows, 209715 target rows
Testing set size: 209715 rows, 209715 target rows
Total rows: 2097150 rows

df2:
Training set size: 416750 rows, 416750 target rows
Validation set size: 52094 rows, 52094 target rows
Testing set size: 52094 rows, 52094 target rows
Total rows: 520938 rows

df3:
Training set size: 1554012 rows, 1554012 target rows
Validation set size: 194251 rows, 194251 target rows
Testing set size: 194252 rows, 194252 target rows
Total rows: 1942515 rows


In [201]:
df1['Fan_status'].value_counts()

Fan_status
On     779635
Off    278873
Name: count, dtype: int64

In [202]:
df2['Fan_status'].value_counts()

Fan_status
Off    126249
On      18456
Name: count, dtype: int64

In [203]:
df3['Fan_status'].value_counts()

Fan_status
Off    467525
On     152400
Name: count, dtype: int64

In [204]:
count_df1 = df1['Fan_status'].value_counts()
num_off_df1 = count_df1['Off']
max_on_df1 = num_off_df1 * 3

In [205]:
count_df2 = df2['Fan_status'].value_counts()
num_on_df2 = count_df2['On']
max_off_df2 = num_on_df2 * 3

In [206]:
count_df3 = df3['Fan_status'].value_counts()
num_on_df3 = count_df3['On']
max_off_df3 = num_on_df3 * 3

In [207]:
# Filtering rows where fanstatus is on AKA Extracting majority class
major_class_df1 = df1[df1['Fan_status'] == 'On'] # Assuming that 'On' is the majority class for df1
major_class_df2 = df2[df2['Fan_status'] == 'Off'] # Assuming 'off" is majority
major_class_df3 = df3[df3['Fan_status'] == 'Off'] 


In [None]:
# Undersampling majority class 

sample_major_df1 = major_class_df1.sample(n=min(max_on_df1, len(major_class_df1)), random_state=42) # Randomly Samples n rows  from major class
sample_major_df2 = major_class_df2.sample(n=min(max_off_df2, len(major_class_df2)), random_state=42) 
sample_major_df3 = major_class_df3.sample(n=min(max_off_df3, len(major_class_df3)), random_state=42)

In [None]:
# Identifying rows to Drop to proceed with Undersampling

drop_indices_df1 = major_class_df1.index.difference(sample_major_df1.index) 
drop_indices_df2 = major_class_df2.index.difference(sample_major_df2.index)
drop_indices_df3 = major_class_df3.index.difference(sample_major_df3.index)
# .index retrieves row indices of majorclass DF
# .difference finds the difference between two sets of indices, which identifies which rows are in the first DF but not in the subset


In [210]:
# Dropping Rows from Original DFs AKA Undersampling here

df1.drop(index=drop_indices_df1, inplace=True) # Drops rows whose indices not sampled in samplemajor
df2.drop(index=drop_indices_df2, inplace=True) 
df3.drop(index=drop_indices_df3, inplace=True)
# inplace=True modifies og dataframe in place; changes are applied directly without needing to assign result to new variable


In [211]:
df1['Fan_status'].value_counts()

Fan_status
On     779635
Off    278873
Name: count, dtype: int64

In [212]:
df2['Fan_status'].value_counts()

Fan_status
Off    55368
On     18456
Name: count, dtype: int64

In [213]:
df3['Fan_status'].value_counts()

Fan_status
Off    457200
On     152400
Name: count, dtype: int64

In [214]:
# Check unique values in the 'Faulty' column
print(X_train_df1['Faulty'].unique())


[False  True]


In [215]:
# Removing Faulty Data 


# Filters out rows where Faulty is True keep only the ones where it is false
X_train_df1 = X_train_df1[X_train_df1['Faulty'] == False]
X_train_df2 = X_train_df2[X_train_df2['Faulty'] == False]
X_train_df3 = X_train_df3[X_train_df3['Faulty'] == False]

# Ensures that y_train_df1 matches the filtered X_train_df1
y_train_df1 = y_train_df1[X_train_df1.index]
y_train_df2 = y_train_df2[X_train_df2.index]
y_train_df3 = y_train_df3[X_train_df3.index]

# Filters out rows where Faulty is True for validation and test sets too
X_val_df1 = X_val_df1[X_val_df1['Faulty'] == False]
X_val_df2 = X_val_df2[X_val_df2['Faulty'] == False]
X_val_df3 = X_val_df3[X_val_df3['Faulty'] == False]

# Ensures that y_val_df1 matches the filtered X_val_df1
y_val_df1 = y_val_df1[X_val_df1.index]
y_val_df2 = y_val_df2[X_val_df2.index] 
y_val_df3 = y_val_df3[X_val_df3.index]


X_test_df1 = X_test_df1[X_test_df1['Faulty'] == False]
X_test_df2 = X_test_df2[X_test_df2['Faulty'] == False]
X_test_df3 = X_test_df3[X_test_df3['Faulty'] == False]


y_test_df1 = y_test_df1[X_test_df1.index]
y_test_df2 = y_test_df2[X_test_df2.index] 
y_test_df3 = y_test_df3[X_test_df3.index]


In [216]:
# Drop the 'Faulty' column as it's no longer needed
X_train_df1 = X_train_df1.drop(columns=['Faulty'])
X_val_df1 = X_val_df1.drop(columns=['Faulty'])
X_test_df1 = X_test_df1.drop(columns=['Faulty'])

X_train_df2 = X_train_df2.drop(columns=['Faulty'])
X_val_df2 = X_val_df2.drop(columns=['Faulty'])
X_test_df2 = X_test_df2.drop(columns=['Faulty'])

X_train_df3 = X_train_df3.drop(columns=['Faulty'])
X_val_df3 = X_val_df3.drop(columns=['Faulty'])
X_test_df3 = X_test_df3.drop(columns=['Faulty'])


In [217]:
print(X_train_df1.shape, y_train_df1.shape)
print(X_val_df1.shape, y_val_df1.shape)
print(X_test_df1.shape, y_test_df1.shape)


(846968, 98) (846968,)
(105912, 98) (105912,)
(105628, 98) (105628,)


In [218]:
# Separating imputers to ensure each imputer is fitted based on the individual training data's means 
imputer_df1 = SimpleImputer(strategy='mean')
imputer_df2 = SimpleImputer(strategy='mean')
imputer_df3 = SimpleImputer(strategy='mean')



In [219]:
# Imputing missing values in X_train, X_val, and X_test for each DF
X_train_df1 = imputer_df1.fit_transform(X_train_df1) #fit_transform calculates mean of each feature replaces missing values in the train set
X_val_df1 = imputer_df1.transform(X_val_df1) # transform() uses same means calc from the train to replace missing values in val and test
X_test_df1 = imputer_df1.transform(X_test_df1)

X_train_df2 = imputer_df2.fit_transform(X_train_df2)
X_val_df2 = imputer_df2.transform(X_val_df2)
X_test_df2 = imputer_df2.transform(X_test_df2)

X_train_df3 = imputer_df3.fit_transform(X_train_df3)
X_val_df3 = imputer_df3.transform(X_val_df3)
X_test_df3 = imputer_df3.transform(X_test_df3)


In [220]:
# Check for missing values in y_train_df1
missing_values = y_train_df1.isnull().sum()
print(f"Missing values in y_train_df1: {missing_values}")


Missing values in y_train_df1: 0


In [221]:
rf_model_df1 = RandomForestClassifier(n_estimators=100, max_depth=15, max_features='sqrt', min_samples_split=10, min_samples_leaf=4, random_state=42, n_jobs=-1)
rf_model_df2 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model_df3 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)


In [222]:
# Train on df1
rf_model_df1.fit(X_train_df1, y_train_df1)

# Train on df2
rf_model_df2.fit(X_train_df2, y_train_df2)

# Train on df3
rf_model_df3.fit(X_train_df3, y_train_df3)


In [223]:
# Predicts on the validation set for df1
y_val_pred_df1 = rf_model_df1.predict(X_val_df1)


In [224]:
# Removing rows where y_val_df1 has None values
valid_indices = y_val_df1.notnull()

# Filtering both the true labels and the predicted labels to remove None values
y_val_df1 = y_val_df1[valid_indices]
y_val_pred_df1 = y_val_pred_df1[valid_indices]

In [225]:
# Checking the lengths of the true and predicted labels
print("Length of y_val_df1 (true labels):", len(y_val_df1))
print("Length of y_val_pred_df1 (predicted labels):", len(y_val_pred_df1))


Length of y_val_df1 (true labels): 105912
Length of y_val_pred_df1 (predicted labels): 105912


In [226]:
# Checking the unique values in the true labels
print("Unique values in y_val_df1 (true labels):", y_val_df1.unique())

# Checking the unique values in the predicted labels
print("Unique values in y_val_pred_df1 (predicted labels):", np.unique(y_val_pred_df1))


Unique values in y_val_df1 (true labels): ['Off' 'On']
Unique values in y_val_pred_df1 (predicted labels): ['Off' 'On']


In [227]:
# Checking the data types
print("Data type of y_val_df1:", y_val_df1.dtype)
print("Data type of y_val_pred_df1:", y_val_pred_df1.dtype)


Data type of y_val_df1: object
Data type of y_val_pred_df1: object


In [228]:
# Prints classification report and confusion matrix for df1
print("Validation Set Evaluation for df1:")
print(classification_report(y_val_df1, y_val_pred_df1))
print(confusion_matrix(y_val_df1, y_val_pred_df1))

Validation Set Evaluation for df1:
              precision    recall  f1-score   support

         Off       1.00      0.78      0.88     28036
          On       0.93      1.00      0.96     77876

    accuracy                           0.94    105912
   macro avg       0.96      0.89      0.92    105912
weighted avg       0.95      0.94      0.94    105912

[[21862  6174]
 [   70 77806]]


In [229]:
dump(rf_model_df1, 'trained_rf_model.joblib')  # Saving the trained random forest model
# X_train_df1.to_parquet('cleaned_df1.parquet', index=False)  # Saving the cleaned training dataframe

['trained_rf_model.joblib']

In [230]:
y_val_pred_df2 = rf_model_df2.predict(X_val_df2)

In [231]:
# Removes rows where y_val_df1 has None values
valid_indices = y_val_df2.notnull()

# Filters both the true labels and the predicted labels to remove None values
y_val_df2 = y_val_df2[valid_indices]
y_val_pred_df2 = y_val_pred_df2[valid_indices]

In [232]:
print("Validation Set Evaluation for Df2:")
print(classification_report(y_val_df2, y_val_pred_df2))
print(confusion_matrix(y_val_df2, y_val_pred_df2))

Validation Set Evaluation for Df2:
              precision    recall  f1-score   support

         Off       0.98      0.99      0.99     12096
          On       0.96      0.86      0.91      1740

    accuracy                           0.98     13836
   macro avg       0.97      0.93      0.95     13836
weighted avg       0.98      0.98      0.98     13836

[[12031    65]
 [  247  1493]]


In [233]:
y_val_pred_df3 = rf_model_df3.predict(X_val_df3)

In [234]:
# Removes rows where y_val_df1 has None values
valid_indices = y_val_df3.notnull()

# Filters both the true labels and the predicted labels to remove None values
y_val_df3 = y_val_df3[valid_indices]
y_val_pred_df3 = y_val_pred_df3[valid_indices]

In [235]:
print("Validation Set Evaluation for DF3:")
print(classification_report(y_val_df3, y_val_pred_df3))
print(confusion_matrix(y_val_df3, y_val_pred_df3))


Validation Set Evaluation for DF3:
              precision    recall  f1-score   support

         Off       0.99      1.00      0.99     46636
          On       0.99      0.96      0.98     15190

    accuracy                           0.99     61826
   macro avg       0.99      0.98      0.98     61826
weighted avg       0.99      0.99      0.99     61826

[[46482   154]
 [  590 14600]]


In [236]:
scaler = StandardScaler()

In [237]:
label_encoder = LabelEncoder()

In [238]:
y_train_df1 = label_encoder.fit_transform(y_train_df1)
y_val_df1 = label_encoder.transform(y_val_df1)
y_test_df1 = label_encoder.transform(y_test_df1)

y_train_df2 = label_encoder.fit_transform(y_train_df2)
y_val_df2 = label_encoder.transform(y_val_df2)
y_test_df2 = label_encoder.transform(y_test_df2)

y_train_df3 = label_encoder.fit_transform(y_train_df3)
y_val_df3 = label_encoder.transform(y_val_df3)
y_test_df3 = label_encoder.transform(y_test_df3)

In [239]:
# Applying scaling to training, validation, and test sets
X_train_df1 = scaler.fit_transform(X_train_df1)
X_val_df1 = scaler.transform(X_val_df1)
X_test_df1 = scaler.transform(X_test_df1)

scaler.fit(X_train_df2) # Fit the scaler on training set before transforming.
# making sure validation and test sets are scaled with same stats as training set
X_train_df2 = scaler.transform(X_train_df2)
X_val_df2 = scaler.transform(X_val_df2)
X_test_df2 = scaler.transform(X_test_df2)

scaler.fit(X_train_df3)
X_train_df3 = scaler.transform(X_train_df3)
X_val_df3 = scaler.transform(X_val_df3)
X_test_df3 = scaler.transform(X_test_df3)

In [240]:
param_grid = {
    'n_neighbors': [3],  # Try different numbers of neighbors
    'weights': ['uniform'],  # Try both uniform and distance-based weighting
    'metric': ['euclidean']  # Try different distance metrics
}

In [241]:
# Initializes KNeighborsClassifier
knn_model_df1 = KNeighborsClassifier()  # Will be tuning the n_neighbors value
knn_model_df2 = KNeighborsClassifier()
knn_model_df3 = KNeighborsClassifier()

In [242]:
grid_search1 = GridSearchCV(knn_model_df1, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search2 = GridSearchCV(knn_model_df2, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search3 = GridSearchCV(knn_model_df3, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

In [243]:
print(f"Shape of X_val_df1: {X_val_df1.shape}")
print(f"Shape of y_val_df1: {y_val_df1.shape}")


Shape of X_val_df1: (105912, 98)
Shape of y_val_df1: (105912,)


In [244]:
# Checking for missing values in the NumPy arrays
#print(np.isnan(X_val_df1).sum())   # gives total count of nan values as numpy
#print(y_val_df1.isnull().sum())  # checks for nan values in panda series

In [245]:
grid_search1.fit(X_train_df1, y_train_df1)


Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [246]:
grid_search2.fit(X_train_df2, y_train_df2)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [247]:
grid_search3.fit(X_train_df3, y_train_df3)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [248]:
print("Best parameters found for df1: ", grid_search1.best_params_)
print("Best parameters for df2: ", grid_search2.best_params_)
print("Best parameters for df3: ", grid_search3.best_params_)

Best parameters found for df1:  {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best parameters for df2:  {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best parameters for df3:  {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}


In [249]:
print("Length of X_val_df1 (features):", len(X_val_df1))
print("Length of y_val_df1 (labels):", len(y_val_df1))


Length of X_val_df1 (features): 105912
Length of y_val_df1 (labels): 105912


In [250]:
print("Number of missing values in X_val_df1:", pd.isnull(X_val_df1).sum())
print("Number of missing values in y_val_df1:", pd.isnull(y_val_df1).sum())


Number of missing values in X_val_df1: 0
Number of missing values in y_val_df1: 0


In [251]:
# Uses the best estimator to predict on the validation set
best_knn1 = grid_search1.best_estimator_
y_val_pred_knn_df1 = best_knn1.predict(X_val_df1)

In [252]:
best_knn2 = grid_search2.best_estimator_
y_val_pred_knn_df2 = best_knn2.predict(X_val_df2)

In [253]:
best_knn3 = grid_search3.best_estimator_
y_val_pred_knn_df3 = best_knn3.predict(X_val_df3)

In [254]:
print("KNN Validation Set Evaluation for df1:")
print(classification_report(y_val_df1, y_val_pred_knn_df1))
print(confusion_matrix(y_val_df1, y_val_pred_knn_df1))

KNN Validation Set Evaluation for df1:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98     28036
           1       0.99      1.00      0.99     77876

    accuracy                           0.99    105912
   macro avg       0.99      0.99      0.99    105912
weighted avg       0.99      0.99      0.99    105912

[[27511   525]
 [  327 77549]]


In [255]:
dump(best_knn1, 'trained_knn_model.joblib')  # Saving the trained random forest model
df1.to_parquet('cleaned_df1.parquet', index=False)  # Saving the cleaned training dataframe

In [256]:
print("KNN Validation Set Evaluation for df2:")
print(classification_report(y_val_df2, y_val_pred_knn_df2))
print(confusion_matrix(y_val_df2, y_val_pred_knn_df2))

KNN Validation Set Evaluation for df2:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98     12096
           1       0.87      0.81      0.84      1740

    accuracy                           0.96     13836
   macro avg       0.92      0.89      0.91     13836
weighted avg       0.96      0.96      0.96     13836

[[11878   218]
 [  336  1404]]


In [257]:
print("KNN Validation Set Evaluation for df3:")
print(classification_report(y_val_df3, y_val_pred_knn_df3))
print(confusion_matrix(y_val_df3, y_val_pred_knn_df3))

KNN Validation Set Evaluation for df3:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     46636
           1       0.97      0.94      0.96     15190

    accuracy                           0.98     61826
   macro avg       0.98      0.97      0.97     61826
weighted avg       0.98      0.98      0.98     61826

[[46212   424]
 [  837 14353]]


In [258]:
param_grid_xgb = {
    'n_estimators': [10, 50],  # Number of boosting rounds
    'max_depth': [3, 5],         # Depth of each tree
    'learning_rate': [0.1],  # Learning rate
    'subsample': [0.8],         # Fraction of samples used per tree
    'colsample_bytree': [0.8]   # Fraction of features used per tree
}


In [259]:
grid_search_xgb_df1 = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid_xgb,
    cv=3,  # 5-fold cross-validation here
    scoring='accuracy',  # Measuring accuracy can be changed easily
    verbose=1
)

grid_search_xgb_df2 = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid_xgb,
    cv=3,  # 5-fold cross-validation here
    scoring='accuracy',  # Measuring accuracy can be changed easily
    verbose=1
)

grid_search_xgb_df3 = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid_xgb,
    cv=3,  # 5-fold cross-validation here
    scoring='accuracy',  # Measuring accuracy can be changed easily
    verbose=1
)



In [260]:
# fitting grid search model onto the training data for df1
grid_search_xgb_df1.fit(X_train_df1, y_train_df1)



Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [261]:
grid_search_xgb_df2.fit(X_train_df2, y_train_df2)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [262]:
grid_search_xgb_df3.fit(X_train_df3, y_train_df3)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [263]:
# Best parameters for df1 will be displayed here
print("Best parameters for df1 XGB: ", grid_search_xgb_df1.best_params_)

Best parameters for df1 XGB:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.8}


In [264]:
print("Best parameters for df2 XGB: ", grid_search_xgb_df2.best_params_)

Best parameters for df2 XGB:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.8}


In [265]:
print("Best parameters for df3 XGB: ", grid_search_xgb_df3.best_params_)

Best parameters for df3 XGB:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.8}


In [266]:
# Best model for df1
best_xgb_df1 = grid_search_xgb_df1.best_estimator_

# Predicting on the validation set for df1
y_val_pred_xgb_df1 = best_xgb_df1.predict(X_val_df1)

In [267]:
# Best model for df2
best_xgb_df2 = grid_search_xgb_df2.best_estimator_

# Predicting on the validation set for df3
y_val_pred_xgb_df2 = best_xgb_df2.predict(X_val_df2)

In [268]:
# Best model for df3
best_xgb_df3 = grid_search_xgb_df3.best_estimator_

# Predicting on the validation set for df3
y_val_pred_xgb_df3 = best_xgb_df3.predict(X_val_df3)

In [269]:
# Validation set evaluation for df1
print("XGB Validation Set Evaluation for df1 with best parameters:")
print(classification_report(y_val_df1, y_val_pred_xgb_df1))
print(confusion_matrix(y_val_df1, y_val_pred_xgb_df1))

# Validation set evaluation for df2
print("XGB Validation Set Evaluation for df2:")
print(classification_report(y_val_df2, y_val_pred_xgb_df2))
print(confusion_matrix(y_val_df2, y_val_pred_xgb_df2))

# Validation set evaluation for df3
print("XGB Validation Set Evaluation for df3:")
print(classification_report(y_val_df3, y_val_pred_xgb_df3))
print(confusion_matrix(y_val_df3, y_val_pred_xgb_df3))


XGB Validation Set Evaluation for df1 with best parameters:
              precision    recall  f1-score   support

           0       0.99      0.84      0.91     28036
           1       0.94      1.00      0.97     77876

    accuracy                           0.95    105912
   macro avg       0.97      0.92      0.94    105912
weighted avg       0.96      0.95      0.95    105912

[[23495  4541]
 [  258 77618]]
XGB Validation Set Evaluation for df2:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     12096
           1       0.83      0.37      0.51      1740

    accuracy                           0.91     13836
   macro avg       0.87      0.68      0.73     13836
weighted avg       0.90      0.91      0.90     13836

[[11964   132]
 [ 1100   640]]
XGB Validation Set Evaluation for df3:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     46636
           1       0.92      0.86    