In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np

# scikit-learn: A machine learning library
from sklearn.model_selection import train_test_split
# train_test_split is used to split our data into training and testing sets, which is crucial for evaluating our model's performance on unseen data.

from sklearn.preprocessing import StandardScaler
# StandardScaler is used to normalize our feature data. This is important because KNN is sensitive to the scale of the features.

from sklearn.neighbors import KNeighborsClassifier
# This is the implementation of the K-Nearest Neighbors algorithm we'll use for classification.

from sklearn.metrics import classification_report, confusion_matrix
# These functions help us evaluate the performance of our classifier. They provide metrics like precision, recall, and F1-score.

In [5]:
# Read the sample CSV file into a DataFrame
csv_path = 'sample_recs_data_knn.csv'
data = pd.read_csv(csv_path)

In [6]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Building_id  100 non-null    int64  
 1   state        100 non-null    object 
 2   HDD65        99 non-null     float64
 3   TOTSQFT_EN   99 non-null     float64
 4   WINDOWS      99 non-null     float64
 5   BTUELCOL     99 non-null     float64
 6   TYPETHERM    99 non-null     float64
dtypes: float64(5), int64(1), object(1)
memory usage: 5.6+ KB


## Preprocessing data

In [7]:
# Report how many values are missing in each column of the raw frame
nan_counts_before = data.isna().sum()
print("Number of NaN values in each column before cleaning:")
print(nan_counts_before)

# Drop every row that contains at least one missing value.
# The result gets its own name so the raw frame stays available.
data_cleaned = data.dropna()

# Compare row counts to see how many rows were discarded
rows_before, rows_after = len(data), len(data_cleaned)
print(f"\nNumber of rows before cleaning: {rows_before}")
print(f"Number of rows after cleaning: {rows_after}")

# Confirm that no missing values remain after cleaning
nan_counts_after = data_cleaned.isna().sum()
print("\nNumber of NaN values in each column after cleaning:")
print(nan_counts_after)

Number of NaN values in each column before cleaning:
Building_id    0
state          0
HDD65          1
TOTSQFT_EN     1
WINDOWS        1
BTUELCOL       1
TYPETHERM      1
dtype: int64

Number of rows before cleaning: 100
Number of rows after cleaning: 99

Number of NaN values in each column after cleaning:
Building_id    0
state          0
HDD65          0
TOTSQFT_EN     0
WINDOWS        0
BTUELCOL       0
TYPETHERM      0
dtype: int64


## kNN Binary Classification

In [8]:
# Feature matrix (X): the four numeric predictor columns
feature_cols = ['HDD65', 'TOTSQFT_EN', 'WINDOWS', 'BTUELCOL']
X = data_cleaned[feature_cols]

# Binary target (y): 1 if the thermostat is smart (TYPETHERM == 3), 0 otherwise
y = data_cleaned['TYPETHERM'].eq(3).astype(int)

In [9]:
# Split the data into training and testing sets
# test_size=0.4 holds out 40% of the rows for testing; the remaining 60% are used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=2)
# random_state ensures reproducibility of the split

In [10]:
# Scale the features
# StandardScaler normalizes each feature by removing the mean and scaling to unit variance.
# The scaling parameters (mean and std) are learned from the training data only;
# fitting on the test data as well would leak information about the test set
# into the scaling process.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the training-set parameters to both splits so they share the same scale
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Build the KNN classifier and train it in one step (fit returns the fitted estimator).
# n_neighbors=3 means each prediction is a vote among the 3 closest training samples;
# the value can be adjusted.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)

# Predict the class of every (scaled) test sample
y_pred = knn.predict(X_test_scaled)

In [12]:
# Evaluate the model on the held-out test set
# Confusion matrix: rows are the true classes, columns the predicted classes, so for
# this binary problem it holds true negatives, false positives, false negatives and
# true positives.
binary_cm = confusion_matrix(y_test, y_pred)
# Classification report: precision, recall, F1-score and support for each class
binary_report = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(binary_cm)

print("\nClassification Report:")
print(binary_report)

Confusion Matrix:
[[33  1]
 [ 6  0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.97      0.90        34
           1       0.00      0.00      0.00         6

    accuracy                           0.82        40
   macro avg       0.42      0.49      0.45        40
weighted avg       0.72      0.82      0.77        40



In [13]:
# Predict for a new house
new_house = np.array([[3000, 2000, 3, 10000]])  # Example values, in the same column order as X
# The scaler was fitted on a DataFrame, so label the sample with the training column
# names before transforming it. This ties each value to its feature explicitly instead
# of relying on position alone, and avoids scikit-learn's feature-name mismatch warning.
new_house_df = pd.DataFrame(new_house, columns=X.columns)
# New data must be scaled with the same scaler (training-set mean and std) before predicting
new_house_scaled = scaler.transform(new_house_df)
prediction = knn.predict(new_house_scaled)
print("\nPrediction for new house:")
print("Smart thermostat" if prediction[0] == 1 else "Not a smart thermostat")


Prediction for new house:
Not a smart thermostat




## Multi-class kNN classification

In [14]:
# Keep only the rows whose TYPETHERM is not -2, as an independent copy of the cleaned data.
# Remaining codes -> 0: No thermostat, 1: Manual thermostat,
#                    2: Programmable thermostat, 3: Smart thermostat
has_valid_type = data_cleaned['TYPETHERM'] != -2
data_multiclass = data_cleaned[has_valid_type].copy()

# Show how many rows fall into each remaining thermostat type
type_counts = data_multiclass['TYPETHERM'].value_counts()
print("\nDistribution of TYPETHERM values after removing -2:")
print(type_counts)


Distribution of TYPETHERM values after removing -2:
TYPETHERM
2.0    42
1.0    40
3.0    10
0.0     6
Name: count, dtype: int64


In [15]:
# Select the predictor columns (X_multi) and the thermostat-type label (y_multi)
multi_feature_cols = ['HDD65', 'TOTSQFT_EN', 'WINDOWS', 'BTUELCOL']
X_multi = data_multiclass[multi_feature_cols]
y_multi = data_multiclass['TYPETHERM']

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
split_multi = train_test_split(X_multi, y_multi, test_size=0.3, random_state=15)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = split_multi

In [16]:
# Learn the scaling parameters from the training split only, then apply them to both splits
scaler_multi = StandardScaler().fit(X_train_multi)
X_train_multi_scaled = scaler_multi.transform(X_train_multi)
X_test_multi_scaled = scaler_multi.transform(X_test_multi)

# Train a 3-nearest-neighbours classifier on the scaled training data
knn_multi = KNeighborsClassifier(n_neighbors=3).fit(X_train_multi_scaled, y_train_multi)

# Predict the thermostat type for every test sample
y_pred_multi = knn_multi.predict(X_test_multi_scaled)

In [17]:
# Evaluate the model
print("\nConfusion Matrix:")
# For this multi-class problem the matrix has one row and one column per
# thermostat type, in sorted label order:
# - Rows are the true classes
# - Columns are the predicted classes
# - The diagonal holds the correctly classified test samples
# - Every off-diagonal cell counts one specific kind of misclassification
print(confusion_matrix(y_test_multi, y_pred_multi))
print("\nClassification Report:")
# Provides metrics for each class:
# - Precision: share of the samples predicted as this class that truly belong to it
# - Recall: share of the samples of this class that the model found
# - F1-score: harmonic mean of precision and recall
# - Support: number of test samples of each class
# Some classes are never predicted, which makes their precision undefined.
# zero_division=0 reports 0.0 for those cases explicitly instead of emitting
# UndefinedMetricWarning; the reported numbers are the same as with the default.
print(classification_report(y_test_multi, y_pred_multi, zero_division=0))


Confusion Matrix:
[[0 2 3 0]
 [1 7 3 0]
 [1 8 3 0]
 [0 1 1 0]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         5
         1.0       0.39      0.64      0.48        11
         2.0       0.30      0.25      0.27        12
         3.0       0.00      0.00      0.00         2

    accuracy                           0.33        30
   macro avg       0.17      0.22      0.19        30
weighted avg       0.26      0.33      0.29        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# Example prediction for a new house
print("\nExample prediction for a new house:")
new_house = np.array([[3000, 2000, 3, 10000]])  # Example values, in the same column order as X_multi
# scaler_multi was fitted on a DataFrame, so label the sample with the training column
# names before transforming it. This ties each value to its feature explicitly instead
# of relying on position alone, and avoids scikit-learn's feature-name mismatch warning.
new_house_df = pd.DataFrame(new_house, columns=X_multi.columns)
# Scale with the multi-class scaler (training-set mean and std) before predicting
new_house_scaled = scaler_multi.transform(new_house_df)
prediction = knn_multi.predict(new_house_scaled)
# The prediction is the numeric TYPETHERM code of the majority class among the neighbours
print(f"Predicted thermostat type: {prediction[0]}")


Example prediction for a new house:
Predicted thermostat type: 2.0


