In [6]:
# Import necessary libraries

# pandas: Library for data manipulation and analysis
import pandas as pd
# Used for reading CSV files, data manipulation, and creating DataFrames

# numpy: Library for numerical operations
import numpy as np
# Used for array operations and numerical computations

# scikit-learn: Machine learning library
from sklearn.model_selection import train_test_split
# Used to split data into training and testing sets to evaluate model performance

from sklearn.linear_model import LogisticRegression
# Implementation of logistic regression model for binary classification

from sklearn.metrics import classification_report, confusion_matrix
# Tools for evaluating model performance:
# - classification_report: provides precision, recall, f1-score
# - confusion_matrix: shows true positives, false positives, true negatives, false negatives


In [7]:
# Load and preprocess the data
# Read the CSV file into a pandas DataFrame
data = pd.read_csv('sample_recs_data_LogisticR.csv')

In [8]:
# Data info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Building_id  100 non-null    int64  
 1   state        100 non-null    object 
 2   HDD65        99 non-null     float64
 3   TOTSQFT_EN   99 non-null     float64
 4   WINDOWS      99 non-null     float64
 5   BTUELCOL     99 non-null     float64
 6   TYPETHERM    99 non-null     float64
dtypes: float64(5), int64(1), object(1)
memory usage: 5.6+ KB


In [9]:
# Create binary classification column 'THERMO'
# Convert TYPETHERM values to binary classification:
# - Values 1, 2, 3 become 1 (Yes)
# - Values 0, -2 become 0 (No)
data['THERMO'] = data['TYPETHERM'].apply(lambda x: 1 if x in [1, 2, 3] else 0)

In [10]:
# Display sample of the new classification to verify the transformation
print("First few rows with new THERMO classification:")
print(data[['TYPETHERM', 'THERMO']].head(10))

# Analyze the distribution of our target variable
print("\nDistribution of THERMO classifications:")
print(data['THERMO'].value_counts())
# Show percentages to understand class balance/imbalance
print("\nPercentage distribution:")
print(data['THERMO'].value_counts(normalize=True) * 100)

First few rows with new THERMO classification:
   TYPETHERM  THERMO
0        1.0       1
1        1.0       1
2        1.0       1
3        1.0       1
4        2.0       1
5        3.0       1
6        1.0       1
7        1.0       1
8        2.0       1
9        1.0       1

Distribution of THERMO classifications:
THERMO
1    92
0     8
Name: count, dtype: int64

Percentage distribution:
THERMO
1    92.0
0     8.0
Name: proportion, dtype: float64


In [11]:
# Data cleaning
# Remove rows with missing values to ensure model reliability
data_cleaned = data.dropna()
print("\nRows remaining after removing missing values:", len(data_cleaned))


Rows remaining after removing missing values: 99


In [12]:
# Prepare data for modeling
# X: feature matrix - selected variables that might influence thermostat adoption
X = data_cleaned[['HDD65', 'TOTSQFT_EN', 'WINDOWS', 'BTUELCOL']]
# y: target variable - binary classification of thermostat adoption
y = data_cleaned['THERMO']

In [13]:
# Split data into training and testing sets
# test_size=0.2 means 80% training, 20% testing
# random_state ensures reproducibility of results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [14]:
# Create and train logistic regression model
# random_state ensures reproducibility
model = LogisticRegression(random_state=4)
# Fit the model on training data
model.fit(X_train, y_train)

# Make predictions on test set
y_pred = model.predict(X_test)

In [15]:
# Evaluate model performance
print("\nModel Performance:")

# Create and display confusion matrix
print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)


Model Performance:

Confusion Matrix:
[[ 0  2]
 [ 0 18]]


In [16]:
# Display detailed classification metrics
print("\nClassification Report:")
# Provides metrics for each class:
# - Precision: accuracy of positive predictions
# - Recall: ability to find all positive instances
# - F1-score: harmonic mean of precision and recall
# - Support: number of occurrences of each class
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.90      1.00      0.95        18

    accuracy                           0.90        20
   macro avg       0.45      0.50      0.47        20
weighted avg       0.81      0.90      0.85        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Get the coefficient and feature names
coefficients = model.coef_[0]
intercept = model.intercept_[0]
features = X.columns

In [21]:
# Print the mathematical form of the equation
print("Mathematical form:")
print(f"log(p/(1-p)) = {intercept:.4f}", end='') # end='' prevents newline
for coef, feature in zip(coefficients, features): # zip() pairs coefficients with feature names
    if coef >= 0: # if coefficients are positive
        print(f" + {coef:.4f}*{feature}", end='')
    else: # if coefficients are negative
        print(f" - {abs(coef):.4f}*{feature}", end='')

Mathematical form:
log(p/(1-p)) = 0.8064 + 0.0001*HDD65 + 0.0010*TOTSQFT_EN - 0.2596*WINDOWS + 0.0004*BTUELCOL