In [1]:
# Import libraries
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

In [2]:
# Load data
df = pd.read_csv('survey_data.csv')

In [3]:
# Print column names
print(df.columns)

Index(['household_size', 'dwelling_type', 'attitude',
       'weather_effects_on_electricity_consumption',
       'weather_conditions_most_affecting', 'awareness_about_energy_policies',
       'weather-based_energy_management_strategies_adoption', 'challenges',
       'monitoring', 'checking_consumption',
       'appliances_using_the_most_electricity',
       'self_reported_peak_consumption_times', 'provider_info_rating', 'age',
       'education', 'occupation', 'income', 'comments'],
      dtype='object')


In [4]:
# Select Feature and target definition
features = df[['weather_effects_on_electricity_consumption', 'awareness_about_energy_policies', 'income', 'attitude']]
target = df['weather-based_energy_management_strategies_adoption']

In [5]:
# Splitting data intotesting sets,training sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [6]:
# One-Hot Encoding
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [7]:
# columns in datasets 
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [8]:
# To fill NaN values
X_test = X_test.fillna(0)

In [9]:
# Train a Logistic Regression model 
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [10]:
# Train a Decision Tree model
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [11]:
# Train a Random Forest model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [12]:
# predictions with models
lr_predictions = lr.predict(X_test)
dt_predictions = dt.predict(X_test)
rf_predictions = rf.predict(X_test)

In [13]:
#  Logistic Regression model evaluation
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_precision = precision_score(y_test, lr_predictions, average='weighted', zero_division=1)
lr_recall = recall_score(y_test, lr_predictions, average='weighted', zero_division=1)
lr_f1 = f1_score(y_test, lr_predictions, average='weighted')

#  Random Forest model evaluation
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, average='weighted', zero_division=1)
rf_recall = recall_score(y_test, rf_predictions, average='weighted', zero_division=1)
rf_f1 = f1_score(y_test, rf_predictions, average='weighted')

#  Decision Tree model evaluation
dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_precision = precision_score(y_test, dt_predictions, average='weighted', zero_division=1)
dt_recall = recall_score(y_test, dt_predictions, average='weighted', zero_division=1)
dt_f1 = f1_score(y_test, dt_predictions, average='weighted')

# Print all the metrics
print("Logistic Regression: Accuracy = ", lr_accuracy, " Precision = ", lr_precision, " Recall = ", lr_recall, " F1 Score = ", lr_f1)
print("Decision Tree: Accuracy = ", dt_accuracy, " Precision = ", dt_precision, " Recall = ", dt_recall, " F1 Score = ", dt_f1)
print("Random Forest: Accuracy = ", rf_accuracy, " Precision = ", rf_precision, " Recall = ", rf_recall, " F1 Score = ", rf_f1)

Logistic Regression: Accuracy =  0.45  Precision =  0.5916301169590643  Recall =  0.45  F1 Score =  0.3977232142857143
Decision Tree: Accuracy =  0.475  Precision =  0.5269097222222222  Recall =  0.475  F1 Score =  0.4458374384236453
Random Forest: Accuracy =  0.425  Precision =  0.503968253968254  Recall =  0.425  F1 Score =  0.3925884811283793
