In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
import scikitplot as skplt

In [None]:
# Load the raw measurements; the file is semicolon-separated.
# low_memory=False makes pandas read the file in one pass instead of chunk by chunk.
DATA_FILE = "household_power_consumption.txt"
df = pd.read_csv(DATA_FILE, sep=";", low_memory=False)

In [None]:
# Column names, dtypes, non-null counts and memory usage of the freshly loaded frame
df.info()

In [None]:
# Preview the first 10 rows to sanity-check the parsing
df.head(10)

In [None]:
# Replace the headers with shorter names.
# The assignment is positional, so it relies on the file's column order.
new_column_names = (
    "Date", "Time",
    "Active_power", "Reactive_power", "Voltage", "Intensity",
    "Kitchen", "Laundry", "Heater",
)
df.columns = list(new_column_names)

In [None]:
# True if at least one row is an exact duplicate of an earlier row
df.duplicated().any()

In [None]:
# Conversions
# 1) Every measurement column becomes numeric; entries that cannot be parsed turn into NaN.
columns_float = df.columns.drop(["Date","Time"])
numeric_part = df[columns_float].apply(pd.to_numeric, errors="coerce")
df[columns_float] = numeric_part

# 2) Date and Time strings are joined without a separator, which is why the
#    format string has no gap between %Y and %H.
timestamp_text = df["Date"] + df["Time"]
df["Date_time"] = pd.to_datetime(timestamp_text, format="%d/%m/%Y%H:%M:%S")

# 3) The two source columns are no longer needed once Date_time exists.
df = df.drop(columns=["Date", "Time"])

In [None]:
# Keep only rows inside [2007-01-01, 2010-11-01); the months outside this window
# are treated as incomplete and dropped.
period_start = pd.to_datetime("2007-01-01")
period_end = pd.to_datetime("2010-11-01")
inside_period = (df["Date_time"] >= period_start) & (df["Date_time"] < period_end)
df = df[inside_period]

In [None]:
# Renumber rows from 0 after filtering; drop=True discards the old index instead of keeping it as a column
df = df.reset_index(drop=True)

In [None]:
# Finding NaNs

def find_nans(dataframe, na_name=False):
    """Print a summary of missing values per column.

    For every column containing at least one null, the printed table shows the
    number of nulls (``n_miss``) and their share of all rows in percent
    (``ratio``, rounded to 2 decimals), both sorted in descending order.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    na_name : bool, default False
        When truthy, the list of column names that contain nulls is returned.

    Returns
    -------
    list or None
        Names of the columns with nulls if ``na_name`` is truthy, otherwise None.
    """
    column_has_null = dataframe.isnull().any()
    na_columns = [col for col, flagged in zip(dataframe.columns, column_has_null) if flagged]

    # Count once and reuse for both the absolute and the relative figures.
    missing_counts = dataframe[na_columns].isnull().sum()
    n_miss = missing_counts.sort_values(ascending=False)
    ratio = (missing_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=["n_miss", "ratio"])
    print(missing_df, end="\n")

    return na_columns if na_name else None

# Print the missing-value table for the cleaned frame (the return value is None here and is not used)
find_nans(df)

In [None]:
# Visualise where the missing values sit in the frame (missingno matrix plot).
# The trailing semicolon keeps the cell from echoing a repr.
nan_matrix = msno.matrix(df);

In [None]:
# One histogram per column, drawn on a 15x15 inch figure.
# The trailing semicolon keeps the cell from echoing the array of axes.
hist_df = df.hist(figsize=(15, 15));

In [None]:
# Fill NaNs in each measurement column with that column's most frequent value.
imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

measurement_columns = ["Active_power", "Reactive_power", "Voltage", "Intensity",
                       "Kitchen", "Laundry", "Heater"]

# The imputer is re-fitted for every column, so each one gets its own mode.
for column in measurement_columns:
    df[column] = imputer.fit_transform(df[[column]])

In [None]:
# Scale the three sub-metering columns by 1/1000 (base unit -> kilo unit).
# NOTE(review): the original comment said watts -> kilowatts; confirm the source unit of these columns.
for appliance in ("Kitchen", "Laundry", "Heater"):
    df[appliance] = df[appliance] / 1000

In [None]:
# Summary statistics with one row per column (transposed); each statistic column is rounded to 2 decimals
df.describe().T.apply(lambda x: round(x,2))

In [None]:
# Aggregate the readings into one row per calendar day ("D").
# Power and sub-metering columns are summed; voltage and intensity are averaged.
daily_aggregations = {
    "Active_power": "sum",
    "Reactive_power": "sum",
    "Voltage": "mean",
    "Intensity": "mean",
    "Kitchen": "sum",
    "Laundry": "sum",
    "Heater": "sum",
}
df = df.resample("D", on="Date_time").agg(daily_aggregations).reset_index()

In [None]:
# Summing up energy consumption of the three sub-metered circuits
sum_of_energy = df["Kitchen"].add(df["Laundry"]).add(df["Heater"])
df["Sum_of_energy"] = sum_of_energy

In [None]:
# Derive calendar features from the timestamp column.
# NOTE(review): if df holds one row per day at this point (timestamps at midnight),
# Hour is 0 for every row and carries no information; confirm it is still wanted.
df = df.assign(
    Hour=df["Date_time"].dt.hour,
    Day=df["Date_time"].dt.day,
    Day_of_year=df["Date_time"].dt.dayofyear,
    Weekday=df["Date_time"].dt.weekday,
    Month=df["Date_time"].dt.month,
    Year=df["Date_time"].dt.year,
)

In [None]:
# Weekend flag: 0 for Weekday values below 5, 1 otherwise
df["Weekend"] = np.where(df["Weekday"] < 5, 0, 1)

In [None]:
# Season code per month, listed January..December:
# Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3, Dec-Feb -> 4
seasons = [4, 4, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4]
months_to_seasons = {month: season for month, season in enumerate(seasons, start=1)}

df["Season"] = df["Month"].map(months_to_seasons)

In [None]:
# Adding energy costs
# Average price per (year, month). Each tuple lists the months in calendar order
# starting with January; 2010 only has values through October.
# NOTE(review): the currency/unit of these prices is not stated in this file.
_AVG_COST_BY_MONTH = {
    (year, month): price
    for year, monthly_prices in {
        2007: (10.06, 9.89, 10.27, 10.63, 10.77, 11.09,
               11.07, 11.07, 10.96, 10.82, 10.70, 10.33),
        2008: (10.14, 10.16, 10.45, 10.93, 11.40, 11.77,
               12.07, 12.09, 11.92, 11.81, 11.42, 10.86),
        2009: (10.98, 11.18, 11.28, 11.50, 11.78, 11.81,
               11.85, 11.94, 11.96, 11.65, 11.26, 10.90),
        2010: (10.89, 11.11, 11.11, 11.71, 11.91, 11.91,
               12.04, 12.03, 11.95, 11.86),
    }.items()
    for month, price in enumerate(monthly_prices, start=1)
}


def add_avg_cost(row):
    """Return the average energy price for the row's year and month.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``"Year"`` and ``"Month"`` (for example a
        DataFrame row or a dict).

    Returns
    -------
    float or None
        The price from the lookup table, or None when the (year, month)
        pair is not in the table.
    """
    return _AVG_COST_BY_MONTH.get((row["Year"], row["Month"]))
        
# Look up the average price row by row (axis=1 passes each row to add_avg_cost)
df["Avg_cost"] = df.apply(add_avg_cost, axis=1)

In [None]:
# Energy cost per row: summed energy multiplied by that row's average price
df["Sum_cost"] = df["Sum_of_energy"].mul(df["Avg_cost"])

In [None]:
# Summary statistics after feature engineering, transposed so that each column is one row
df.describe().T

In [None]:
# Correlation matrix (pairwise correlation, rounded to 3 decimals).
# Only numeric columns are used: df still contains the datetime column Date_time,
# and pandas >= 2.0 raises on it instead of silently skipping non-numeric columns.
correlation_matrix = df.select_dtypes(include="number").corr().round(3)
correlation_matrix

In [None]:
# Heatmap of the correlation matrix.
# sns.set changes plotting defaults globally (here the default figure size),
# so it also affects figures drawn by later cells.
sns.set(rc={"figure.figsize":(15,10)})
# Diverging palette with 10 discrete colours between hues 240 and 10
color_map = sns.diverging_palette(240, 10, n=10)
# annot=True writes the coefficient into every cell; square=True keeps cells square
heatmap = sns.heatmap(correlation_matrix, cmap=color_map, annot=True, square=True);

In [None]:
# Time-indexed copy of the data for plotting: Date_time becomes the index and
# the calendar helper columns plus Avg_cost are removed.
helper_columns = ["Hour", "Day", "Day_of_year", "Weekday", "Month",
                  "Year", "Weekend", "Season", "Avg_cost"]
df_date = df.set_index("Date_time").drop(columns=helper_columns)

In [None]:
# Plotting all columns: one stacked panel per column of df_date.
# The whole grid is created in a single plt.subplots call. Creating one
# full-figure axes first and then adding panels with plt.subplot leaves that
# first axes behind the panels on recent matplotlib versions.
fig, axes = plt.subplots(nrows=len(df_date.columns), ncols=1,
                         figsize=(20, 24), squeeze=False)
for ax, name in zip(axes[:, 0], df_date.columns):
    ax.plot(df_date[name])
    # Column name goes inside the panel, bottom-left
    ax.set_title(name, y=0, loc="left")
    # Scales differ per column; hide y ticks to reduce clutter
    ax.set_yticks([])
fig.tight_layout()
plt.show()

In [None]:
# Daily totals of Sum_of_energy as a line plot
daily_usage = df_date["Sum_of_energy"].resample("D").sum()
daily_usage.plot(figsize=(18, 5), title="Dzienne zużycie prądu:")
plt.tight_layout()
plt.show()

In [None]:
# Monthly totals of Sum_of_energy as a bar chart ("M" = month-end bins)
monthly_usage = df_date["Sum_of_energy"].resample("M").sum()
monthly_usage.plot(kind="bar", figsize=(12, 6), title="Miesięczne zużycie prądu:")
plt.tight_layout()
plt.show()

In [None]:
# Quarterly totals of Sum_of_energy as a bar chart ("Q" = quarter-end bins)
quarterly_usage = df_date["Sum_of_energy"].resample("Q").sum()
quarterly_usage.plot(kind="bar", figsize=(10, 5), title="Kwartalne zużycie prądu:")
plt.tight_layout()
plt.show()

In [None]:
# Plotting usage year by year: one panel per calendar year.
years = ["2007", "2008", "2009", "2010"]
# The grid is sized by the number of years (not the number of df_date columns)
# and created in one call, so no extra full-figure axes is left behind.
fig, axes = plt.subplots(nrows=len(years), ncols=1, figsize=(20, 20), squeeze=False)

for ax, year in zip(axes[:, 0], years):
    # .loc with a year string selects all rows of that year on a DatetimeIndex;
    # plain df_date["2007"] is a column lookup on pandas >= 2.0 and raises KeyError.
    sum_of_energy_data = df_date.loc[year, "Sum_of_energy"]
    ax.plot(sum_of_energy_data)
    # Label each panel with its year so the figure can be read on its own
    ax.set_title(year, y=0, loc="left")
plt.show()

In [None]:
# Per-day max / mean / min of Sum_of_energy, one subplot each.
# NOTE(review): if df_date already holds one row per day, the three series are
# identical; confirm the intended resampling frequency.
daily_energy = df_date["Sum_of_energy"].resample("D")
hours_plots = daily_energy.agg(["max", "mean", "min"])
hours_plots.plot(subplots=True, figsize=(20, 10), title="Zużycie prądu rok do roku:")
plt.show()

In [None]:
# Grouped bar chart: Sum_of_energy per month, one bar colour per year
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(data=df, x="Month", y="Sum_of_energy", hue="Year", ax=ax)
ax.set_title("Zużycie prądu na przestrzeni lat oraz miesięcy:", y=1.015)
ax.legend(loc="upper right")
plt.show()

In [None]:
# Validation setup
# 5-fold cross-validation with shuffling and a fixed seed for repeatable folds.
# NOTE(review): shuffled folds mix earlier and later dates; confirm this is
# acceptable for time-ordered data.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hold-out split by date: rows before the cutoff train, rows from the cutoff on test.
split_date = "2010-08-01"
train_mask = df["Date_time"] < split_date
test_mask = df["Date_time"] >= split_date

df_train = df[train_mask]
df_test = df[test_mask]

In [None]:
# Running models
def run_model(model, X, y, cross_val):
    """Cross-validate ``model`` and print its mean RMSE and mean R^2.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``cross_val_score``; its class name is used as the label.
    X, y : array-like
        Features and target passed unchanged to ``cross_val_score``.
    cross_val : int or CV splitter
        Passed as ``cv`` to ``cross_val_score``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    model_name = type(model).__name__

    # "neg_root_mean_squared_error" already yields (negated) RMSE values, so only
    # the sign is flipped here. Taking a square root on top of that would report
    # sqrt(RMSE) instead of RMSE.
    rmse = -cross_val_score(model, X, y, cv=cross_val, scoring="neg_root_mean_squared_error")
    r2 = cross_val_score(model, X, y, cv=cross_val, scoring="r2")
    print("{} rmse: {:.4f}, r2: {:.1f}%".format(model_name, np.mean(rmse), 100*np.mean(r2)))

In [None]:
# Plotting feature importances
def plot_feature_importances(model, feat_list, ax):
    """Draw the feature-importance bar chart of a fitted model on ``ax``.

    Parameters
    ----------
    model : fitted estimator
        Passed to ``skplt.estimators.plot_feature_importances``; its class name
        is used in the title.
    feat_list : list of str
        Feature names, in the column order the model was trained on.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    """
    model_name = type(model).__name__
    # Use the names that were passed in instead of reading them from the global
    # df, so the helper does not depend on notebook state.
    drawn_ax = skplt.estimators.plot_feature_importances(
        model, feature_names=list(feat_list),
        title=f"{model_name} feature importances", ax=ax)
    # Rotate labels on the axes that was actually drawn on, rather than on
    # whatever pyplot considers the current axes.
    target_ax = drawn_ax if drawn_ax is not None else ax
    target_ax.tick_params(axis="x", labelrotation=90)

In [None]:
# Models: a mean-predicting baseline plus three tree-based regressors that
# share the same depth limit and seed.
shared_tree_params = {"max_depth": 3, "random_state": 42}

Dummy_model = DummyRegressor(strategy="mean")
DT_model = DecisionTreeRegressor(**shared_tree_params)
RF_model = RandomForestRegressor(**shared_tree_params)
XGB_model = XGBRegressor(objective="reg:squarederror", **shared_tree_params)

# Order matters to later cells: the baseline is expected at index 0.
models = [Dummy_model, DT_model, RF_model, XGB_model]

In [None]:
# Train / test split
def get_quant_features(df, black_list):
    """Return the names of numeric and boolean columns not in ``black_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    black_list : collection of str
        Column names to exclude (for example the target).

    Returns
    -------
    list
        Matching column names in their original column order.
    """
    # The dtype strings work on every NumPy version; the np.bool alias was
    # removed in NumPy 1.24 and raises AttributeError there.
    feats = df.select_dtypes(include=["number", "bool"]).columns
    return [x for x in feats if x not in black_list]

# Columns that must not be used as features: the target and the energy sum it is computed from.
# NOTE(review): Kitchen, Laundry, Heater and Avg_cost stay in the feature set
# although Sum_cost is derived from them; confirm this is intended.
black_list = ["Sum_cost", "Sum_of_energy"]

feats = get_quant_features(df, black_list)
target_column = "Sum_cost"

X_train, y_train = df_train[feats].values, df_train[target_column].values
X_test, y_test = df_test[feats].values, df_test[target_column].values

In [None]:
# Cross-validate every candidate model on the training data; run_model prints one line of scores per model
for model in models:
    run_model(model, X_train, y_train, kf)

In [None]:
# Fit every model on the full training set (the estimators are fitted in place)
for model in models:
    model.fit(X_train, y_train)

In [None]:
# Feature importances: one panel per model, side by side.
# The first entry of models (the DummyRegressor baseline) is skipped.
importance_models = models[1:]
fig = plt.figure(figsize=(15, 6))
for i, model in enumerate(importance_models):
    # Column count follows the number of models instead of a hard-coded 3,
    # so the grid stays valid when the model list changes.
    ax = fig.add_subplot(1, len(importance_models), i + 1)
    plot_feature_importances(model, feats, ax=ax)
fig.tight_layout()
plt.show()