In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import useful_functions.habits_func as hf

### Explanation goes here

In [2]:
df = pd.read_csv("data/Yorkshire Water consumer habits.csv")

with open("data/habits_columns.csv", "r") as f:
    text_from_file = f.read()
cols_to_look_at = text_from_file.split(",")

df = df.loc[:, cols_to_look_at]
df.drop("County", axis=1, inplace=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13748 entries, 0 to 13747
Data columns (total 60 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Number-Of-Showers                     13748 non-null  int64  
 1   Number-Of-Toilets                     13748 non-null  int64  
 2   Number-Of-Basins                      13748 non-null  int64  
 3   Number-Of-Bathtubs                    13748 non-null  int64  
 4   Number-of-Kitchen-Utility-Taps        13748 non-null  int64  
 5   Water-Garden                          13314 non-null  object 
 6   Use-Pressure-Washer                   13313 non-null  object 
 7   Number-Of-People                      13748 non-null  int64  
 8   Home-Type                             13317 non-null  object 
 9   Water-Heated-With                     13748 non-null  object 
 10  Shower-Type                           12671 non-null  object 
 11  Showers-Per-Wee

In [4]:
mod_path = Path.cwd() / "data" / "habits_column_modifiers.txt"
col_mod = hf.read_column_modifiers(mod_path)

In [5]:
df = hf.add_classifier_columns(df, col_mod["classification"])

In [6]:
df = hf.modify_yes_no_columns(df, col_mod["yes-no"])

In [7]:
df = hf.modify_values_using_dict(df, col_mod["ordering"])

In [8]:
df = hf.modify_values_using_dict(df, col_mod["quantitative"])

In [9]:
df = hf.modify_leak_rate_columns(df)

In [10]:
# df.info()

In [11]:
col_counts = df.apply(lambda c: c.count(), axis=0)

min_count = 10_000
cols_to_drop = col_counts[col_counts < min_count].index

df.drop(cols_to_drop, axis=1, inplace=True)

In [12]:
df.dropna(inplace=True)

In [13]:
X = df.loc[:, :"Pressure-Washer-Frequency-Per-Week"]
y = df["Household-Water-Use-Litres-Yearly"]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729)
linear_model = LinearRegression()

In [15]:
best_cols = hf.fit_models_based_on_score(linear_model, X_train, X_test, y_train, y_test, up_to=70)

In [16]:
len(best_cols)  # 26

26

In [17]:
print(f'{"column name".ljust(35)} Cumulative score')
for i, col in enumerate(best_cols):
    linear_model.fit(X_train.loc[:, best_cols[:i+1]], y_train)
    print(f"{col.ljust(46)} {linear_model.score(X_test.loc[:, best_cols[:i+1]], y_test):.3f}")

column name                         Cumulative score
Showers-Per-Week                               0.449
Shower-Duration-Minutes                        0.619
Number-Of-People                               0.733
Washing-Machine-Per-Week                       0.783
Shower-Type_electric                           0.813
Bath-Frequency-Per-Week                        0.832
Basin-Shave-Number-Of-People                   0.839
Shower-Leak-Rate                               0.843
Shower-Type_gravity                            0.846
Toilet-Type_2-button-flush                     0.849
Kitchen-Tap-Flow-Rate                          0.852
Pressure-Washer-Frequency-Per-Week             0.855
Toilet-Leak-Rate                               0.858
Basin-Tap-Running-Brushing-Teeth               0.860
Kitchen-Tap-Leak-Rate                          0.860
Number-Of-Bathtubs                             0.861
Use-Pressure-Washer                            0.861
Garden-Water-Method_hosepipe-only             

In [18]:
linear_model.fit(X_train.loc[:, best_cols], y_train)
linear_model.score(X.loc[:, best_cols], y)  # 0.8778

0.8777587060826759