In [1]:
# Linear regression: demand prediction model
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned, combined demand/weather dataset (Latin-1 encoded) and
# preview its first rows.
csv_path = 'combined_detail_cleaned.csv'
Combined = pd.read_csv(csv_path, encoding='ISO-8859-1')
Combined.head()

Unnamed: 0,max_total_demand,max_price_category,temperature_min,temperature_max,rainfall,evaporation,sunshine,max_wind_direction,max_wind_speed,max_wind_time,...,cloud_9am,wind_direction_9am,wind_speed_9am,pressure_9am,temperature_3pm,humidity_3pm,cloud_3pm,wind_direction_3pm,wind_speed_3pm,pressure_3pm
0,5019.64,LOW,15.6,29.9,0.0,2.8,9.3,NNE,31,1:14,...,6,N,2,1018.8,28.1,43,5,E,13,1015.3
1,4964.35,LOW,18.4,29.0,0.0,9.4,1.3,NNW,30,8:22,...,7,NNW,17,1013.3,28.7,38,7,SW,4,1008.5
2,4503.31,LOW,17.0,26.2,12.6,4.8,7.1,WSW,33,5:55,...,8,WSW,4,1007.7,23.5,59,4,SSW,2,1005.2
3,4764.18,LOW,16.0,18.6,2.6,3.8,0.0,SSE,41,4:03,...,8,SSE,11,1010.0,18.2,82,8,SSW,17,1011.0
4,4800.64,LOW,15.9,19.1,11.2,1.0,0.0,SSE,35,11:02,...,8,SSE,13,1012.5,18.2,82,8,SSE,19,1013.3


In [3]:
# Collect every column label of the dataset as a plain Python list.
FEATURES = Combined.columns.tolist()

In [4]:
# Correlate max_total_demand with the other numeric columns and keep those whose
# absolute Pearson correlation with it exceeds 0.3.
# The dataset also holds text columns (price category, wind directions, times);
# they are excluded explicitly because pandas >= 2.0 raises on non-numeric data
# in DataFrame.corr instead of silently dropping it.
correlation_table = Combined[FEATURES].select_dtypes(include='number').corr(method='pearson')
demand_correlation = correlation_table['max_total_demand']
# The target trivially correlates with itself (1.0), so it is filtered out here.
Possible_features = [
    name
    for name in demand_correlation.index[demand_correlation.abs() > 0.3]
    if name != 'max_total_demand'
]
Possible_features

['temperature_min', 'temperature_9am', 'temperature_3pm']

In [5]:
# Check how strongly the candidate features correlate with one another
# (pairwise Pearson correlation).
candidate_frame = Combined[Possible_features]
Independent_correlation_table = candidate_frame.corr(method='pearson')
Independent_correlation_table

Unnamed: 0,temperature_min,temperature_9am,temperature_3pm
temperature_min,1.0,0.916641,0.66627
temperature_9am,0.916641,1.0,0.765603
temperature_3pm,0.66627,0.765603,1.0


In [6]:
# Add the output label back to the candidate list so the next correlation table
# shows features and target together.
# Guarded so that re-running this cell does not append the label a second time.
if 'max_total_demand' not in Possible_features:
    Possible_features.append('max_total_demand')
Possible_features

['temperature_min', 'temperature_9am', 'temperature_3pm', 'max_total_demand']

In [11]:
# Analyse the correlation between the output and the filtered features.
demand_and_candidates = Combined.loc[:, Possible_features]
demand_and_candidates.corr(method='pearson')

Unnamed: 0,temperature_min,temperature_9am,temperature_3pm,max_total_demand
temperature_min,1.0,0.916641,0.66627,-0.49924
temperature_9am,0.916641,1.0,0.765603,-0.401323
temperature_3pm,0.66627,0.765603,1.0,-0.325252
max_total_demand,-0.49924,-0.401323,-0.325252,1.0


In [12]:
# The candidate features are strongly correlated with each other, so keep only
# the one(s) whose absolute correlation with the output exceeds 0.45.
strong_mask = correlation_table['max_total_demand'].abs() > 0.45
Final_features = [label for label in correlation_table.index[strong_mask]]
Final_features.remove('max_total_demand')

In [13]:
# Split the dataset into the model inputs (X) and the regression target (y).
y = Combined.loc[:, 'max_total_demand']
X = Combined.loc[:, Final_features]

In [27]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing
from sklearn.model_selection import KFold

# Estimate average performance with shuffled 10-fold cross-validation: fit a
# fresh linear regression on each training split and record R^2 on the
# held-out split.
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)
R2_score = []
for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, so X and y are both sliced with .iloc.
    # Plain y[...] is a label lookup and only matches on a default RangeIndex.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    y_pred = lm.predict(X_test)
    R2_score.append(r2_score(y_test, y_pred))

# Per-fold scores, then their mean.
print(R2_score)
print(sum(R2_score) / len(R2_score))

[-0.004151406159014215, 0.07986532903011845, 0.3543378605150912, 0.3044004894717397, 0.32230799365732965, 0.5261138598590477, -0.17843716479663851, 0.4482458440032411, 0.15479853624682471, 0.5349552562367903]
0.25424365980645297


In [None]:
# Show the dtype of each selected feature column.
Combined.dtypes.loc[Final_features]

In [None]:
# Derive a single temperature feature: the mean of the four daily temperature
# readings (min, max, 9am, 3pm), rounded to 2 decimals, stored as 'AVGt_tem'.
temperature_total = (
    Combined['temperature_min']
    + Combined['temperature_max']
    + Combined['temperature_9am']
    + Combined['temperature_3pm']
)
Combined['AVGt_tem'] = (temperature_total / 4).round(2)
# From here on, the derived average is the only candidate feature.
Possible_features = ['AVGt_tem']

In [None]:
# Column list for the correlation check: the candidate feature(s) plus the
# output label.
# NOTE(review): Possible_Select_FEATURES is not defined anywhere in the visible
# notebook, so the original `.append` raised NameError on a fresh kernel. It is
# built here from Possible_features — confirm that is the intended source list.
# Building a new list (rather than appending) also keeps the cell re-runnable.
Possible_Select_FEATURES = [
    name for name in Possible_features if name != 'max_total_demand'
] + ['max_total_demand']
Possible_Select_FEATURES

In [None]:
# Pearson correlation between the selected feature(s) and the output label.
selected_frame = Combined.loc[:, Possible_Select_FEATURES]
selected_frame.corr(method='pearson')

In [None]:
# Hand-picked weather candidates; the output label comes last so that a
# correlation table over this list includes the target as well.
Possible_features = [
    'temperature_max',
    'sunshine',
    'humidity_9am',
    'cloud_9am',
    'wind_speed_9am',
    'max_total_demand',
]

In [None]:
# Pearson correlation among the hand-picked columns.
Combined.loc[:, Possible_features].corr(method='pearson')

In [None]:
# Split into model inputs (X) and regression target (y).
# Possible_features may still carry 'max_total_demand' (it is kept in the list
# for the correlation table); it must not be part of X, otherwise the target
# leaks into the inputs and the model scores a meaningless near-perfect R^2.
X = Combined[[name for name in Possible_features if name != 'max_total_demand']]
y = Combined['max_total_demand']

# With the k-fold cross-validation method

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing
from sklearn.model_selection import KFold

# Shuffled 10-fold cross-validation: fit a fresh linear regression on each
# training split and record R^2 on the held-out split.
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)
R2_score = []
for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, so X and y are both sliced with .iloc.
    # Plain y[...] is a label lookup and only matches on a default RangeIndex.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    y_pred = lm.predict(X_test)
    R2_score.append(r2_score(y_test, y_pred))

# Per-fold scores, then their mean.
print(R2_score)
print(sum(R2_score) / len(R2_score))


In [None]:
# Display the current hold-out targets. When the cells are run in order this is
# the y_test left over from the last cross-validation fold above.
y_test

In [None]:
# Step 0: Preprocess
# Randomly hold out 30% of the instances for testing (test_size=0.3) and train
# on the remaining 70%; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

# Without k-fold cross-validation (single train/test split)

In [None]:
# Step 1: Instantiate
# Create a fresh, unfitted scikit-learn LinearRegression estimator.
lm = linear_model.LinearRegression()

In [None]:
# Step 2: Fit
# Fit the model on the training split only; the test split stays unseen.
lm.fit(X_train, y_train)

In [None]:
# Show the fitted parameters: the per-feature coefficients, then the intercept.
print(lm.coef_,lm.intercept_)

In [None]:
# Score the fitted model on the hold-out split (for a regressor, .score is R^2).
r2_test = lm.score(X_test, y_test)
print(r2_test)

In [None]:
# Step 3: Predict
# Predict demand for the hold-out inputs; compared against y_test below.
y_pred = lm.predict(X_test)

In [None]:
# Mean squared error on the hold-out split. Arguments follow scikit-learn's
# (y_true, y_pred) order, matching the r2_score calls in this notebook.
mean_squared_error(y_test, y_pred)

In [None]:
# R^2 on the hold-out split, computed from the explicit predictions.
r2_score(y_test, y_pred)

In [None]:
# Step 4: Evaluate
# Report both hold-out metrics; each call uses scikit-learn's (y_true, y_pred)
# argument order.
print(f'mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'r2 score: {r2_score(y_test, y_pred)}')

In [None]:
# Calculate the Pearson correlation matrix over the numeric feature columns.
# FEATURES also contains text columns (price category, wind directions, times),
# so casting every column to float raises ValueError; only the numeric columns
# are cast and correlated.
numeric_features = Combined[FEATURES].select_dtypes(include='number').columns
Combined[numeric_features] = Combined[numeric_features].astype(float)
Combined[numeric_features].corr(method='pearson')