In [51]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


In [83]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [53]:
df = pd.read_csv("weatherHistory.csv")
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [54]:
df.shape

(96453, 12)

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Formatted Date            96453 non-null  object 
 1   Summary                   96453 non-null  object 
 2   Precip Type               95936 non-null  object 
 3   Temperature (C)           96453 non-null  float64
 4   Apparent Temperature (C)  96453 non-null  float64
 5   Humidity                  96453 non-null  float64
 6   Wind Speed (km/h)         96453 non-null  float64
 7   Wind Bearing (degrees)    96453 non-null  float64
 8   Visibility (km)           96453 non-null  float64
 9   Loud Cover                96453 non-null  float64
 10  Pressure (millibars)      96453 non-null  float64
 11  Daily Summary             96453 non-null  object 
dtypes: float64(8), object(4)
memory usage: 8.8+ MB


In [56]:
df.columns

Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',
       'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',
       'Pressure (millibars)', 'Daily Summary'],
      dtype='object')

In [57]:
df.dtypes

Formatted Date               object
Summary                      object
Precip Type                  object
Temperature (C)             float64
Apparent Temperature (C)    float64
Humidity                    float64
Wind Speed (km/h)           float64
Wind Bearing (degrees)      float64
Visibility (km)             float64
Loud Cover                  float64
Pressure (millibars)        float64
Daily Summary                object
dtype: object

In [58]:
df[['Formatted Date', 'Summary', 'Precip Type']].nunique()

Formatted Date    96429
Summary              27
Precip Type           2
dtype: int64

In [59]:
df['Precip Type'].value_counts()

Precip Type
rain    85224
snow    10712
Name: count, dtype: int64

In [60]:
df['Precip Type Rain'] = (df['Precip Type'] == 'rain').astype(int)
df.drop('Precip Type', axis=1, inplace=True)

In [61]:
df.head()

Unnamed: 0,Formatted Date,Summary,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,Precip Type Rain
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.,1
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.,1
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.,1
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.,1
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.,1


In [62]:
# Convert the 'Formatted Date' column from string (object) to a proper datetime format
df['Formatted Date'] = pd.to_datetime(df['Formatted Date'], utc=True)

In [63]:
df.head()

Unnamed: 0,Formatted Date,Summary,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,Precip Type Rain
0,2006-03-31 22:00:00+00:00,Partly Cloudy,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.,1
1,2006-03-31 23:00:00+00:00,Partly Cloudy,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.,1
2,2006-04-01 00:00:00+00:00,Mostly Cloudy,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.,1
3,2006-04-01 01:00:00+00:00,Partly Cloudy,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.,1
4,2006-04-01 02:00:00+00:00,Mostly Cloudy,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.,1


In [64]:
df = df.sort_values('Formatted Date').reset_index(drop=True)

In [65]:
df['Time_Index'] = range(len(df))

In [66]:
df.head()

Unnamed: 0,Formatted Date,Summary,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,Precip Type Rain,Time_Index
0,2005-12-31 23:00:00+00:00,Partly Cloudy,0.577778,-4.05,0.89,17.1143,140.0,9.982,0.0,1016.66,Mostly cloudy throughout the day.,1,0
1,2006-01-01 00:00:00+00:00,Mostly Cloudy,1.161111,-3.238889,0.85,16.6152,139.0,9.9015,0.0,1016.15,Mostly cloudy throughout the day.,1,1
2,2006-01-01 01:00:00+00:00,Mostly Cloudy,1.666667,-3.155556,0.82,20.2538,140.0,9.9015,0.0,1015.87,Mostly cloudy throughout the day.,1,2
3,2006-01-01 02:00:00+00:00,Overcast,1.711111,-2.194444,0.82,14.49,140.0,9.9015,0.0,1015.56,Mostly cloudy throughout the day.,1,3
4,2006-01-01 03:00:00+00:00,Mostly Cloudy,1.183333,-2.744444,0.86,13.9426,134.0,9.9015,0.0,1014.98,Mostly cloudy throughout the day.,1,4


In [67]:
date_map = df[['Time_Index', 'Formatted Date']]

In [68]:
date_map.head()

Unnamed: 0,Time_Index,Formatted Date
0,0,2005-12-31 23:00:00+00:00
1,1,2006-01-01 00:00:00+00:00
2,2,2006-01-01 01:00:00+00:00
3,3,2006-01-01 02:00:00+00:00
4,4,2006-01-01 03:00:00+00:00


In [69]:
df = df.drop(['Summary', 'Daily Summary'], axis=1)
df.head()

Unnamed: 0,Formatted Date,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Precip Type Rain,Time_Index
0,2005-12-31 23:00:00+00:00,0.577778,-4.05,0.89,17.1143,140.0,9.982,0.0,1016.66,1,0
1,2006-01-01 00:00:00+00:00,1.161111,-3.238889,0.85,16.6152,139.0,9.9015,0.0,1016.15,1,1
2,2006-01-01 01:00:00+00:00,1.666667,-3.155556,0.82,20.2538,140.0,9.9015,0.0,1015.87,1,2
3,2006-01-01 02:00:00+00:00,1.711111,-2.194444,0.82,14.49,140.0,9.9015,0.0,1015.56,1,3
4,2006-01-01 03:00:00+00:00,1.183333,-2.744444,0.86,13.9426,134.0,9.9015,0.0,1014.98,1,4


In [71]:
X = df.drop(['Temperature (C)', 'Formatted Date'], axis=1)
y = df['Temperature (C)']
X.head()

Unnamed: 0,Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Precip Type Rain,Time_Index
0,-4.05,0.89,17.1143,140.0,9.982,0.0,1016.66,1,0
1,-3.238889,0.85,16.6152,139.0,9.9015,0.0,1016.15,1,1
2,-3.155556,0.82,20.2538,140.0,9.9015,0.0,1015.87,1,2
3,-2.194444,0.82,14.49,140.0,9.9015,0.0,1015.56,1,3
4,-2.744444,0.86,13.9426,134.0,9.9015,0.0,1014.98,1,4


In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)

In [74]:
X_train.head()

Unnamed: 0,Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Precip Type Rain,Time_Index
52724,3.322222,0.71,25.76,200.0,11.27,0.0,993.6,1,52724
53227,-4.661111,0.92,10.787,339.0,3.6386,0.0,1021.8,0,53227
79958,3.133333,0.6,7.9534,91.0,7.4865,0.0,1025.29,1,79958
30328,26.005556,0.28,13.8943,319.0,10.3684,0.0,1022.3,1,30328
27062,-1.155556,0.82,16.1,80.0,11.27,0.0,1016.3,1,27062


In [79]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(77162, 9) (19291, 9) (77162,) (19291,)


In [78]:
X_train.columns


Index(['Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',
       'Pressure (millibars)', 'Precip Type Rain', 'Time_Index'],
      dtype='object')

In [80]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [81]:
X_train_scaled[:5]

array([[-0.70465894, -0.12725125,  2.16296756,  0.11238949,  0.21825262,
         0.        , -0.08322096,  0.36213568,  0.16264032],
       [-1.45190805,  0.94763251, -0.00309017,  1.40693781, -1.59918316,
         0.        ,  0.15848798, -2.76139592,  0.18073546],
       [-0.72233916, -0.6902856 , -0.41301077, -0.90275991, -0.68279676,
         0.        ,  0.1884016 ,  0.36213568,  1.1423682 ],
       [ 1.41852695, -2.32820371,  0.44642504,  1.22067186,  0.00353447,
         0.        ,  0.1627736 ,  0.36213568, -0.64304322],
       [-1.12378405,  0.4357831 ,  0.76551096, -1.00520618,  0.21825262,
         0.        ,  0.11134616,  0.36213568, -0.76053574]])

In [85]:
models = {
    'Linear Regression': LinearRegression(),
    'Multiple Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.01),
    'ElasticNet Regression': ElasticNet(alpha=0.01, l1_ratio=0.5)
}

In [86]:
results = {}
# Normal Linear Regression
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2, 'MAE': mae}

In [87]:
# polynomial regression with degree 2
poly2 = PolynomialFeatures(degree=2)
X_train_poly2 = poly2.fit_transform(X_train_scaled)
X_test_poly2 = poly2.transform(X_test_scaled)
poly2_model = LinearRegression()
poly2_model.fit(X_train_poly2, y_train)
y_pred_poly2 = poly2_model.predict(X_test_poly2)
mse_poly2 = mean_squared_error(y_test, y_pred_poly2)
r2_poly2 = r2_score(y_test, y_pred_poly2)
mae_poly2 = mean_absolute_error(y_test, y_pred_poly2)
results['Polynomial Regression (degree 2)'] = {'MSE': mse_poly2, 'R2': r2_poly2, 'MAE': mae_poly2}

In [88]:
# Polynomial regression with degree 3
poly3 = PolynomialFeatures(degree=3)
X_train_poly3 = poly3.fit_transform(X_train_scaled)
X_test_poly3 = poly3.transform(X_test_scaled)
poly3_model = LinearRegression()
poly3_model.fit(X_train_poly3, y_train)
y_pred_poly3 = poly3_model.predict(X_test_poly3)
mse_poly3 = mean_squared_error(y_test, y_pred_poly3)
r2_poly3 = r2_score(y_test, y_pred_poly3)
mae_poly3 = mean_absolute_error(y_test, y_pred_poly3)
results['Polynomial Regression (degree 3)'] = {'MSE': mse_poly3, 'R2': r2_poly3, 'MAE': mae_poly3}

In [89]:
results

{'Linear Regression': {'MSE': 0.8963406116039221,
  'R2': 0.9902606500142165,
  'MAE': 0.7408607979817191},
 'Multiple Linear Regression': {'MSE': 0.8963406116039221,
  'R2': 0.9902606500142165,
  'MAE': 0.7408607979817191},
 'Ridge Regression': {'MSE': 0.8963384856815465,
  'R2': 0.990260673113808,
  'MAE': 0.7408552585664077},
 'Lasso Regression': {'MSE': 0.8970664935271283,
  'R2': 0.9902527628137405,
  'MAE': 0.7406439177419358},
 'ElasticNet Regression': {'MSE': 0.9009303892585372,
  'R2': 0.9902107789603375,
  'MAE': 0.7407967461371588},
 'Polynomial Regression (degree 2)': {'MSE': 0.280660957033336,
  'R2': 0.9969504279372087,
  'MAE': 0.39316332709830953},
 'Polynomial Regression (degree 3)': {'MSE': 0.18917697162478403,
  'R2': 0.9979444636201327,
  'MAE': 0.326986114428073}}

In [90]:
pd.DataFrame(results).T

Unnamed: 0,MSE,R2,MAE
Linear Regression,0.896341,0.990261,0.740861
Multiple Linear Regression,0.896341,0.990261,0.740861
Ridge Regression,0.896338,0.990261,0.740855
Lasso Regression,0.897066,0.990253,0.740644
ElasticNet Regression,0.90093,0.990211,0.740797
Polynomial Regression (degree 2),0.280661,0.99695,0.393163
Polynomial Regression (degree 3),0.189177,0.997944,0.326986
