<a href="https://colab.research.google.com/github/kai054631/Weather_Data/blob/main/Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.model_selection import KFold, cross_val_score

from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [42]:
df = read_csv("https://raw.githubusercontent.com/Des282/Dataset/refs/heads/main/seattle-weather.csv")#store and read
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [43]:
df.isna().sum()

Unnamed: 0,0
date,0
precipitation,0
temp_max,0
temp_min,0
wind,0
weather,0


In [44]:
df.shape

(1461, 6)

In [45]:
df = df.drop(columns = ["date"])
X = df.drop(columns=["weather"])
y = df["weather"]

In [46]:
X.shape

(1461, 4)

In [47]:
#Use spot-checking to quickly evaluate the performance of 8 machine learning algorithms
models = {}
models['lgr'] = LR()
models['knn'] = KNN()
models['gnb'] = GNB()
models['svc'] = SVC()
models['dtc'] = DTC()
models['rfc'] = RFC()
models['gbc'] = GBC()
models['mlp'] = MLP()

kf = KFold(n_splits=3, shuffle=True, random_state=42) #set k=3 fold cross validation
kf1 = KFold(n_splits=5, shuffle=True, random_state=42) #set k=3 fold cross validation

for n in models:
    scores = cross_val_score(models[n], X, y, cv=kf, n_jobs=-1) #get the accuracy
    print(f"{n}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy

lgr: 84.326%, 0.968%
knn: 75.838%, 1.078%
gnb: 84.326%, 1.355%
svc: 77.481%, 1.078%
dtc: 75.702%, 1.445%
rfc: 82.957%, 0.731%
gbc: 82.409%, 0.953%
mlp: 84.326%, 0.774%


In [48]:
Robust_scl = RobustScaler()#Feature Scaling with RobustScaler
Xs1 = Robust_scl.fit_transform(X)

In [49]:
MinMax_scl = MinMaxScaler()#Feature Scaling with MinMaxScaler
Xs2 = MinMax_scl.fit_transform(X)

In [50]:
Standard_scl = StandardScaler()#Feature Scaling with StandardScaler
Xs3 = Standard_scl.fit_transform(X)

In [51]:
for n in models:
    scores = cross_val_score(models[n], Xs1, y, cv=kf, n_jobs=-1) #get the accuracy
    print(f"ROBUST SCALING {n}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy
print(" ")
for i in models:
    scores = cross_val_score(models[i], Xs2, y, cv=kf, n_jobs=-1) #get the accuracy
    print(f"MINMAX SCALING {i}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy
print(" ")
for j in models:
    scores = cross_val_score(models[j], Xs3, y, cv=kf, n_jobs=-1) #get the accuracy
    print(f"STANDARD SCALING {j}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy


ROBUST SCALING lgr: 81.588%, 0.512%
ROBUST SCALING knn: 77.892%, 0.512%
ROBUST SCALING gnb: 84.326%, 1.355%
ROBUST SCALING svc: 78.371%, 1.116%
ROBUST SCALING dtc: 75.633%, 0.968%
ROBUST SCALING rfc: 83.025%, 1.445%
ROBUST SCALING gbc: 82.204%, 0.844%
ROBUST SCALING mlp: 84.600%, 0.933%
 
MINMAX SCALING lgr: 73.238%, 0.860%
MINMAX SCALING knn: 70.363%, 0.635%
MINMAX SCALING gnb: 84.326%, 1.355%
MINMAX SCALING svc: 75.565%, 0.290%
MINMAX SCALING dtc: 75.017%, 1.426%
MINMAX SCALING rfc: 82.820%, 0.923%
MINMAX SCALING gbc: 82.409%, 1.078%
MINMAX SCALING mlp: 76.523%, 0.774%
 
STANDARD SCALING lgr: 78.166%, 0.678%
STANDARD SCALING knn: 71.458%, 1.099%
STANDARD SCALING gnb: 84.326%, 1.355%
STANDARD SCALING svc: 77.550%, 0.792%
STANDARD SCALING dtc: 75.907%, 1.091%
STANDARD SCALING rfc: 83.641%, 0.635%
STANDARD SCALING gbc: 82.478%, 0.982%
STANDARD SCALING mlp: 81.177%, 0.422%


In [52]:
#First method: #4/2 = 2 reducing the feature by half
pca = PCA(n_components=2)
Xsr1 = pca.fit_transform(Xs1)
Xsr2 = pca.fit_transform(Xs2)
Xsr3 = pca.fit_transform(Xs3)

In [53]:
for n in models:
    scores = cross_val_score(models[n], Xsr1, y, cv=kf, n_jobs=-1) #get the accuracy
    print(f"ROBUST SCALING {n}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy
print(" ")
for i in models:
    scores = cross_val_score(models[i], Xsr2, y, cv=kf, n_jobs=-1) #get the accuracy
    print(f"MINMAX SCALING {i}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy
print(" ")
for j in models:
    scores = cross_val_score(models[j], Xsr3, y, cv=kf, n_jobs=-1) #get the accuracy
    print(f"STANDARD SCALING {j}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy

ROBUST SCALING lgr: 78.850%, 0.290%
ROBUST SCALING knn: 77.139%, 0.512%
ROBUST SCALING gnb: 78.029%, 0.605%
ROBUST SCALING svc: 78.508%, 0.539%
ROBUST SCALING dtc: 70.089%, 2.443%
ROBUST SCALING rfc: 76.660%, 1.011%
ROBUST SCALING gbc: 78.234%, 0.581%
ROBUST SCALING mlp: 79.945%, 0.635%
 
MINMAX SCALING lgr: 62.423%, 2.012%
MINMAX SCALING knn: 59.343%, 1.266%
MINMAX SCALING gnb: 61.875%, 1.943%
MINMAX SCALING svc: 65.024%, 1.116%
MINMAX SCALING dtc: 55.647%, 1.774%
MINMAX SCALING rfc: 61.533%, 0.923%
MINMAX SCALING gbc: 62.971%, 0.589%
MINMAX SCALING mlp: 64.408%, 1.426%
 
STANDARD SCALING lgr: 65.092%, 1.432%
STANDARD SCALING knn: 63.587%, 1.884%
STANDARD SCALING gnb: 62.423%, 1.006%
STANDARD SCALING svc: 66.598%, 0.953%
STANDARD SCALING dtc: 56.400%, 1.839%
STANDARD SCALING rfc: 63.518%, 0.698%
STANDARD SCALING gbc: 65.435%, 1.406%
STANDARD SCALING mlp: 67.762%, 0.731%


In [54]:
#Second method: create new features based on the existing features
win_size = 4
df['mean_precipitation'] = df['precipitation'].rolling(win_size).mean()
df['mean_temp_max'] = df['temp_max'].rolling(win_size).mean()
df['mean_temp'] = df['temp_max'] + df['temp_min'] / 2
df['mean_temp_min'] = df['temp_min'].rolling(win_size).mean()
df['mean_wind'] = df['wind'].rolling(win_size).mean()

print(df.shape)
print(df.head(10))

(1461, 10)
   precipitation  temp_max  temp_min  wind  weather  mean_precipitation  \
0            0.0      12.8       5.0   4.7  drizzle                 NaN   
1           10.9      10.6       2.8   4.5     rain                 NaN   
2            0.8      11.7       7.2   2.3     rain                 NaN   
3           20.3      12.2       5.6   4.7     rain               8.000   
4            1.3       8.9       2.8   6.1     rain               8.325   
5            2.5       4.4       2.2   2.2     rain               6.225   
6            0.0       7.2       2.8   2.3     rain               6.025   
7            0.0      10.0       2.8   2.0      sun               0.950   
8            4.3       9.4       5.0   3.4     rain               1.700   
9            1.0       6.1       0.6   3.4     rain               1.325   

   mean_temp_max  mean_temp  mean_temp_min  mean_wind  
0            NaN       15.3            NaN        NaN  
1            NaN       12.0            NaN        N

In [55]:
df = df.dropna()
df.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather,mean_precipitation,mean_temp_max,mean_temp,mean_temp_min,mean_wind
3,20.3,12.2,5.6,4.7,rain,8.0,11.825,15.0,5.15,4.05
4,1.3,8.9,2.8,6.1,rain,8.325,10.85,10.3,4.6,4.4
5,2.5,4.4,2.2,2.2,rain,6.225,9.3,5.5,4.45,3.825
6,0.0,7.2,2.8,2.3,rain,6.025,8.175,8.6,3.35,3.825
7,0.0,10.0,2.8,2.0,sun,0.95,7.625,11.4,2.65,3.15


In [56]:
X_New = df.drop(columns=["weather"])
y_New = df["weather"]

Xs1 = Robust_scl.fit_transform(X_New)
Xs2 = MinMax_scl.fit_transform(X_New)
Xs3 = Standard_scl.fit_transform(X_New)

In [57]:
X_New.shape

(1458, 9)

In [58]:
y_New.shape

(1458,)

In [62]:
for n in models:
    scores = cross_val_score(models[n], Xs1, y_New, cv=kf, n_jobs=-1) #get the accuracy
    print(f"ROBUST SCALING {n}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy
print(" ")
for i in models:
    scores = cross_val_score(models[i], Xs2, y_New, cv=kf, n_jobs=-1) #get the accuracy
    print(f"MINMAX SCALING {i}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy
print(" ")
for j in models:
    scores = cross_val_score(models[j], Xs3, y_New, cv=kf, n_jobs=-1) #get the accuracy
    print(f"STANDARD SCALING {j}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy

ROBUST SCALING lgr: 81.001%, 2.695%
ROBUST SCALING knn: 74.623%, 1.561%
ROBUST SCALING gnb: 76.406%, 1.851%
ROBUST SCALING svc: 78.532%, 3.063%
ROBUST SCALING dtc: 77.641%, 1.215%
ROBUST SCALING rfc: 85.802%, 2.064%
ROBUST SCALING gbc: 84.705%, 1.093%
ROBUST SCALING mlp: 83.539%, 2.273%
 
MINMAX SCALING lgr: 73.045%, 1.268%
MINMAX SCALING knn: 69.890%, 0.513%
MINMAX SCALING gnb: 76.406%, 1.851%
MINMAX SCALING svc: 74.143%, 1.588%
MINMAX SCALING dtc: 77.915%, 1.468%
MINMAX SCALING rfc: 85.528%, 1.765%
MINMAX SCALING gbc: 84.636%, 1.305%
MINMAX SCALING mlp: 75.995%, 1.515%
 
STANDARD SCALING lgr: 78.601%, 2.310%
STANDARD SCALING knn: 70.850%, 0.829%
STANDARD SCALING gnb: 76.406%, 1.851%
STANDARD SCALING svc: 75.446%, 1.765%
STANDARD SCALING dtc: 77.984%, 1.848%
STANDARD SCALING rfc: 85.048%, 1.873%
STANDARD SCALING gbc: 84.774%, 1.333%
STANDARD SCALING mlp: 79.698%, 3.049%


In [68]:
#X_New = df.drop(columns=["mean_temp"]) delete column of mean value of temperature
X_New.shape

(1458, 8)

In [69]:
Xs1 = Robust_scl.fit_transform(X_New)
Xs2 = MinMax_scl.fit_transform(X_New)
Xs3 = Standard_scl.fit_transform(X_New)

In [70]:
for n in models:
    scores = cross_val_score(models[n], Xs1, y_New, cv=kf, n_jobs=-1) #get the accuracy
    print(f"ROBUST SCALING {n}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy
print(" ")
for i in models:
    scores = cross_val_score(models[i], Xs2, y_New, cv=kf, n_jobs=-1) #get the accuracy
    print(f"MINMAX SCALING {i}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy
print(" ")
for j in models:
    scores = cross_val_score(models[j], Xs3, y_New, cv=kf, n_jobs=-1) #get the accuracy
    print(f"STANDARD SCALING {j}: {scores.mean():.3%}, {scores.std():.3%}")#get the mean and standard deviation of accuracy

ROBUST SCALING lgr: 81.001%, 2.695%
ROBUST SCALING knn: 75.240%, 1.080%
ROBUST SCALING gnb: 78.532%, 0.970%
ROBUST SCALING svc: 78.464%, 2.988%
ROBUST SCALING dtc: 78.464%, 1.796%
ROBUST SCALING rfc: 86.008%, 1.680%
ROBUST SCALING gbc: 85.391%, 1.705%
ROBUST SCALING mlp: 83.471%, 2.690%
 
MINMAX SCALING lgr: 72.977%, 1.283%
MINMAX SCALING knn: 70.096%, 0.540%
MINMAX SCALING gnb: 78.326%, 1.119%
MINMAX SCALING svc: 74.211%, 1.448%
MINMAX SCALING dtc: 78.601%, 1.603%
MINMAX SCALING rfc: 85.460%, 1.911%
MINMAX SCALING gbc: 85.322%, 1.781%
MINMAX SCALING mlp: 76.612%, 1.093%
 
STANDARD SCALING lgr: 78.601%, 2.310%
STANDARD SCALING knn: 71.331%, 1.080%
STANDARD SCALING gnb: 78.532%, 0.970%
STANDARD SCALING svc: 75.789%, 1.933%
STANDARD SCALING dtc: 78.532%, 2.141%
STANDARD SCALING rfc: 85.460%, 1.749%
STANDARD SCALING gbc: 85.322%, 1.781%
STANDARD SCALING mlp: 80.521%, 2.731%
