In [None]:
import pandas as pd
import seaborn as sns
import numpy  as np
import matplotlib.pyplot as plt

#load dataset
df=pd.read_csv('weather.csv')

#handle null and duplicate values
#there is no duplicate and null values
df

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14,73,9.5,82,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39,96,8.5,71,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30,64,7.0,16,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38,83,1.5,82,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27,74,17.0,66,overcast,990.67,1,Winter,2.5,mountain,Rainy
...,...,...,...,...,...,...,...,...,...,...,...
13195,10,74,14.5,71,overcast,1003.15,1,Summer,1.0,mountain,Rainy
13196,-1,76,3.5,23,cloudy,1067.23,1,Winter,6.0,coastal,Snowy
13197,30,77,5.5,28,overcast,1012.69,3,Autumn,9.0,coastal,Cloudy
13198,3,76,10.0,94,overcast,984.27,0,Winter,2.0,inland,Snowy


In [None]:
#outliers treatment
from scipy import stats

numeric_col=df.select_dtypes(include='number').columns
z_score=np.abs(stats.zscore(df[numeric_col]))
outliers_mask = (z_score > 3).any(axis=1)

df_outlier=df[outliers_mask]

#remove the outlier

df=df.drop(df_outlier.index)
df.shape

(12151, 11)

In [None]:
#scaling
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
df[numeric_col]=scaler.fit_transform(df[numeric_col])
df

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,-0.310162,0.188334,0.010892,0.915933,partly cloudy,0.266545,-0.471527,Winter,-0.606073,inland,Rainy
1,1.235710,1.356867,-0.156214,0.572759,partly cloudy,0.301365,0.885473,Spring,1.777582,inland,Cloudy
2,0.679196,-0.268918,-0.406874,-1.143110,clear,0.717483,0.342673,Spring,0.127359,mountain,Sunny
3,1.173875,0.696392,-1.325958,0.915933,clear,1.147300,0.885473,Spring,-1.522864,coastal,Sunny
4,0.493692,0.239140,1.264188,0.416771,overcast,-0.883630,-0.742928,Winter,-0.972789,mountain,Rainy
...,...,...,...,...,...,...,...,...,...,...,...
13195,-0.557501,0.239140,0.846423,0.572759,overcast,-0.171263,-0.742928,Summer,-1.522864,mountain,Rainy
13196,-1.237685,0.340751,-0.991745,-0.924727,cloudy,3.486466,-0.742928,Winter,0.310717,coastal,Snowy
13197,0.679196,0.391557,-0.657533,-0.768739,overcast,0.373286,-0.200127,Autumn,1.410866,coastal,Cloudy
13198,-0.990345,0.340751,0.094445,1.290305,overcast,-1.248946,-1.014328,Winter,-1.156148,inland,Snowy


In [None]:
#feature selection (anova==>numeric - object)
from sklearn.feature_selection import SelectKBest,chi2,f_classif
x=df.drop('Weather Type',axis=1)
y=df['Weather Type']

numeric_col_x=x.select_dtypes(include='number').columns

selector=SelectKBest(f_classif,k=5)
selector.fit(x[numeric_col_x],y)
x_anova=df[x.columns[selector.get_support(indices=True)]]
x_anova

Unnamed: 0,Temperature,Humidity,Precipitation (%),Atmospheric Pressure,UV Index
0,-0.310162,0.188334,0.915933,0.266545,-0.471527
1,1.235710,1.356867,0.572759,0.301365,0.885473
2,0.679196,-0.268918,-1.143110,0.717483,0.342673
3,1.173875,0.696392,0.915933,1.147300,0.885473
4,0.493692,0.239140,0.416771,-0.883630,-0.742928
...,...,...,...,...,...
13195,-0.557501,0.239140,0.572759,-0.171263,-0.742928
13196,-1.237685,0.340751,-0.924727,3.486466,-0.742928
13197,0.679196,0.391557,-0.768739,0.373286,-0.200127
13198,-0.990345,0.340751,1.290305,-1.248946,-1.014328


In [None]:
#label encoding
from sklearn.preprocessing import LabelEncoder

y_final=pd.DataFrame(LabelEncoder().fit_transform(y),columns=['Weather Type'])

x_final=pd.concat([x,y_final],axis=1)

object_col_x=x.select_dtypes(include='object').columns

for col in object_col_x:
  label_encoder=LabelEncoder()
  x[col]=label_encoder.fit_transform(x[col])

In [None]:
# feature selection (chi square==>object - object)

selector1=SelectKBest(chi2,k=2)

selector1.fit(x[object_col_x],y)
x_chi=x[x[object_col_x].columns[selector1.get_support(indices=True)]]


#final x
x_final=x_anova.join(x_chi)
x_final

# final y
y_final=pd.Series(y_final['Weather Type'])


In [34]:
# model building with cross validation

from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression

model=LogisticRegression()

# calculate the cross validation score
cross_score=cross_val_score(model,x_final,y_final,cv=5)
print("Cross validation score",cross_score)
print("Mean of corss validation score",np.mean(cross_score))

#model prediction and evaluation

y_pred=cross_val_predict(model,x_final,y_final,cv=5)
print("\nAccuracy score",accuracy_score(y_final,y_pred))
print('\nclassification report\n')
print(classification_report(y_final,y_pred))
print('confusion matrix')
print(confusion_matrix  (y_final,y_pred))

Cross validation score [0.87083505 0.89341564 0.89012346 0.88436214 0.88024691]
Mean of corss validation score 0.8837966390907568

Accuracy score 0.883795572380874

classification report

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      3101
           1       0.83      0.86      0.85      2978
           2       0.95      0.97      0.96      3027
           3       0.91      0.86      0.88      3045

    accuracy                           0.88     12151
   macro avg       0.88      0.88      0.88     12151
weighted avg       0.88      0.88      0.88     12151

confusion matrix
[[2642  303   34  122]
 [ 240 2550   90   98]
 [  21   18 2939   49]
 [ 216  186   35 2608]]
