In [22]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [23]:
data=pd.read_csv("weather.csv")

In [24]:
data

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,01-01-2012,0.0,12.8,5.0,4.7,drizzle
1,02-01-2012,10.9,10.6,2.8,4.5,rain
2,03-01-2012,0.8,11.7,7.2,2.3,rain
3,04-01-2012,20.3,12.2,5.6,4.7,rain
4,05-01-2012,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...,...
2917,27-12-2015,8.6,4.4,1.7,2.9,rain
2918,28-12-2015,1.5,5.0,1.7,1.3,rain
2919,29-12-2015,0.0,7.2,0.6,2.6,fog
2920,30-12-2015,0.0,5.6,-1.0,3.4,sun


In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2922 entries, 0 to 2921
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           2922 non-null   object 
 1   precipitation  2922 non-null   float64
 2   temp_max       2922 non-null   float64
 3   temp_min       2922 non-null   float64
 4   wind           2922 non-null   float64
 5   weather        2922 non-null   object 
dtypes: float64(4), object(2)
memory usage: 137.1+ KB


In [26]:
data.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,01-01-2012,0.0,12.8,5.0,4.7,drizzle
1,02-01-2012,10.9,10.6,2.8,4.5,rain
2,03-01-2012,0.8,11.7,7.2,2.3,rain
3,04-01-2012,20.3,12.2,5.6,4.7,rain
4,05-01-2012,1.3,8.9,2.8,6.1,rain


In [27]:
data.isnull().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

In [28]:
data.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,2922.0,2922.0,2922.0,2922.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.679051,7.3485,5.022144,1.437579
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


In [29]:
data.shape

(2922, 6)

In [30]:
data['weather'].value_counts()

weather
rain       1282
sun        1280
fog         202
drizzle     106
snow         52
Name: count, dtype: int64

In [31]:
X = data.drop(['weather'], axis=1)
Y = data['weather']

In [32]:
X

Unnamed: 0,date,precipitation,temp_max,temp_min,wind
0,01-01-2012,0.0,12.8,5.0,4.7
1,02-01-2012,10.9,10.6,2.8,4.5
2,03-01-2012,0.8,11.7,7.2,2.3
3,04-01-2012,20.3,12.2,5.6,4.7
4,05-01-2012,1.3,8.9,2.8,6.1
...,...,...,...,...,...
2917,27-12-2015,8.6,4.4,1.7,2.9
2918,28-12-2015,1.5,5.0,1.7,1.3
2919,29-12-2015,0.0,7.2,0.6,2.6
2920,30-12-2015,0.0,5.6,-1.0,3.4


In [33]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=70)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(2337, 5) (585, 5) (2337,) (585,)


In [34]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

In [35]:
                                                                 # Define columns and transformers for preprocessing
numeric_features = ['precipitation','temp_max','temp_min','wind']
categorical_features = ['date']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Replace missing values with median
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),  # One-hot encode categorical variables
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

In [36]:
from sklearn.tree import DecisionTreeClassifier

dt= Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier(random_state=42))])

# Train the classifier
dt.fit(X_train, Y_train)
Y_pred = dt.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Display classification report (precision, recall, F1-score)
print("Classification Report:")
print(classification_report(Y_test, Y_pred)) 

Accuracy: 96.58%
Classification Report:
              precision    recall  f1-score   support

     drizzle       0.93      0.86      0.89        29
         fog       0.94      0.72      0.82        43
        rain       0.98      1.00      0.99       244
        snow       1.00      0.83      0.91        12
         sun       0.96      0.99      0.97       257

    accuracy                           0.97       585
   macro avg       0.96      0.88      0.92       585
weighted avg       0.97      0.97      0.96       585



In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Update the classifier in the pipeline to RandomForestClassifier
rf= Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', RandomForestClassifier(random_state=42))])

# Train the classifier
rf.fit(X_train, Y_train)

# Make predictions
Y_pred = rf.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Display classification report (precision, recall, F1-score)
print("Classification Report:")
print(classification_report(Y_test, Y_pred))


Accuracy: 96.24%
Classification Report:
              precision    recall  f1-score   support

     drizzle       1.00      0.72      0.84        29
         fog       1.00      0.72      0.84        43
        rain       0.99      1.00      1.00       244
        snow       1.00      0.83      0.91        12
         sun       0.93      1.00      0.96       257

    accuracy                           0.96       585
   macro avg       0.98      0.86      0.91       585
weighted avg       0.96      0.96      0.96       585



In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Update the classifier in the pipeline to KNeighborsClassifier
knn = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', KNeighborsClassifier(n_neighbors=5))])

# Train the classifier
knn.fit(X_train, Y_train)

# Make predictions
Y_pred = knn.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Display classification report (precision, recall, F1-score)
print("Classification Report:")
print(classification_report(Y_test, Y_pred))


Accuracy: 80.17%
Classification Report:
              precision    recall  f1-score   support

     drizzle       0.33      0.21      0.26        29
         fog       0.32      0.16      0.22        43
        rain       0.88      0.90      0.89       244
        snow       1.00      0.42      0.59        12
         sun       0.80      0.90      0.85       257

    accuracy                           0.80       585
   macro avg       0.67      0.52      0.56       585
weighted avg       0.78      0.80      0.78       585



In [39]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Update the classifier in the pipeline to SVC
svc= Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', SVC(kernel='linear', random_state=42))])

# Train the classifier
svc.fit(X_train, Y_train)

# Make predictions
Y_pred = svc.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Display classification report (precision, recall, F1-score)
print("Classification Report:")
print(classification_report(Y_test, Y_pred))


Accuracy: 95.56%
Classification Report:
              precision    recall  f1-score   support

     drizzle       1.00      0.66      0.79        29
         fog       1.00      0.70      0.82        43
        rain       0.99      1.00      0.99       244
        snow       1.00      0.75      0.86        12
         sun       0.92      1.00      0.96       257

    accuracy                           0.96       585
   macro avg       0.98      0.82      0.88       585
weighted avg       0.96      0.96      0.95       585



In [40]:
data

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,01-01-2012,0.0,12.8,5.0,4.7,drizzle
1,02-01-2012,10.9,10.6,2.8,4.5,rain
2,03-01-2012,0.8,11.7,7.2,2.3,rain
3,04-01-2012,20.3,12.2,5.6,4.7,rain
4,05-01-2012,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...,...
2917,27-12-2015,8.6,4.4,1.7,2.9,rain
2918,28-12-2015,1.5,5.0,1.7,1.3,rain
2919,29-12-2015,0.0,7.2,0.6,2.6,fog
2920,30-12-2015,0.0,5.6,-1.0,3.4,sun


In [42]:
new_data = {
            'date': '01-01-2012',
            'precipitation':0.0,
           'temp_max':12.8,
            'temp_min': 5.0,
           'wind':4.7
           }

# Create a DataFrame from the input data
new_df = pd.DataFrame([new_data])

# Use the trained model to make predictions
prediction = rf.predict(new_df)
prediction[0]

'drizzle'

In [43]:
new_data = {
            'date': '02-01-2012	',
            'precipitation':10.9,
           'temp_max':10.6,
            'temp_min': 2.8,
           'wind':4.5
           }

# Create a DataFrame from the input data
new_df = pd.DataFrame([new_data])

# Use the trained model to make predictions
prediction = rf.predict(new_df)
prediction[0]

'rain'

In [45]:
new_data = {
            'date': '29-12-2015	',
            'precipitation':0.0,
           'temp_max':7.2,
            'temp_min':0.6,
           'wind':2.6
           }

# Create a DataFrame from the input data
new_df = pd.DataFrame([new_data])

# Use the trained model to make predictions
prediction = rf.predict(new_df)
prediction[0]

'sun'

In [46]:
import pickle
# Dump the trained xgb  with Pickle
rf_pkl_filename = 'rf.pkl'
# Open the file to save as pkl file
rf_Model_pkl = open(rf_pkl_filename, 'wb')
pickle.dump(rf, rf_Model_pkl)
# Close the pickle instances
rf_Model_pkl.close() 