In [2]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# import classification_report
from sklearn.metrics import classification_report

# Read the processed weather-classification dataset into a DataFrame
data_path = '../data/processed/weather_classification_data.csv'
df = pd.read_csv(data_path)

In [3]:
# One-hot encode the object-typed feature columns.
# `weather_type` is the target, so it is removed from the list and stays unencoded.
non_numerical_columns = (
    df.select_dtypes(include='object')
    .columns
    .drop('weather_type')
)
# drop_first=True omits the first dummy level of each encoded column
df = pd.get_dummies(data=df, columns=non_numerical_columns, drop_first=True)

In [4]:
# Relative frequency of each class in the target column
class_proportions = df['weather_type'].value_counts(normalize=True)
print(class_proportions)

weather_type
Rainy     0.25
Cloudy    0.25
Sunny     0.25
Snowy     0.25
Name: proportion, dtype: float64


The target column is balanced: each of the four classes (Rainy, Cloudy, Sunny, Snowy) accounts for 25% of the rows.

In [5]:
# Separate the `weather_type` label (y) from the predictor columns (X)
y = df['weather_type']
X = df.drop(columns='weather_type')

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display the dimensions of each partition as a sanity check
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10560, 15), (2640, 15), (10560,), (2640,))

In [6]:
# Random-forest hyperparameters shared by every experiment in this notebook
MAX_LEAF_NODES = 32    # passed as `max_leaf_nodes` to each forest
N_ESTIMATORS = 1_000   # passed as `n_estimators` to each forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: random forest trained on the raw (unscaled) features.
# random_state is fixed (same seed value as the train/test split) so that
# re-running the cell rebuilds the same forest and reproduces the same score;
# without it every run draws different bootstrap samples and feature subsets.
rnd_clf = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_leaf_nodes=MAX_LEAF_NODES,
    n_jobs=-1,  # use all available CPU cores
    random_state=42,
)
rnd_clf.fit(X_train, y_train)

In [8]:
from sklearn.metrics import accuracy_score

# Predict the class of every held-out row with the fitted forest
y_pred = rnd_clf.predict(X_test)

# Fraction of test rows whose predicted class matches the true label
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9132575757575757

## Standardized

In [9]:
# 2. Standardize the features, then retrain and re-evaluate the forest
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so no statistics from the test
# set are used; the test rows are transformed with the training parameters.
scaler = StandardScaler()
X_train_standarized = scaler.fit_transform(X_train)
X_test_standarized = scaler.transform(X_test)

# random_state is fixed so the forest (and therefore the accuracy below) is
# reproducible; otherwise differences between experiments cannot be told
# apart from run-to-run seed noise.
rnd_clf = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_leaf_nodes=MAX_LEAF_NODES,
    n_jobs=-1,  # use all available CPU cores
    random_state=42,
)
rnd_clf.fit(X_train_standarized, y_train)

# Predict on the standardized test rows
y_pred_standar = rnd_clf.predict(X_test_standarized)

# Evaluate: fraction of test rows classified correctly
accuracy_score(y_test, y_pred_standar)

KeyboardInterrupt: 

In [None]:
# Per-class precision / recall / F1 for the forest trained on standardized features
report_standardized = classification_report(y_test, y_pred_standar)
print(report_standardized)

              precision    recall  f1-score   support

      Cloudy       0.87      0.89      0.88       651
       Rainy       0.90      0.91      0.91       647
       Snowy       0.98      0.93      0.95       701
       Sunny       0.90      0.92      0.91       641

    accuracy                           0.91      2640
   macro avg       0.91      0.91      0.91      2640
weighted avg       0.91      0.91      0.91      2640



## Normalized

In [None]:
# Normalize the features with min-max scaling, then retrain and re-evaluate the forest
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only, so no statistics from the test
# set are used; the test rows are transformed with the training parameters.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# random_state is fixed so the forest (and therefore the accuracy below) is
# reproducible; otherwise differences between experiments cannot be told
# apart from run-to-run seed noise.
rnd_clf = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_leaf_nodes=MAX_LEAF_NODES,
    n_jobs=-1,  # use all available CPU cores
    random_state=42,
)
rnd_clf.fit(X_train_normalized, y_train)

# Predict on the normalized test rows
y_pred_norm = rnd_clf.predict(X_test_normalized)

# Evaluate: fraction of test rows classified correctly
accuracy_score(y_test, y_pred_norm)

0.9128787878787878

In [None]:
# Per-class precision / recall / F1 for the forest trained on min-max normalized features
report_normalized = classification_report(y_test, y_pred_norm)
print(report_normalized)

              precision    recall  f1-score   support

      Cloudy       0.87      0.89      0.88       651
       Rainy       0.90      0.91      0.91       647
       Snowy       0.97      0.93      0.95       701
       Sunny       0.91      0.92      0.91       641

    accuracy                           0.91      2640
   macro avg       0.91      0.91      0.91      2640
weighted avg       0.91      0.91      0.91      2640

