In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from time import time

import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas/scikit-learn warnings that may point at real problems.
warnings.simplefilter("ignore")
# Module-level scaler instance; the scaling cell further down rebinds `scaler` to a fresh one.
scaler = StandardScaler()



In [2]:
# Read the weather observations into a DataFrame; the path is resolved relative to the
# kernel's working directory. Then echo pandas' plain-text rendering of the frame.
weather_csv = 'modified_weatherAUS.csv'
df = pd.read_csv(weather_csv)
print(df)

            id        Date Location  MinTemp  MaxTemp  Rainfall WindGustDir  \
0            1  2008-12-01   Albury     13.4     22.9       0.6           W   
1            2  2008-12-02   Albury      7.4     25.1       0.0         WNW   
2            3  2008-12-03   Albury     12.9     25.7       0.0         WSW   
3            4  2008-12-04   Albury      9.2     28.0       0.0          NE   
4            5  2008-12-05   Albury     17.5     32.3       1.0           W   
...        ...         ...      ...      ...      ...       ...         ...   
112955  112956  2017-06-20    Uluru      3.5     21.8       0.0           E   
112956  112957  2017-06-21    Uluru      2.8     23.4       0.0           E   
112957  112958  2017-06-22    Uluru      3.6     25.3       0.0         NNW   
112958  112959  2017-06-23    Uluru      5.4     26.9       0.0           N   
112959  112960  2017-06-24    Uluru      7.8     27.0       0.0          SE   

        WindGustSpeed WindDir9am WindDir3pm  ...  P

In [3]:
# Select features and target
features = ['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustDir', 'WindGustSpeed', 
            'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 
            'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainToday',
            'AvgTemp', 'AvgWind', 'AvgRainfall', 'AvgHumidity', 'AvgPressure']
target = 'RainTomorrow'

# Take an explicit copy: later cells overwrite columns of X, and assigning into a bare
# selection of `df` makes pandas raise SettingWithCopyWarning (chained assignment).
X = df[features].copy()
y = df[target]

# Only the last expression of a cell is displayed, so the target column is what gets echoed.
y

0         No
1         No
2         No
3         No
4         No
          ..
112955    No
112956    No
112957    No
112958    No
112959    No
Name: RainTomorrow, Length: 112960, dtype: object

In [4]:
# Convert categorical features to numerical using LabelEncoder
categorical_features = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

# Rebuild X from the raw frame as an independent copy. This keeps the cell re-runnable
# (a second run would otherwise re-encode the integer codes and lose the original
# string -> code mapping stored in `encoders`) and avoids assigning into a selection of `df`.
X = df[features].copy()

# One fitted encoder per column is kept so the integer codes can be mapped back later
# via `encoders[col].inverse_transform(...)` / `encoders[col].classes_`.
encoders = {}
for feature in categorical_features:
    encoder = LabelEncoder()
    X[feature] = encoder.fit_transform(X[feature])
    encoders[feature] = encoder



In [5]:
# Split data into train and test sets (80% train / 20% test, fixed seed for reproducibility)
# NOTE(review): the target is imbalanced (see the per-class support in the reports below);
# consider passing stratify=y so both splits keep the same Yes/No ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Only the last expression of a cell is rendered, so just the test labels are echoed here.
y_test

2271       No
99471      No
10025      No
31065      No
15044      No
         ... 
104759    Yes
94672     Yes
34203     Yes
112605    Yes
101040     No
Name: RainTomorrow, Length: 22592, dtype: object

In [6]:

# Scale numeric features using StandardScaler
scaler = StandardScaler()
# Preserve the column order of `features`; a set difference would give an arbitrary
# order that can change from one kernel session to the next.
numeric_features = [name for name in features if name not in categorical_features]

# Work on explicit copies so the column assignments below do not write into objects
# derived from X (which can trigger pandas' chained-assignment warning).
X_train = X_train.copy()
X_test = X_test.copy()
# Fit the scaler on the training split only, then apply the same statistics to the test split.
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])


def fit_and_report(label, model):
    """Fit `model` on the training split, time the fit, and print its test-set metrics.

    Parameters
    ----------
    label : str
        Human-readable model name inserted into the printed headings.
    model : estimator
        Unfitted scikit-learn classifier; it is fitted in place on X_train / y_train.

    Returns
    -------
    Predictions of the fitted model for X_test.
    """
    start_time = time()
    model.fit(X_train, y_train)
    end_time = time()
    print(f'Time taken to construct {label} model:', end_time - start_time, 'seconds')

    predictions = model.predict(X_test)
    print(f'Classification report for {label} model:')
    print(classification_report(y_test, predictions))
    print(f'Confusion matrix for {label} model:')
    print(confusion_matrix(y_test, predictions))
    return predictions


# Train decision tree classifier and predict on test set
dt_model = DecisionTreeClassifier(random_state=42)
dt_predict = fit_and_report('decision tree', dt_model)

# Create gradient boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_predict = fit_and_report('gradient boosting', gb_model)

# Create random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_predict = fit_and_report('random forest', rf_model)


Time taken to construct decision tree model: 1.1061902046203613 seconds
Classification report for decision tree model:
              precision    recall  f1-score   support

          No       0.87      0.86      0.86     17715
         Yes       0.51      0.54      0.52      4877

    accuracy                           0.79     22592
   macro avg       0.69      0.70      0.69     22592
weighted avg       0.79      0.79      0.79     22592

Confusion matrix for decision tree model:
[[15148  2567]
 [ 2230  2647]]
Time taken to construct gradient boosting model: 17.86007308959961 seconds
Classification report for gradient boosting model:
              precision    recall  f1-score   support

          No       0.87      0.95      0.91     17715
         Yes       0.73      0.51      0.60      4877

    accuracy                           0.85     22592
   macro avg       0.80      0.73      0.75     22592
weighted avg       0.84      0.85      0.84     22592

Confusion matrix for gradien

In [7]:
# Overall accuracy of each fitted model on the held-out test labels
dt_acc, gb_acc, rf_acc = (
    accuracy_score(y_test, predicted)
    for predicted in (dt_predict, gb_predict, rf_predict)
)

# Print one "<model> accuracy: <value>" line per model, in the order they were trained
for caption, value in (
    ('decision tree accuracy:', dt_acc),
    ('gradient boosting accuracy:', gb_acc),
    ('random forest accuracy:', rf_acc),
):
    print(caption, value)


decision tree accuracy: 0.7876682011331445
gradient boosting accuracy: 0.8529567988668555
random forest accuracy: 0.8560995042492918
