In [None]:
# Make Google Drive available inside the Colab runtime; the dataset loaded
# further down is read from a path under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Importing the preprocessed dataset and scaling it**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the preprocessed weather data from the mounted Drive folder.
data = pd.read_csv('/content/drive/MyDrive/dataset/weatherAUS_preprocessed.csv')

# 'RainTomorrow' is the label; every remaining column is used as a feature.
y = data['RainTomorrow']
X = data.drop(columns='RainTomorrow')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

**Logistic Regression VS Naive Bayes**

Baseline accuracy for the linear models:

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Build both baseline classifiers up front.
log_reg = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
nb = GaussianNB()

# Logistic regression: fit on the training split, then score the held-out split.
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression Accuracy: {log_reg_accuracy}")

# Gaussian Naive Bayes: same procedure on the same splits.
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {nb_accuracy}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.8450433108758422
Naive Bayes Accuracy: 0.6987487969201155


Hyperparameter tuning for logistic regression:

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: inverse regularisation strength C crossed with two solvers.
log_reg_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# 5-fold cross-validated grid search scored on accuracy.
# max_iter=1000 mirrors the baseline logistic regression trained earlier in
# this notebook. Without it every candidate runs with the default of 100
# iterations, so the 'saga' candidates would be compared against a baseline
# that was given ten times the iteration budget.
log_reg_grid_search = GridSearchCV(
    LogisticRegression(random_state=42, max_iter=1000),
    log_reg_param_grid,
    cv=5,
    scoring='accuracy',
)
log_reg_grid_search.fit(X_train, y_train)

# Report the winning combination and score the best estimator on the
# held-out test split.
print(f"Best parameters for Logistic Regression: {log_reg_grid_search.best_params_}")
best_log_reg = log_reg_grid_search.best_estimator_
y_pred_best_log_reg = best_log_reg.predict(X_test)
print(f"Logistic Regression Accuracy after tuning: {accuracy_score(y_test, y_pred_best_log_reg)}")




Best parameters for Logistic Regression: {'C': 100, 'solver': 'liblinear'}
Logistic Regression Accuracy after tuning: 0.8450433108758422


Hyperparameter tuning for Naive Bayes:

In [None]:
# Naive Bayes usually needs little tuning; only var_smoothing is searched.
nb_param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7]}

# 5-fold cross-validated search scored on accuracy.
nb_grid_search = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=nb_param_grid,
    scoring='accuracy',
    cv=5,
)
nb_grid_search.fit(X_train, y_train)

# Report the selected setting, then score the best estimator on the test split.
print(f"Best parameters for Naive Bayes: {nb_grid_search.best_params_}")
best_nb = nb_grid_search.best_estimator_
y_pred_best_nb = best_nb.predict(X_test)
nb_tuned_accuracy = accuracy_score(y_test, y_pred_best_nb)
print(f"Naive Bayes Accuracy after tuning: {nb_tuned_accuracy}")


Best parameters for Naive Bayes: {'var_smoothing': 1e-09}
Naive Bayes Accuracy after tuning: 0.6987487969201155


**Decision Tree VS Neural Networks**

Baseline accuracy for the non-linear models:

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Baseline decision tree: default hyperparameters, fixed seed.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred_dt = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {dt_accuracy}")

from sklearn.neural_network import MLPClassifier

# Baseline multi-layer perceptron: default architecture, fixed seed, and an
# iteration cap of 1000.
nn = MLPClassifier(max_iter=1000, random_state=42)
nn.fit(X_train, y_train)  # X_train is expected to be scaled at this point

# Accuracy on the held-out test split.
y_pred_nn = nn.predict(X_test)
nn_accuracy = accuracy_score(y_test, y_pred_nn)
print(f"Neural Network Accuracy: {nn_accuracy}")


Decision Tree Accuracy: 0.7812457032861267
Neural Network Accuracy: 0.8374810944589578


Hyperparameter tuning for Decision Tree:

In [None]:
from sklearn.model_selection import GridSearchCV

# Tree-size controls to search over: depth limit and the minimum sample
# counts required at a leaf and at a split.
dt_param_grid = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
)

# 5-fold cross-validated search scored on accuracy.
dt_grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=dt_param_grid,
    scoring='accuracy',
    cv=5,
)
dt_grid_search.fit(X_train, y_train)

print(f"Best parameters for Decision Tree: {dt_grid_search.best_params_}")



Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}


Hyperparameter tuning for Neural Networks:

In [None]:
# Search space for the MLP: layer layout, activation function, optimiser and
# the alpha penalty term.
nn_param_grid = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50), (100, 100)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
)

# 5-fold cross-validated search scored on accuracy; each candidate uses the
# same seed and iteration cap as the baseline network.
nn_grid_search = GridSearchCV(
    estimator=MLPClassifier(random_state=42, max_iter=1000),
    param_grid=nn_param_grid,
    scoring='accuracy',
    cv=5,
)
nn_grid_search.fit(X_train, y_train)

print(f"Best parameters for Neural Network: {nn_grid_search.best_params_}")




Accuracy for Decision Tree and Neural Networks after Hyperparameter tuning:

In [None]:
# Take the best decision tree found by the grid search and score it on the
# held-out test split.
best_dt = dt_grid_search.best_estimator_
y_pred_best_dt = best_dt.predict(X_test)
dt_tuned_accuracy = accuracy_score(y_test, y_pred_best_dt)
print(f"Decision Tree Accuracy after tuning: {dt_tuned_accuracy}")


Decision Tree Accuracy after tuning: 0.836381135707411


In [None]:
# Take the best network found by the grid search and score it on the
# held-out test split.
best_nn = nn_grid_search.best_estimator_
y_pred_best_nn = best_nn.predict(X_test)
nn_tuned_accuracy = accuracy_score(y_test, y_pred_best_nn)
print(f"Neural Network Accuracy after tuning: {nn_tuned_accuracy}")

**Hybrid model: SVM VS Bayesian Network**

Baseline accuracy for SVM:

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Baseline support vector classifier: default settings, fixed seed.
svm = SVC(random_state=42)
svm.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred_svm = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy}")


In [None]:
!pip install pomegranate
!pip install pgmpy

Collecting pomegranate
  Downloading pomegranate-1.0.4-py3-none-any.whl (92 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/92.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m92.2/92.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.4/92.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting apricot-select>=0.6.1 (from pomegranate)
  Downloading apricot-select-0.6.1.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from apricot-select>=0.6.1->pomegranate)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.9.0->pomegranate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collec

Baseline accuracy of Bayesian Network:

In [None]:
# Build a Bayesian network from the training data, with the label appended to
# the feature matrix as the last column.
# NOTE(review): this notebook's install log shows pomegranate 1.0.4. The 1.x
# rewrite is believed to expose BayesianNetwork from
# pomegranate.bayesian_network and to have dropped from_samples — confirm that
# this import and the from_samples call below run against the installed version.
from pomegranate import BayesianNetwork
from sklearn.metrics import accuracy_score
import numpy as np

# Stack features and label side by side: reshape(-1, 1) turns the label into a
# single column so it can be concatenated along axis 1.
X_train_bn = np.array(X_train)
y_train_bn = np.array(y_train).reshape(-1, 1)
data_bn = np.concatenate((X_train_bn, y_train_bn), axis=1)

# Construct the network directly from the samples.
# presumably from_samples also learns the structure (algorithm='exact') rather
# than relying on a structure learnt beforehand — TODO confirm.
# NOTE(review): X_train holds StandardScaler output (continuous values);
# presumably the network treats every column as discrete — verify that the
# features are suitable, or discretise them first.
bn = BayesianNetwork.from_samples(data_bn, algorithm='exact')
# No predictions are made and no accuracy is computed in this cell, so the
# accuracy_score import above is unused here. Evaluating the network still has
# to be written to match the baseline-accuracy heading.


Hyperparameter tuning for SVM:

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the SVM, written as one sub-grid per kernel family.
# `gamma` is a kernel coefficient for 'rbf' and 'poly' and is ignored by the
# 'linear' kernel, so crossing it with 'linear' would fit every linear
# candidate twice with identical results. Splitting the grid drops those
# redundant fits while covering the same set of distinct models.
svm_param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf', 'poly'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']},
]

# 5-fold cross-validated grid search scored on accuracy.
svm_grid_search = GridSearchCV(SVC(random_state=42), svm_param_grid, cv=5, scoring='accuracy')
svm_grid_search.fit(X_train, y_train)

# For a winning linear kernel, best_params_ carries no 'gamma' entry.
print(f"Best parameters for SVM: {svm_grid_search.best_params_}")


Accuracy of SVM after Hyperparameter tuning:

In [None]:
# Take the best SVM found by the grid search and score it on the held-out
# test split.
best_svm = svm_grid_search.best_estimator_
y_pred_best_svm = best_svm.predict(X_test)
svm_tuned_accuracy = accuracy_score(y_test, y_pred_best_svm)
print(f"SVM Accuracy after tuning: {svm_tuned_accuracy}")
