<a href="https://colab.research.google.com/github/leninworld/lights_research_notebook_templates/blob/main/machine_learning_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Basic code snippets and workflow


In [None]:
# Mount Google Drive inside the Colab runtime so that its files are reachable
# under the /content/drive path used by the cells below.
from google.colab import drive
# Note: this will pop up asking for google login permission
drive.mount('/content/drive')

In [None]:
# linux command to list the files under linux running with ! at the start of the command
# colab Jupyter notebook (Prints dir/files/links in your Drive)
!ls -ltr /content/drive/MyDrive/ | grep *.csv

In [None]:
# List everything in the mounted Drive folder (long format, sorted by time,
# newest last) to check that the expected data file is present.
!ls -ltr /content/drive/MyDrive/

In [None]:
# importing required library

import warnings
import traceback
import numpy as np
%matplotlib inline
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.svm import SVC
from sklearn import metrics
from tabulate import tabulate
from sklearn import naive_bayes
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.utils import class_weight
warnings.filterwarnings(action='ignore')
from sklearn.metrics import recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics  import f1_score,accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# creating pretty print function

n=2
def pretty_print(df, n=2):
  """Print the first ``n`` rows of ``df`` as a psql-style text table.

  Parameters
  ----------
  df : DataFrame to preview; only ``df.head(n)`` is rendered.
  n : number of leading rows to show (defaults to 2). This parameter shadows
      the notebook-level ``n`` defined just above the function.

  Returns
  -------
  None. The table is written to standard output.
  """
  print(tabulate(df.head(n), headers='keys', tablefmt='psql'))

In [None]:
# reading dataset from drive

# TODO: replace 'your_dataset.csv' with the name of your CSV file.
# pd.read_csv needs the path of a file; the bare Drive folder cannot be parsed.
DATA_PATH = '/content/drive/MyDrive/your_dataset.csv'
df = pd.read_csv(DATA_PATH)
pretty_print(df,1)

# check the size and shape of the dataset
# (shape is (rows, columns); size is the total number of cells)
print('Shape', df.shape)
print('Size', df.size)

In [None]:
# check the data types and info of the data
# df.info() prints its summary directly; df.dtypes is a bare expression that
# the notebook displays as cell output.
df.info()
df.dtypes

In [None]:
# Summary statistics of the dataset, transposed so that each original column
# becomes one row of the displayed table.

df.describe().transpose()

In [None]:
# drop columns

# 'Column1' ... 'Column4' are placeholders: replace them with the columns you
# want to remove from your own dataset.
df = df.drop(columns=['Column1', 'Column2','Column3','Column4'])
# Preview the frame that was just modified (`df1` is not defined in this notebook).
pretty_print(df,1)
print("Shape:", df.shape)

In [None]:
# data Preprocessing

# The data mixes text columns with numeric ones, so the text has to be encoded
# in some numeric form before the train/test split.

# Work on a copy so the loaded frame `df` stays untouched by the encoding steps.
df_new = df.copy()
# Preview the working copy that the following cells operate on.
pretty_print(df_new,1)

In [None]:
# Ordinal encoding of several categorical columns at once: each category is
# replaced by a numeric code.

# Placeholder column names - replace with the categorical columns of your dataset.
ordinal_cols = ["Col4","Col5", "Col6"]

enc = OrdinalEncoder()
# Encode the working copy `df_new`; the frame name used here before is not
# defined anywhere in this notebook. fit_transform learns the categories and
# converts the columns in a single call.
df_new[ordinal_cols] = enc.fit_transform(df_new[ordinal_cols])
pretty_print(df_new, 10)

In [None]:
# LabelEncoding is applied to a single column of the dataframe: the column that
# is going to be used as the label.

label_encoder = LabelEncoder()

# '<label>' is a placeholder for the name of your label column. The source is
# the working copy `df_new`; the frame name used here before is not defined
# anywhere in this notebook.
df_new['<label>'] = label_encoder.fit_transform(df_new['<label>'])
pretty_print(df_new,2)

In [None]:
# helper function - split the comma-separated 'Ids' value of a row into a list

def convert_list(row):
  """Split ``row['Ids']`` on commas and convert the numeric pieces to ``int``.

  Parameters
  ----------
  row : mapping or Series with an ``'Ids'`` entry; its value is passed through
      ``str()`` before being split.

  Returns
  -------
  list with one element per comma-separated piece: an ``int`` when the piece
  consists of decimal digits only, otherwise the piece itself as a string.
  """
  # Strip surrounding whitespace so that "1, 2" yields 1 and 2 rather than
  # leaving ' 2' behind as a string.
  tokens = (piece.strip() for piece in str(row['Ids']).split(','))
  # isdecimal() only accepts characters that int() can parse; isdigit() also
  # accepts characters such as superscript digits, which make int() raise.
  return [int(tok) if tok.isdecimal() else tok for tok in tokens]

# Build 'Ids_new' by running convert_list on every row (axis=1), then explode
# it so that each element of the resulting list gets a row of its own.
df_new = (
    df_new
    .assign(Ids_new=df_new.apply(convert_list, axis=1))
    .explode('Ids_new')
)
df_new.dtypes

In [None]:
# helper function - turn the object-typed 'Ids_new' value of a row into a number

def fill_na_0(row):
  """Return ``row['Ids_new']`` as an ``int`` when its string form is numeric, else 0."""
  value = row['Ids_new']
  return int(value) if str(value).isnumeric() else 0

# creating new column: fill_na_0 is run on every row (axis=1) and its result
# is stored as 'new_Ids'
df_new['new_Ids'] = df_new.apply(fill_na_0, axis=1)
pretty_print(df_new,1)
df_new.dtypes

In [None]:
# Remove the raw 'Ids' column and the intermediate 'Ids_new' column, then show
# the dtypes of the columns that remain.

df_new = df_new.drop(['Ids', 'Ids_new'], axis=1)
df_new.dtypes

In [None]:
# mean and standard deviation of the label column; lambda_dup (defined below in
# this cell) reads `m` and `sd` as its bucket thresholds
m = df_new['<label>'].mean()
print("mean",m)

sd = df_new['<label>'].std()
print("standard deviation",sd)

def lambda_dup(df_new, mean=None, std=None):
  """Bucket the '<label>' value of one row into a class 0-3.

  Parameters
  ----------
  df_new : a single row (Series or mapping) holding a '<label>' entry.
  mean, std : optional thresholds. When omitted, the notebook-level `m` and
      `sd` are used, so `df.apply(lambda_dup, axis=1)` keeps working.

  Returns
  -------
  0 when the value is exactly 0 (or is NaN, which fails every comparison),
  1 when value <= mean + 1*std,
  2 when value <= mean + 2*std,
  3 when value <= mean + 3*std or above it.
  """
  center = m if mean is None else mean
  spread = sd if std is None else std
  value = df_new['<label>']

  if value == 0:
    return 0
  for bucket in (1, 2, 3):
    if value <= center + bucket*spread:
      return bucket
  # Values beyond mean + 3*std are the most extreme ones; they belong in the
  # top class instead of being sent back to class 0 together with the zeros.
  # NaN fails this comparison as well and therefore still maps to 0.
  return 3 if value > center + 3*spread else 0

In [None]:
# Apply lambda_dup to every row (axis=1) and store the returned class in a new
# column named lambda_dup_num

df_new['lambda_dup_num'] = df_new.apply(lambda_dup, axis=1)
pretty_print(df_new,1)
df_new.dtypes

In [None]:
# applying the lambda_dup bucketing to the values of a single column

def ordinal_encoding(df_new,column,ordering):
  """Return a copy of ``df_new`` with ``column`` replaced by its lambda_dup class.

  Parameters
  ----------
  df_new : input DataFrame; it is copied, so the caller's frame is not modified.
  column : name of the column whose values are bucketed.
  ordering : not used by the body; kept so existing calls stay valid.
      NOTE(review): confirm whether an explicit category ordering was intended.

  Returns
  -------
  DataFrame copy with ``column`` overwritten by the bucket numbers.
  """
  df_new = df_new.copy()
  # lambda_dup reads its input through the '<label>' key, so every scalar of
  # the column is wrapped in a one-entry mapping; handing it the bare scalar
  # makes the `['<label>']` lookup fail.
  df_new[column] = df_new[column].apply(lambda value: lambda_dup({'<label>': value}))
  return df_new

In [None]:
# function for classification or regression
# NOTE: the name `preprocessing` is also imported from sklearn at the top of
# the notebook; once this cell has run, the name refers to this function.

def preprocessing(df_new,task):
  """Split ``df_new`` into standardized train/test features and targets.

  Parameters
  ----------
  df_new : DataFrame containing the target column 'lambda_dup_num' and the
      feature columns (StandardScaler is applied to all of them).
  task : 'Regression' or 'Classification'. Both currently use
      'lambda_dup_num' as the target.

  Returns
  -------
  X_train, X_test : scaled feature DataFrames that keep the row index of the
      split they came from, so they stay aligned with Y_train / Y_test.
  Y_train, Y_test : target Series.

  Raises
  ------
  ValueError : if ``task`` is not one of the two supported values.
  """
  if task not in ('Regression', 'Classification'):
    raise ValueError("task must be 'Regression' or 'Classification', got %r" % (task,))

  df_new=df_new.copy()

  # Both tasks read the same target column.
  Y=df_new['lambda_dup_num'] # label/target

  # Drop the target from the features. 'AcresBurned' is dropped as well when it
  # exists; errors='ignore' keeps the function usable on datasets without it.
  # NOTE(review): 'lambda_dup_num' is derived from '<label>' earlier in the
  # notebook - confirm whether that source column should be dropped here too.
  X=df_new.drop(['lambda_dup_num','AcresBurned'],axis=1,errors='ignore')

  # train-test set splitting (65% train, shuffled with a fixed seed)
  X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=0.65,shuffle=True,random_state=1)

  # fit the standard scaler on the training set only, then transform both
  # splits with the statistics learned from the training data
  scaler=StandardScaler()
  scaler.fit(X_train)
  # Rebuild DataFrames with the original row index; without `index=` the scaled
  # frames get a fresh 0..n-1 index that no longer matches Y_train / Y_test.
  X_train=pd.DataFrame(scaler.transform(X_train),columns=X.columns,index=X_train.index)
  X_test=pd.DataFrame(scaler.transform(X_test),columns=X.columns,index=X_test.index)
  return X_train,X_test,Y_train,Y_test

In [None]:
# before passing data to model create test and train data points

# Use the prepared working frame `df_new`; the frame name used here before is
# not defined anywhere in this notebook.
X_train, X_test, Y_train, Y_test = preprocessing(df_new, task='Classification')
X_train.head(1)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

In [None]:
# checking values: number of rows in each lambda_dup_num class
df_new["lambda_dup_num"].value_counts()

In [None]:
# saving dataframe to new variable (a copy of the current df_new)
dfc = df_new.copy()
#print(dfc)

## Classification Code (boilerplate)

In [None]:
# Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score
# cross_validate is part of sklearn.model_selection; importing it from
# sklearn.metrics raises ImportError and stops the cell before any training.
from sklearn.model_selection import cross_validate

# Initialize the Logistic Regression model
log_reg_model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Train the model
log_reg_model.fit(X_train, Y_train)

# Make predictions
Y_pred_log_reg = log_reg_model.predict(X_test)

# Perform 10-fold cross-validation on the training split
cv_results = cross_validate(log_reg_model, X_train, Y_train, cv=10)
print("Logistic Regression Cross-Validation Results:", cv_results)

# Print classification report (classes 0-3 are reported under the given names)
print(classification_report(Y_test, Y_pred_log_reg, labels=[0, 1, 2, 3], target_names=["no_fire", "low_fire", "moderate_fire", "high_fire"]))

# Print balanced accuracy score
print("Logistic Regression Balanced Accuracy Score:", balanced_accuracy_score(Y_test, Y_pred_log_reg))

print("Test run 1")


In [None]:
# Machine Learning Algorithm  => Support Vector Machine Classifiers

# The estimator is stored as `svm_model`, matching the other *_model variables.
# Binding it to `svm` would overwrite the `svm` module imported from sklearn at
# the top of the notebook.
svm_model = SVC(C=0.2, kernel='linear', gamma='auto', class_weight='balanced', max_iter=1000)
svm_model.fit(X_train, Y_train)
Y_pred = svm_model.predict(X_test)
# 10-fold cross-validation on the training split
print(cross_validate(svm_model, X_train, Y_train, cv=10))
print(classification_report(Y_test, Y_pred, labels=[0,1,2,3], target_names=["no_fire", "low_fire", "moderate_fire","high_fire"]))
print(metrics.balanced_accuracy_score(Y_test, Y_pred))
print("Test run 1")
# Keep this message in sync with the C / max_iter values passed to SVC above.
print("The C value is 0.2 and max_iter is 1000")

In [None]:
# Machine Learning Algorithm  => Naive Bayes model Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, balanced_accuracy_score
# cross_validate is part of sklearn.model_selection; importing it from
# sklearn.metrics raises ImportError and stops the cell before any training.
from sklearn.model_selection import cross_validate

# Initialize the Naive Bayes model
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, Y_train)

# Make predictions
Y_pred_nb = nb_model.predict(X_test)

# Perform 10-fold cross-validation on the training split
cv_results = cross_validate(nb_model, X_train, Y_train, cv=10)
print("Naive Bayes Cross-Validation Results:", cv_results)

# Print classification report (classes 0-3 are reported under the given names)
print(classification_report(Y_test, Y_pred_nb, labels=[0, 1, 2, 3], target_names=["no_fire", "low_fire", "moderate_fire", "high_fire"]))

# Print balanced accuracy score
print("Naive Bayes Balanced Accuracy Score:", balanced_accuracy_score(Y_test, Y_pred_nb))

print("Test run 1")


In [None]:
# Machine Learning Algorithm  => Random Forest model Classifiers

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, balanced_accuracy_score
# cross_validate is part of sklearn.model_selection; importing it from
# sklearn.metrics raises ImportError and stops the cell before any training.
from sklearn.model_selection import cross_validate

# Initialize the Random Forest model
# (no random_state is set, so results can differ from run to run)
rf_model = RandomForestClassifier()

# Train the model
rf_model.fit(X_train, Y_train)

# Make predictions
Y_pred_rf = rf_model.predict(X_test)

# Perform 10-fold cross-validation on the training split
cv_results = cross_validate(rf_model, X_train, Y_train, cv=10)
print("Random Forest Cross-Validation Results:", cv_results)

# Print classification report (classes 0-3 are reported under the given names)
print(classification_report(Y_test, Y_pred_rf, labels=[0, 1, 2, 3], target_names=["no_fire", "low_fire", "moderate_fire", "high_fire"]))

# Print balanced accuracy score
print("Random Forest Balanced Accuracy Score:", balanced_accuracy_score(Y_test, Y_pred_rf))

print("Test run 1")


In [None]:
# Machine Learning Algorithm  => Multi Layer Perceptron (MLP) Classifier

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, balanced_accuracy_score
# cross_validate is part of sklearn.model_selection; importing it from
# sklearn.metrics raises ImportError and stops the cell before any training.
from sklearn.model_selection import cross_validate

# Initialize the MLP model
# (no random_state is set, so results can differ from run to run)
mlp_model = MLPClassifier(max_iter=1000)

# Train the model
mlp_model.fit(X_train, Y_train)

# Make predictions
Y_pred_mlp = mlp_model.predict(X_test)

# Perform 10-fold cross-validation on the training split
cv_results = cross_validate(mlp_model, X_train, Y_train, cv=10)
print("MLP Cross-Validation Results:", cv_results)

# Print classification report (classes 0-3 are reported under the given names)
print(classification_report(Y_test, Y_pred_mlp, labels=[0, 1, 2, 3], target_names=["no_fire", "low_fire", "moderate_fire", "high_fire"]))

# Print balanced accuracy score
print("MLP Balanced Accuracy Score:", balanced_accuracy_score(Y_test, Y_pred_mlp))

print("Test run 1")

## Regression code (boilerplate)

In [None]:
# read data and perform train-test split
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the dataset - swap 'your_dataset.csv' for the path to your own file
data = pd.read_csv('your_dataset.csv')

# Separate the target (y) from the feature columns (X);
# swap 'target_column' for the name of your target variable
y = data['target_column']
X = data.drop('target_column', axis=1)

# Hold out 20% of the rows for testing, using a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Best alpha for Ridge Regression

# Ridge only needs to be imported once (the import was duplicated before).
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge = Ridge()   # estimator whose hyperparameter (alpha) is tuned
# Candidate regularisation strengths
param_grid_ridge = {'alpha': [0.1, 1, 10]}
# 5-fold grid search; the scorer is the negated MSE, so higher is better
ridge_cv = GridSearchCV(estimator=ridge, param_grid=param_grid_ridge, scoring='neg_mean_squared_error', cv=5)
ridge_cv.fit(X_train, y_train)

print("Best alpha for Ridge Regression:", ridge_cv.best_params_)

#### Linear Regression

In [None]:
# Linear Regression Model

# Create the estimator and fit it on the training split (fit returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predict the targets of the held-out test split
y_pred = model.predict(X_test)

# Test-set metrics: MSE, its square root (RMSE), MAE and R²
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("R² Score:", r2)

#### Support Vector Regression (SVR)

In [None]:
# Best C for Support Vector Regression (SVR) model

# Import the names this cell needs: without the SVR import the cell raises
# NameError on a top-to-bottom run, because SVR is only imported by a later cell.
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

svr = SVR(kernel='linear')   # estimator whose hyperparameter (C) is tuned
# Candidate values for the regularisation parameter C
param_grid_svr = {'C': [0.1, 1, 10]}
# 5-fold grid search; the scorer is the negated MSE, so higher is better
svr_cv = GridSearchCV(estimator=svr, param_grid=param_grid_svr, scoring='neg_mean_squared_error', cv=5)
svr_cv.fit(X_train, y_train)

print("Best C for SVR:", svr_cv.best_params_)

In [None]:
# Support Vector Regression (SVR) model

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.svm import SVR

# Create an SVR with its default settings and fit it on the training split
svr_model = SVR().fit(X_train, y_train)

# Predict the targets of the held-out test split
y_pred_svr = svr_model.predict(X_test)

# Test-set metrics: MSE, its square root (RMSE), MAE and R²
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print("SVR - Mean Squared Error:", mse_svr)
print("SVR - Root Mean Squared Error:", rmse_svr)
print("SVR - Mean Absolute Error:", mae_svr)
print("SVR - R² Score:", r2_svr)

#### Random Forest Regression

In [None]:
# Random Forest Regression

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Create a Random Forest regressor with default settings and fit it on the
# training split (this rebinds `rf_model`, used earlier for the classifier)
rf_model = RandomForestRegressor().fit(X_train, y_train)

# Predict the targets of the held-out test split
y_pred_rf = rf_model.predict(X_test)

# Test-set metrics: MSE, its square root (RMSE), MAE and R²
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest - Mean Squared Error:", mse_rf)
print("Random Forest - Root Mean Squared Error:", rmse_rf)
print("Random Forest - Mean Absolute Error:", mae_rf)
print("Random Forest - R² Score:", r2_rf)

#### Multi Layer Perceptron (MLP) Regression

In [None]:
# Multi Layer Perceptron Regression
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor

# Create an MLP regressor capped at 500 training iterations and fit it on the
# training split (this rebinds `mlp_model`, used earlier for the classifier)
mlp_model = MLPRegressor(max_iter=500).fit(X_train, y_train)

# Predict the targets of the held-out test split
y_pred_mlp = mlp_model.predict(X_test)

# Test-set metrics: MSE, its square root (RMSE), MAE and R²
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
rmse_mlp = np.sqrt(mse_mlp)
mae_mlp = mean_absolute_error(y_test, y_pred_mlp)
r2_mlp = r2_score(y_test, y_pred_mlp)

print("MLP - Mean Squared Error:", mse_mlp)
print("MLP - Root Mean Squared Error:", rmse_mlp)
print("MLP - Mean Absolute Error:", mae_mlp)
print("MLP - R² Score:", r2_mlp)

In [None]:
# Multi Layer Perceptron (MLP) Regressor with hyper parameter tuning
# Relies on MLPRegressor, GridSearchCV and mean_squared_error having been
# imported by earlier cells of this notebook.


mlp = MLPRegressor()

# Search space: 3 layer layouts x 2 activations x 3 alpha values x 2
# learning-rate schedules = 36 combinations, each evaluated with 5-fold CV.
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}


# The scorer is the negated MSE, so higher is better.
mlp_cv = GridSearchCV(estimator=mlp, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
mlp_cv.fit(X_train, y_train)


# NOTE: best_params, best_model, y_pred and mse are generic names that the
# following grid-search cells overwrite.
best_params = mlp_cv.best_params_
best_model = mlp_cv.best_estimator_
print("Best parameters for MLPRegressor:", best_params)


# Prediction on the held-out test split with the best estimator found

y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MLPRegressor MSE on test set:", mse)

#### K Nearest Neighbor Regressor

In [None]:
# Grid Search for K Nearest Neighbor Regressor

# KNeighborsRegressor is not imported anywhere else in the notebook (only the
# classifier variant is), so the cell raised NameError without this import.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

knn = KNeighborsRegressor()

# Search space: 3 neighbour counts x 2 weightings x 2 distance metrics
# = 12 combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}


# The scorer is the negated MSE, so higher is better.
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
knn_cv.fit(X_train, y_train)

best_params = knn_cv.best_params_
best_model = knn_cv.best_estimator_
print("Best parameters for KNeighborsRegressor:", best_params)

# Prediction on the held-out test split with the best estimator found
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("KNeighborsRegressor MSE on test set:", mse)

#### XGBoost

In [None]:
# XGBoost with hyper parameter tuning

# The cell refers to `xgb`, which is not imported anywhere in the notebook;
# without this import it raises NameError.
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

xgb_reg = xgb.XGBRegressor()

# Search space: 3 values for each of 5 parameters = 243 combinations. With
# cv=5 that is 1215 model fits, so expect this cell to run for a long time.
param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'alpha': [0, 0.1, 1],
    'lambda': [0, 0.1, 1]
}


# The scorer is the negated MSE, so higher is better.
xgb_cv = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
xgb_cv.fit(X_train, y_train)

best_params = xgb_cv.best_params_
best_model = xgb_cv.best_estimator_
print("Best parameters for XGBoost:", best_params)


# Prediction on the held-out test split with the best estimator found
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("XGBoost MSE on test set:", mse)


#### Decision Tree Regressor

In [1]:
# Decision Tree Regressor
# Uses X_train / X_test / y_train / y_test and the metric functions set up by
# the earlier regression cells, so those cells have to be run first.

from sklearn.tree import DecisionTreeRegressor

# Fit a tree with a fixed seed on the training split, then predict the test split
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred = tree_model.predict(X_test)

# Test-set metrics (RMSE is the square root of the MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2) Score:", r2)

NameError: name 'X_train' is not defined