# ML_Algorithms_on_refined_DataFrame

#### Most ML algorithms fall into one of two categories: classification or regression.
#### Choosing an algorithm and finding its best parameters by hand is difficult.
#### This notebook therefore runs each algorithm over a grid of candidate parameters to find the best algorithm and its parameters.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [16]:
from datetime import datetime
import os
import pickle

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.metrics import classification_report


from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

In [3]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data/cleaned_data.csv")

In [4]:
# `price_range` is the target column; every other column is used as a feature.
y = df.loc[:, 'price_range']
X = df.drop(labels='price_range', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## You can add more algorithms to the lists below as needed
## You can experiment with the tuning parameters to get the best results

In [17]:
class ModelTrainer:
    """Grid-search a fixed set of scikit-learn models and tabulate their scores.

    Each ``test_*_algorithms`` method fits one ``GridSearchCV`` per requested
    algorithm on the training data, scores the refitted best estimator on the
    training and test data, writes the score table to a CSV file inside
    ``output_dir`` and returns it as a DataFrame.

    Parameters
    ----------
    output_dir : str, default 'Trained_models'
        Folder that receives pickled models and result CSVs. A relative path
        is resolved against the current working directory when a file is
        written; the folder is created on demand.
    random_state : int or None, default 42
        Seed passed to the cross-validation splitter and to the tree-based
        estimators so that repeated runs give the same results.
    """

    # Colon-free on purpose: ':' is not allowed in Windows file names.
    TIMESTAMP_FORMAT = '%d-%m-%Y_%H-%M'

    def __init__(self, output_dir='Trained_models', random_state=42):
        self.output_dir = output_dir
        self.random_state = random_state

    def _output_path(self, filename):
        """Return the absolute path of ``filename`` inside the output folder, creating the folder if needed."""
        folder_path = os.path.abspath(self.output_dir)
        os.makedirs(folder_path, exist_ok=True)
        return os.path.join(folder_path, filename)

    def _timestamp(self):
        """Return the current local time formatted with ``TIMESTAMP_FORMAT``."""
        return datetime.now().strftime(self.TIMESTAMP_FORMAT)

    def save_best_model(self, model, filename):
        """Pickle ``model`` to ``<output_dir>/<filename>_<timestamp>.pkl``.

        Parameters
        ----------
        model : object
            Any picklable object (typically a fitted estimator).
        filename : str
            File name stem; a timestamp and the ``.pkl`` suffix are appended.

        Returns
        -------
        str
            Absolute path of the file that was written.
        """
        full_path = self._output_path(f'{filename}_{self._timestamp()}.pkl')

        with open(full_path, 'wb') as model_file:
            pickle.dump(model, model_file)

        return full_path

    def test_classification_algorithms(self, X_train, y_train, X_test, y_test, classifiers_to_run=None):
        """Grid-search the requested classifiers and report their accuracy.

        Parameters
        ----------
        X_train, y_train : array-like
            Training features and labels used for the cross-validated search.
        X_test, y_test : array-like
            Held-out features and labels used for the reported test scores.
        classifiers_to_run : list of str, optional
            Names to evaluate, chosen from 'DecisionTree', 'RandomForest',
            'SVM', 'KNN', 'LogisticRegression' and 'NaiveBayes'. Names that
            are not in that set are ignored. Defaults to all six.

        Returns
        -------
        pandas.DataFrame
            One row per evaluated classifier with the columns 'Algorithm',
            'Parameters', 'Train_Accuracy', 'Test_Accuracy', 'Test_F1_Score'.

        Side effects
        ------------
        Writes ``results.csv`` to the output folder and, when at least one
        classifier was evaluated, pickles the estimator with the highest
        mean cross-validated accuracy via ``save_best_model``.
        """
        if classifiers_to_run is None:
            classifiers_to_run = ['RandomForest', 'SVM', 'KNN', 'LogisticRegression', 'DecisionTree', 'NaiveBayes']

        result_columns = ['Algorithm', 'Parameters', 'Train_Accuracy', 'Test_Accuracy', 'Test_F1_Score']

        # Set up cross-validation
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)

        # Define classification algorithms and their parameter grids
        classifiers = {
            'DecisionTree': (DecisionTreeClassifier(random_state=self.random_state), {'max_depth': [None, 5, 10]}),
            'RandomForest': (RandomForestClassifier(random_state=self.random_state), {'n_estimators': [10, 50, 100]}),
            'SVM': (SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
            'KNN': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
            # The default iteration cap often stops before convergence on unscaled features.
            'LogisticRegression': (LogisticRegression(max_iter=1000), {'C': [0.1, 1, 10]}),
            'NaiveBayes': (GaussianNB(), {})
        }

        records = []
        best_model = None
        best_cv_score = float('-inf')

        # Iterate through classifiers, perform grid search, and store results
        for name, (classifier, param_grid) in classifiers.items():
            if name not in classifiers_to_run:
                continue

            grid_search = GridSearchCV(classifier, param_grid, cv=cv, scoring='accuracy')
            grid_search.fit(X_train, y_train)
            fitted_model = grid_search.best_estimator_

            # Evaluate the refitted best estimator on the test set
            y_pred = fitted_model.predict(X_test)

            records.append({
                'Algorithm': name,
                'Parameters': grid_search.best_params_,
                'Train_Accuracy': accuracy_score(y_train, fitted_model.predict(X_train)),
                'Test_Accuracy': accuracy_score(y_test, y_pred),
                'Test_F1_Score': f1_score(y_test, y_pred, average='weighted'),
            })

            # Pick the winner on cross-validated accuracy so that the test
            # set is not used for model selection.
            if grid_search.best_score_ > best_cv_score:
                best_cv_score = grid_search.best_score_
                best_model = fitted_model

        # Build the table once instead of concatenating inside the loop.
        results_df = pd.DataFrame(records, columns=result_columns)
        results_df.to_csv(self._output_path('results.csv'), index=False)

        # Nothing to persist when none of the requested names matched.
        if best_model is not None:
            self.save_best_model(best_model, 'best_model')
        return results_df

    def test_regression_algorithms(self, X_train, y_train, X_test, y_test, regressors_to_run=None):
        """Grid-search the requested regressors and report their error scores.

        Parameters
        ----------
        X_train, y_train : array-like
            Training features and targets used for the cross-validated search.
        X_test, y_test : array-like
            Held-out features and targets used for the reported test scores.
        regressors_to_run : list of str, optional
            Names to evaluate, chosen from 'RandomForest', 'SVR', 'KNN',
            'LinearRegression' and 'DecisionTree'. Names that are not in that
            set are ignored. Defaults to all five.

        Returns
        -------
        pandas.DataFrame
            One row per evaluated regressor with the columns 'Algorithm',
            'Parameters', 'Train_R2_Score', 'Test_MSE', 'Test_R2_Score'.

        Side effects
        ------------
        Writes ``results_<timestamp>.csv`` to the output folder.
        """
        if regressors_to_run is None:
            regressors_to_run = ['RandomForest', 'SVR', 'KNN', 'LinearRegression', 'DecisionTree']

        result_columns = ['Algorithm', 'Parameters', 'Train_R2_Score', 'Test_MSE', 'Test_R2_Score']

        # Set up cross-validation
        cv = 5  # Use 5-fold cross-validation for regression

        # Define regression algorithms and their parameter grids
        regressors = {
            'RandomForest': (RandomForestRegressor(random_state=self.random_state), {'n_estimators': [10, 50, 100]}),
            'SVR': (SVR(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'epsilon': [0.1, 0.2, 0.3]}),
            'KNN': (KNeighborsRegressor(), {'n_neighbors': [3, 5, 7]}),
            'LinearRegression': (LinearRegression(), {}),
            'DecisionTree': (DecisionTreeRegressor(random_state=self.random_state), {'max_depth': [None, 5, 10]})
        }

        records = []

        # Iterate through regressors, perform grid search, and store results
        for name, (regressor, param_grid) in regressors.items():
            if name not in regressors_to_run:
                continue

            grid_search = GridSearchCV(regressor, param_grid, cv=cv, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            fitted_model = grid_search.best_estimator_

            # Evaluate the refitted best estimator on the test set
            y_pred = fitted_model.predict(X_test)

            records.append({
                'Algorithm': name,
                'Parameters': grid_search.best_params_,
                'Train_R2_Score': r2_score(y_train, fitted_model.predict(X_train)),
                'Test_MSE': mean_squared_error(y_test, y_pred),
                'Test_R2_Score': r2_score(y_test, y_pred),
            })

        # Build the table once instead of concatenating inside the loop.
        results_df = pd.DataFrame(records, columns=result_columns)
        results_df.to_csv(self._output_path(f'results_{self._timestamp()}.csv'), index=False)
        return results_df
    


In [18]:
# Example usage: evaluate a hand-picked subset of the available classifiers.
model_trainer = ModelTrainer()

# Classification
my_classifiers = ['RandomForest', 'SVM', 'DecisionTree']
classification_results = model_trainer.test_classification_algorithms(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    classifiers_to_run=my_classifiers,
)

In [19]:
# Display the score table returned by test_classification_algorithms (one row per classifier).
classification_results

Unnamed: 0,Algorithm,Parameters,Train_Accuracy,Test_Accuracy,Test_F1_Score
0,DecisionTree,{'max_depth': 10},0.994318,0.825758,0.827494
1,RandomForest,{'n_estimators': 100},1.0,0.876263,0.876032
2,SVM,"{'C': 0.1, 'kernel': 'linear'}",0.987374,0.994949,0.994948
