In [1]:
import os
%pwd

'd:\\Machine_Learning\\Titanic_Pipeline_Project\\research'

In [2]:
os.chdir("../")
%pwd

'd:\\Machine_Learning\\Titanic_Pipeline_Project'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory for model-training artifacts; it is created before this
    # config is built and is where the trainer writes its model file.
    root_dir: Path
    # data_path: Path
    # Model location as read from the ``model_trainer`` config section.
    model_path: Path

In [4]:
from titanic.constants import *
from titanic.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_file_path=CONFIG_FILE_PATH,
        params_file_path=PARAMS_FILE_PATH,
    ):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_file_path: Path of the main configuration YAML file.
            params_file_path: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        # Make sure the top-level artifacts directory exists up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the model-trainer config from the ``model_trainer`` section.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            A ``ModelTrainerConfig`` holding the section's ``root_dir`` and
            ``model_path`` values.
        """
        trainer_section = self.config.model_trainer

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            # data_path=trainer_section.data_path,
            model_path=trainer_section.model_path,
        )


In [134]:
from sklearn.metrics import accuracy_score
from titanic.logging import logger
import pandas as pd

# from sklearn.preprocessing import train_test_split
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
# from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler
# from sklearn.tree import train_test_split

from sklearn.model_selection import train_test_split

from titanic.utils.common import evaluate_models, load_object, save_object

from titanic.config.configuration import ConfigurationManager as ConfMan
from titanic.components.data_transformation import DataTransformation 

from sklearn.metrics import accuracy_score

class ModelTrainer:
    """Trains a classifier on the transformed data and saves it to disk.

    The transformed train/test arrays are obtained by running the project's
    ``DataTransformation`` component with a freshly loaded configuration.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the trainer settings.

        Args:
            config: Model-trainer settings; ``root_dir`` is the directory the
                trained model is written to.
        """
        self.config = config

    def _build_data_transformation(self):
        """Create a ``DataTransformation`` from a freshly loaded configuration."""
        transformation_config = ConfMan().get_data_transformation_config()
        return DataTransformation(transformation_config)

    def fetch_preprocessor(self, preprocessor_file_path):
        """Load and return the preprocessor object stored at the given path.

        Args:
            preprocessor_file_path: Location of the saved preprocessor;
                converted to ``Path`` before loading.
        """
        return load_object(Path(preprocessor_file_path))

    def call_data_transformation(self):
        """Run the data-transformation component and save the transformed data."""
        self._build_data_transformation().save_transformed_data()

    def initiate_model_training(self):
        """Train the classifier, log its test accuracy and save the model.

        The last column of each transformed array is used as the target and
        the remaining columns as features.

        Returns:
            The accuracy of the fitted model on the test split.
        """
        # The third returned value (the preprocessor file path) is not needed here.
        train_data, test_data, _ = (
            self._build_data_transformation().initiate_data_transformation()
        )
        x_train, y_train = train_data[:, :-1], train_data[:, -1]
        x_test, y_test = test_data[:, :-1], test_data[:, -1]

        # Trailing numbers are the accuracies recorded for each candidate.
        model = KNeighborsClassifier(n_neighbors=30, p=1, leaf_size=75)  # 0.7877094972067039
        # model = AdaBoostClassifier() # 0.7821229050279329
        # model = CatBoostClassifier(learning_rate=0.0005, iterations=2000) # 0.7877094972067039

        model.fit(x_train, y_train)

        y_pred = model.predict(x_test)

        accuracy_of_model = accuracy_score(y_test, y_pred)
        logger.info(f"The accuracy of the model is: {accuracy_of_model}")

        # NOTE(review): self.config.model_path is not used; the file name is
        # fixed to "model.pkl" under root_dir — confirm this is intended.
        model_path = Path(self.config.root_dir) / "model.pkl"

        save_object(path=model_path, obj=model)
        logger.info(f"Model has been saved to: {model_path} with accuracy: {accuracy_of_model}")

        return accuracy_of_model

        # models = {
        #         "Random Forest": RandomForestClassifier(),
        #         "Decision Tree": DecisionTreeClassifier(),
        #         "Gradient Boosting": GradientBoostingClassifier(),
        #         "K-Neighbors Regressor": KNeighborsClassifier(),
        #         "CatBoosting Regressor": CatBoostClassifier(verbose=False),
        #         "AdaBoost Regressor": AdaBoostClassifier()
        #         # "XGBRegressor": XGBClassifier(),
        #         # "Linear Regression": LinearRegression(),
        #     }
        
        # Performing model selection
        # accuracy_dict = {}

        # for model_name, model in models.items():
        #     model.fit(x_train, y_train)
        #     logger.info(f"Model {model_name} has been trained successfully")
        #     y_pred = model.predict(x_test)
        #     accuracy = accuracy_score(y_test, y_pred)
        #     accuracy_dict[accuracy] = (model, model_name, accuracy)

        # best_model = accuracy_dict[max(accuracy_dict)][0]



# class ModelTrainer:
#     def __init__(self, config:ModelTrainerConfig):
#         self.config = config

#     def fetch_transformed_data(self):
#         df = pd.read_csv(self.config.data_path)
#         logger.info("Data has been fetched successfully")
#         return df
    
#     def scaling_data(self):
#         df = self.fetch_transformed_data()

#         x = df.drop(columns='Survived', axis=1)
#         y = df['Survived']

#         sd = StandardScaler()
#         x = sd.fit_transform(x)


#         logger.info("Data has been scaled successfully")
#         return x,y

#     def initiate_model_training(self):
#         x, y = self.scaling_data()

        # models = {
        #         "Random Forest": RandomForestClassifier(),
        #         "Decision Tree": DecisionTreeClassifier(),
        #         "Gradient Boosting": GradientBoostingClassifier(),
        #         # "K-Neighbors Regressor": KNeighborsClassifier(),
        #         "CatBoosting Regressor": CatBoostClassifier(verbose=False),
        #         "AdaBoost Regressor": AdaBoostClassifier()
        #         # "XGBRegressor": XGBClassifier(),
        #         # "Linear Regression": LinearRegression(),
        #     }
        
#         # For Hyper Parameter tuning 
#         params={
#                 "Decision Tree": {
#                     'criterion':['log_loss', 'entropy', 'gini'],
#                     'splitter':['best','random'],
#                     'max_features':['sqrt','log2'],
#                 },
#                 "Random Forest":{
#                     'criterion':['log_loss', 'entropy', 'gini'],
                 
#                     'max_features':['sqrt','log2',None],
#                     'n_estimators': [8,16,32,64,128,256]
#                 },
#                 "Gradient Boosting":{
#                     # 'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
#                     'learning_rate':[.1,.01,.05,.001],
#                     'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
#                     # 'criterion':['squared_error', 'friedman_mse'],
#                     # 'max_features':['auto','sqrt','log2'],
#                     'n_estimators': [8,16,32,64,128,256]
#                 },
#                 # "Linear Regression":{},
#                 # "XGBRegressor":{
#                 #     'learning_rate':[.1,.01,.05,.001],
#                 #     'n_estimators': [8,16,32,64,128,256]
#                 # },
#                  "CatBoosting Regressor":{
#                     'depth': [6,8,10],
#                     'learning_rate': [0.01, 0.05, 0.1],
#                     'iterations': [30, 50, 100]
#                 },
#                 "AdaBoost Regressor":{
#                     'learning_rate':[.1,.01,0.5,.001],
#                     # 'loss':['linear','square','exponential'],
#                     'n_estimators': [8,16,32,64,128,256]
#                 }
                
#             }
        
#         x_train,x_test, y_train,  y_test = train_test_split(x,y, test_size=0.2, random_state=42)

#         # To get best model score from dict
#         # model_report:dict = evaluate_models(x_train = x_train, y_train = y_train,x_test=x_test, y_test=y_test, models=models, param = params)
#         # best_model_score = max(sorted(model_report.values()))
#         # print(f"---------------------------------------------{best_model_score}--------------------------------")
#         # print(model_report)

        # accuracy_dict = {}
        # for model_name, model in models.items():
        #     model.fit(x, y)
        #     logger.info(f"Model {model_name} has been trained successfully")
        #     y_pred = model.predict(x_test)
        #     accuracy = accuracy_score(y_test, y_pred)
        #     accuracy_dict[accuracy] = (model, model_name, accuracy)
        
#         print(f"{accuracy_dict}")
        
#         # best_model = list(models.values())[accuracy_list.index(max(accuracy_list))]
#         # best_model = max(list(accuracy_dict.values()))
#         best_model = accuracy_dict[max(accuracy_dict)][0]
        
#         model_path= Path(os.path.join(self.config.root_dir,"model.pkl"))

#         save_object(path = model_path,obj =best_model)
#         logger.info(f"Model Name: {accuracy_dict[max(accuracy_dict)][1]} has been saved successfully with accuracy: {accuracy_dict[max(accuracy_dict)][2]}")

#         logger.info("Model Training Complete")

In [135]:
# Run the model-training stage end to end: load the configuration, build the
# trainer and train/save the model. Errors are left to propagate as-is so the
# notebook shows the full, unmodified traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.initiate_model_training()

[2024-01-09 19:45:24,448: INFO: common: yaml file config\config.yaml loaded successfully]
[2024-01-09 19:45:24,457: INFO: common: yaml file params.yaml loaded successfully]
[2024-01-09 19:45:24,458: INFO: common: created directory at: artifacts]
[2024-01-09 19:45:24,460: INFO: common: created directory at: artifacts/model_trainer]
[2024-01-09 19:45:24,465: INFO: common: yaml file config\config.yaml loaded successfully]
[2024-01-09 19:45:24,469: INFO: common: yaml file params.yaml loaded successfully]
[2024-01-09 19:45:24,473: INFO: common: created directory at: artifacts]
[2024-01-09 19:45:24,473: INFO: common: created directory at: artifacts/data_transformation]
[2024-01-09 19:45:24,473: INFO: data_transformation: Preprocessing pipeline]
[2024-01-09 19:45:24,481: INFO: data_transformation: Loaded 891 rows of data]
[2024-01-09 19:45:24,497: INFO: data_transformation: Preprocessor Saved]
[2024-01-09 19:45:24,505: INFO: 1163192567: The accuracy of the model is: 0.7877094972067039]
[2024-