In [1]:
import os
%pwd

'/home/izam/coding/Sentiment-Analysis/research'

In [2]:
os.chdir("../")
%pwd

'/home/izam/coding/Sentiment-Analysis'

In [3]:
from dataclasses import dataclass
from pathlib import Path

# entity
@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    model_params: dict
    target_column: str
    vectorizer_path: str
    vocabulary_path: str

In [4]:
from sentimentAnalysis.constants import *
from sentimentAnalysis.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params
        schema =  self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            model_name = config.model_name,
            model_params = params.MultinomialNB,
            target_column = schema.name,
            vectorizer_path=config.vectorizer_path,
            vocabulary_path=config.vocabulary_path,
            
        )

        return model_trainer_config

In [6]:
import pandas as pd
import os
from sentimentAnalysis import logger
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
import joblib
from sentimentAnalysis.utils.common import load_pickle, save_pickle

In [7]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    
    def train(self):
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)


        X_train = train_data.drop([self.config.target_column], axis=1)
        X_test = test_data.drop([self.config.target_column], axis=1)
        y_train = train_data[self.config.target_column]
        y_test = test_data[self.config.target_column]
        print(self.config.vocabulary_path)
        vocabulary = load_pickle(path = Path(self.config.vocabulary_path))
        vectorizer = load_pickle(path =  Path(self.config.vectorizer_path))
        vectorizer.vocabulary_ = vocabulary

        X_train = vectorizer.transform(X_train['text'])

        logger.info(f"Transformed the X_train and X_test, new shape of X_train - {X_train.shape}")

        grid_search =  GridSearchCV(MultinomialNB(), self.config.model_params ,cv=5,return_train_score=True,n_jobs=-1)

        grid_search.fit(X_train,y_train)
        logger.info(f"found best mode at {grid_search.best_params_}")
        
        model = grid_search.best_estimator_

        save_pickle(path= Path(os.path.join(self.config.root_dir, self.config.model_name)), data=model)

In [8]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    model_trainer_config.train()
except Exception as e:
    raise e

[2023-12-09 19:17:03,216: INFO: common: yaml file: config/config.yaml loaded successfully]
[2023-12-09 19:17:03,219: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-09 19:17:03,220: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-12-09 19:17:03,221: INFO: common: created directory at: artifacts]
[2023-12-09 19:17:03,221: INFO: common: created directory at: artifacts/model_trainer]
artifacts/data_transformation/vocabulary.pkl
[2023-12-09 19:17:03,243: INFO: common: pickle file loaded from: artifacts/data_transformation/vocabulary.pkl]
[2023-12-09 19:17:03,252: INFO: common: pickle file loaded from: artifacts/data_transformation/tfidf_vectorizer.pkl]
[2023-12-09 19:17:03,370: INFO: 554699130: Transformed the X_train and X_test, new shape of X_train - (16200, 14295), X_test - (1800, 14295)]
[2023-12-09 19:17:05,572: INFO: 554699130: found best mode at {'fit_prior': False}]
[2023-12-09 19:17:05,575: INFO: common: pickle file saved at: artifacts/model_trai