In [1]:
import os
%pwd

'/home/izam/coding/Sentiment-Analysis/research'

In [2]:
os.chdir('../')
%pwd

'/home/izam/coding/Sentiment-Analysis'

In [3]:
from dataclasses import dataclass
from pathlib import Path

# enitity
@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    delimiter: str
    text_corpus_name: str
    target_col_encoded_file: str
    vectorizer_name: str
    vocabulary_name: str
    test_size: float

In [4]:
from sentimentAnalysis.constants import *
from sentimentAnalysis.utils.common import read_yaml, create_directories

In [5]:
# configuration
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            delimiter=self.schema.DELIMITER,
            text_corpus_name=config.text_corpus_name,
            target_col_encoded_file=config.target_col_encoded_file,
            vectorizer_name=config.vectorizer_name,
            vocabulary_name=config.vocabulary_name,
            test_size=config.test_size,
        )

        return data_transformation_config

In [6]:
import os
from sentimentAnalysis import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle as pkl

In [7]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def target_encode(self):
        df = pd.read_csv(self.config.data_path, delimiter=self.config.delimiter)

        df.replace(to_replace ="surprise", value =1, inplace=True)
        df.replace(to_replace ="love", value =1, inplace=True)
        df.replace(to_replace ="joy", value =1, inplace=True)
        df.replace(to_replace ="fear", value =0, inplace=True)
        df.replace(to_replace ="anger", value =0, inplace=True)
        df.replace(to_replace ="sadness", value =0, inplace=True)

        df.to_csv(os.path.join(self.config.root_dir, self.config.target_col_encoded_file),index=False)

        logger.info("encoded the target")
        logger.info("surprise, love, joy ---> 1")
        logger.info("fear, anger, sadness ---> 0")




    def clean_text_data(self):
        try:
            data = pd.read_csv(os.path.join(self.config.root_dir, self.confi.target_col_encoded_file))
            logger.info("using the target column encoded df for splitting")
        except Exception as e:
            data = pd.read_csv(self.config.data_path, delimiter=self.config.delimiter)
            logger.info(f"exception {e} found when access the target column encoded df")
            logger.info(f"using the {self.config.data_path} file for splitting")

        logger.info("cleaning corpus started")

        lm  = WordNetLemmatizer()
        corpus = []
        df_col = data['text']
        for i in df_col:
            new_item = re.sub('[^a-zA-Z]', ' ', str(i)) # taking only characters
            new_item = new_item.lower()   # lowering the text
            new_item = new_item.split()   # splitting the text into words
            # lemmatize with remove stop words
            new_item = [lm.lemmatize(word) for word in new_item if word not in set(stopwords.words('english'))]
            corpus.append(' '.join(str(x) for x in new_item)) # back to sentence

        with open(os.path.join(self.config.root_dir, self.config.text_corpus_name), 'w', newline='') as f:
            writer = csv.writer(f)
            for i in corpus:
                writer.writerow([i])
        
        logger.info(f"Saved text corpus {self.config.text_corpus_name}")




    def transform_text(self):
        # access the clean df
        try:
            corpus = pd.read_csv(os.path.join(self.config.root_dir, self.config.text_corpus_name), names=['text'], header=None)
            logger.info("using the cleaned text corpus for vectorizing")
        except Exception as e:
            corpus = pd.read_csv(self.config.data_path, delimiter=self.config.delimiter)
            corpus.drop(['label'], axis=1, inplace=True)
            logger.info(f"exception {e} found when access the cleaned text df")
            logger.info(f"using the {self.config.data_path} file for vectorizing")

        transformer = TfidfVectorizer()
        tfidf_matrix = transformer.fit_transform(corpus['text'])

        # Save the TF-IDF vectorizer to a file using pickle
        with open(os.path.join(self.config.root_dir, self.config.vectorizer_name), 'wb') as file:
            pkl.dump(transformer, file)

        # Save the vocabulary separately
        with open(os.path.join(self.config.root_dir, self.config.vocabulary_name), 'wb') as file:
            pkl.dump(transformer.vocabulary_, file)

        logger.info(f"saved the tfidf transformer with {len(transformer.vocabulary_)} words of vocabulary")




    def train_test_spliting(self):
        # access hte target col encoded df
        try:
            data = pd.read_csv(os.path.join(self.config.root_dir, self.config.target_col_encoded_file))
            logger.info(f"using the {self.config.target_col_encoded_file} for splitting")
        except Exception as e:
            data = pd.read_csv(self.config.data_path, delimiter=self.config.delimiter)
            logger.info(f"exception {e} found when access the target column encoded df")
            logger.info(f"using the {self.config.data_path} file for splitting")

        # access the clean df
        try:
            corpus = pd.read_csv(os.path.join(self.config.root_dir, self.config.text_corpus_name), names=['text'], header=None)
            logger.info("using the cleaned text corpus for splitting")
        except Exception as e:
            corpus = pd.read_csv(self.config.data_path, delimiter=self.config.delimiter)
            corpus.drop(['label'], axis=1, inplace=True)
            logger.info(f"exception {e} found when access the cleaned text df")
            logger.info(f"using the {self.config.data_path} file for splitting")

        # Split the data into training and test sets. (0.75, 0.25) split.
        X_train, X_test, y_train, y_test = train_test_split(corpus, data['label'], test_size=self.config.test_size)
        
        train = pd.concat([X_train, y_train], axis=1)
        test = pd.concat([X_test, y_test], axis=1)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"),index = False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"),index = False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

In [8]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.target_encode()
    data_transformation.clean_text_data()
    data_transformation.transform_text()
    data_transformation.train_test_spliting()
except Exception as e:
    raise e

[2023-12-09 15:19:06,290: INFO: common: yaml file: config/config.yaml loaded successfully]
[2023-12-09 15:19:06,291: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-09 15:19:06,293: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-12-09 15:19:06,293: INFO: common: created directory at: artifacts]
[2023-12-09 15:19:06,294: INFO: common: created directory at: artifacts/data_transformation]
[2023-12-09 15:19:06,365: INFO: 2794344611: encoded the target]
[2023-12-09 15:19:06,365: INFO: 2794344611: surprise, love, joy ---> 1]
[2023-12-09 15:19:06,366: INFO: 2794344611: fear, anger, sadness ---> 0]
[2023-12-09 15:19:06,388: INFO: 2794344611: exception 'DataTransformation' object has no attribute 'confi' found when access the target column encoded df]
[2023-12-09 15:19:06,389: INFO: 2794344611: using the artifacts/data_ingestion/sentiment_data/train.txt file for splitting]
[2023-12-09 15:19:06,389: INFO: 2794344611: cleaning corpus started]
[2023-12-09 15:19: