In [6]:
import os

In [7]:
%pwd

'D:\\Desktop\\NLP\\Lab 1\\NLP-Tokenization-and-Language-Modeling'

In [8]:
# Move the working directory up one level.
# NOTE(review): the path is relative, so this cell is not idempotent — each
# re-run moves the working directory up one more level.
os.chdir("../")

In [9]:
%pwd

'D:\\Desktop\\NLP\\Lab 1'

In [13]:
import logging
import os
import random
import joblib
from dataclasses import dataclass
from pathlib import Path
from collections import Counter, defaultdict

# Configure root logging: INFO and above, with each record prefixed by its
# timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of filesystem locations used by the training step."""
    # Project root; trained model files are written beneath this directory.
    root_dir: Path
    # Training corpus that is read and tokenized to build the n-gram models.
    train_file: Path
    # Test corpus path; carried in the config but not read by the code shown here.
    test_file: Path

class ConfigurationManager:
    """Resolves the dataset locations used by the training step.

    The project root can be supplied by the caller so the notebook is not tied
    to a single machine.  When it is omitted, ``DEFAULT_ROOT_DIR`` is used, so
    ``ConfigurationManager()`` keeps resolving to the same paths as before.
    """

    # Machine-specific fallback location of the project checkout.
    DEFAULT_ROOT_DIR = Path("D:/Desktop/NLP/Lab 1/NLP-Tokenization-and-Language-Modeling")

    def __init__(self, root_dir=None):
        """Derive the train/test file paths from the project root.

        Args:
            root_dir: Project root as a ``str`` or ``Path``.  ``None`` selects
                ``DEFAULT_ROOT_DIR``.
        """
        self.root_dir = Path(root_dir) if root_dir is not None else self.DEFAULT_ROOT_DIR
        self.train_file = self.root_dir / "dataset/UnzippedAngular/train.txt"
        self.test_file = self.root_dir / "dataset/UnzippedAngular/test.txt"

    def get_data_transformation_config(self) -> "DataTransformationConfig":
        """Return the resolved paths packaged as a ``DataTransformationConfig``."""
        return DataTransformationConfig(
            root_dir=self.root_dir,
            train_file=self.train_file,
            test_file=self.test_file,
        )

# Language Model Functions
import spacy

# Load the spaCy "en_core_web_sm" pipeline once at module level;
# spacy_tokenize() below reuses this shared `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

def spacy_tokenize(text):
    """Tokenize ``text`` with the module-level spaCy pipeline ``nlp``.

    Returns:
        A list with the lower-cased text of every token, in document order,
        leaving out tokens that spaCy flags as punctuation or whitespace.
    """
    kept_tokens = []
    for tok in nlp(text):
        # Drop punctuation and whitespace tokens; keep everything else.
        if tok.is_punct or tok.is_space:
            continue
        kept_tokens.append(tok.text.lower())
    return kept_tokens

def create_unigram_model(tokens):
    """Count how often each token occurs.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    unigram_counts = Counter()
    unigram_counts.update(tokens)
    return unigram_counts

def create_bigram_model(tokens):
    """Count adjacent token pairs.

    Args:
        tokens: Any iterable of tokens (list, tuple, generator, ...).  It is
            consumed in a single pass, so no sliced copies are made and
            one-shot iterators are supported.

    Returns:
        A ``defaultdict(Counter)`` where ``model[prev][curr]`` is the number of
        times ``curr`` directly followed ``prev``.  Inputs with fewer than two
        tokens yield an empty model.  Because the result is a ``defaultdict``,
        looking up an unseen ``prev`` inserts an empty ``Counter`` for it.
    """
    bigram_model = defaultdict(Counter)
    token_iter = iter(tokens)
    try:
        prev_word = next(token_iter)
    except StopIteration:
        # No tokens at all: nothing to pair up.
        return bigram_model
    for curr_word in token_iter:
        bigram_model[prev_word][curr_word] += 1
        prev_word = curr_word
    return bigram_model

def save_model(model, file_path):
    """Serialize ``model`` to ``file_path`` with ``joblib.dump``.

    When ``file_path`` is a filesystem path (``str`` or ``os.PathLike``), any
    missing parent directories are created first, so saving into a folder that
    does not exist yet (for example a fresh ``Models/`` directory) succeeds
    instead of raising ``FileNotFoundError``.  Any other kind of target, such
    as an already-open file object, is handed to ``joblib.dump`` untouched.
    """
    if isinstance(file_path, (str, os.PathLike)):
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, file_path)


class DataTransformation:
    """Builds the unigram and bigram count models and writes them to disk.

    The supplied config provides ``train_file`` (the corpus to read) and
    ``root_dir`` (the directory under which the ``Models/`` files are saved).
    """

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def train_model(self):
        """Train both n-gram models from the training file and save them.

        The training file is read in full as UTF-8, lower-cased and tokenized
        with ``spacy_tokenize``.  The resulting models are stored on the
        instance as ``unigram_model`` and ``bigram_model`` and then written via
        ``save_model`` to ``Models/unigrammodel.pkl`` and
        ``Models/bigrammodel.pkl`` under ``config.root_dir``.
        """
        with open(self.config.train_file, 'r', encoding='utf-8') as corpus:
            raw_text = corpus.read()

        # Tokenization is delegated to the spaCy-based helper.
        corpus_tokens = spacy_tokenize(raw_text.lower())
        self.unigram_model = create_unigram_model(corpus_tokens)
        self.bigram_model = create_bigram_model(corpus_tokens)
        logging.info("Model training completed")

        # Persist each model in turn, logging after every successful write.
        artifacts = (
            (self.unigram_model, "Models/unigrammodel.pkl", "Unigram Model Saved"),
            (self.bigram_model, "Models/bigrammodel.pkl", "Bigram Model Saved"),
        )
        for trained_model, relative_path, done_message in artifacts:
            save_model(trained_model, self.config.root_dir / relative_path)
            logging.info(done_message)

def main():
    """Run the training pipeline end to end.

    Builds the configuration, constructs ``DataTransformation`` from it and
    trains the models on the configured training file.

    Raises:
        Exception: Any error from configuration or training is logged at
            ERROR level and then re-raised unchanged.
    """
    try:
        config_manager = ConfigurationManager()
        data_transformation_config = config_manager.get_data_transformation_config()
        data_transformation = DataTransformation(config=data_transformation_config)

        # Train the model using train.txt
        data_transformation.train_model()
    except Exception as e:
        logging.error("Error occurred: %s", e)
        # Bare ``raise`` re-raises the active exception without adding an
        # extra frame for this line to its traceback.
        raise

# Entry point: run the training pipeline when this code is executed directly.
if __name__ == "__main__":
    main()


2024-01-31 22:23:41,400 - INFO - Model training completed
2024-01-31 22:23:41,420 - INFO - Unigram Model Saved
2024-01-31 22:23:41,516 - INFO - Bigram Model Saved
