In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

In [None]:
# Read the CSV into a DataFrame; later cells use its 'text' and 'labels' columns.
df = pd.read_csv(filepath_or_buffer='bbc_text_cls.csv')

In [None]:
# Histogram of the 'labels' column, to eyeball how the classes are distributed.
sns.histplot(data=df['labels'])

In [None]:
# Inputs are taken from the 'text' column, targets from the 'labels' column.
X, y = df['text'], df['labels']

In [None]:
# Keep 70% of the samples for training; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=5,
)

In [None]:
class Models:
    """Compare tokenization strategies for a bag-of-words Naive Bayes classifier.

    Each experiment method (``MNB``, ``STW``, ``SIM``, ``STEM``, ``LEM``) builds
    a differently configured ``CountVectorizer``, fits it on the training
    documents, trains a ``MultinomialNB`` on the resulting counts and returns
    ``(train_score, test_score, vocabulary_size)``.

    Parameters
    ----------
    X_train, X_test
        Collections of documents passed to ``CountVectorizer.fit_transform``
        and ``CountVectorizer.transform`` respectively.
    y_train, y_test
        Targets used to fit and score the classifier; they must line up with
        ``X_train`` / ``X_test``.

    NOTE(review): ``STEM`` and ``LEM`` call ``word_tokenize``, ``nltk.pos_tag``
    and ``WordNetLemmatizer``, which need their NLTK data packages installed.
    No ``nltk.download`` call is visible in this notebook - confirm the data is
    available in the target environment.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def scorer(self, X_train, X_test):
        """Train a fresh ``MultinomialNB`` and score it on both splits.

        Parameters
        ----------
        X_train, X_test
            Already-vectorized feature matrices (not the raw documents stored
            on ``self``). They are scored against ``self.y_train`` and
            ``self.y_test``.

        Returns
        -------
        tuple
            ``(training_score, testing_score)`` as returned by
            ``MultinomialNB.score``.
        """
        model_mnb = MultinomialNB()
        model_mnb.fit(X_train, self.y_train)
        mnb_training_score = model_mnb.score(X_train, self.y_train)
        mnb_testing_score = model_mnb.score(X_test, self.y_test)
        return mnb_training_score, mnb_testing_score

    @staticmethod
    def get_wordnet_pos(treebank_tag):
        """Map a POS tag to a WordNet POS constant by its first letter.

        ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
        ``R`` -> ``wordnet.ADV``; ``N`` and every other tag (including the
        empty string) -> ``wordnet.NOUN``.

        Declared as a ``staticmethod`` so that it can be called both as
        ``Models.get_wordnet_pos(tag)`` and on an instance. Without the
        decorator an instance call would bind the instance to
        ``treebank_tag`` and fail with ``TypeError``.
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        # 'N...' tags and anything unrecognised are both treated as nouns.
        return wordnet.NOUN

    def _evaluate(self, vectorizer):
        """Fit ``vectorizer`` on the training text, then score a classifier.

        The vocabulary is learned from ``self.X_train`` only and re-used to
        transform ``self.X_test``.

        Returns
        -------
        tuple
            ``(train_score, test_score, vocabulary_size)``.
        """
        X_train = vectorizer.fit_transform(self.X_train)
        X_test = vectorizer.transform(self.X_test)
        train_score, test_score = self.scorer(X_train, X_test)
        return train_score, test_score, len(vectorizer.vocabulary_)

    # MULTINOMIAL NAIVE BAYES (baseline)
    def MNB(self):
        """Baseline: ``CountVectorizer`` with its default settings."""
        return self._evaluate(CountVectorizer())

    # STOPWORDS
    def STW(self):
        """Same as the baseline but with ``stop_words='english'``."""
        return self._evaluate(CountVectorizer(stop_words='english'))

    # SIMPLE TOKENIZER
    def simple_tokenizer(self, doc):
        """Split ``doc`` on runs of whitespace."""
        return doc.split()

    def SIM(self):
        """Tokenize with ``simple_tokenizer`` (plain whitespace split)."""
        # token_pattern is ignored once a tokenizer is supplied; passing None
        # avoids scikit-learn's "token_pattern will not be used" warning.
        vectorizer = CountVectorizer(tokenizer=self.simple_tokenizer, token_pattern=None)
        return self._evaluate(vectorizer)

    # STEMMING
    class StemTokenizer:
        """Callable tokenizer: ``word_tokenize`` followed by Porter stemming."""

        def __init__(self):
            self.porter = PorterStemmer()

        def __call__(self, doc):
            tokens = word_tokenize(doc)
            return [self.porter.stem(t) for t in tokens]

    def STEM(self):
        """Tokenize with ``StemTokenizer``."""
        # token_pattern=None: see SIM - silences the unused-parameter warning.
        vectorizer = CountVectorizer(tokenizer=self.StemTokenizer(), token_pattern=None)
        return self._evaluate(vectorizer)

    # LEMMATIZATION
    class LemmaTokenizer:
        """Callable tokenizer: ``word_tokenize``, POS-tag, then lemmatize."""

        def __init__(self):
            self.wnl = WordNetLemmatizer()

        def __call__(self, doc):
            tokens = word_tokenize(doc)
            words_and_tags = nltk.pos_tag(tokens)
            # Each tag is translated by get_wordnet_pos and passed to the
            # lemmatizer as the word's part of speech.
            return [
                self.wnl.lemmatize(word, pos=Models.get_wordnet_pos(tag))
                for word, tag in words_and_tags
            ]

    def LEM(self):
        """Tokenize with ``LemmaTokenizer``."""
        # token_pattern=None: see SIM - silences the unused-parameter warning.
        vectorizer = CountVectorizer(tokenizer=self.LemmaTokenizer(), token_pattern=None)
        return self._evaluate(vectorizer)
    
    

def run():
    """Run every ``Models`` experiment, print a summary table and plot it.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    The experiments are executed in the order MNB, STW, SIM, STEM, LEM; their
    ``(train_score, test_score, vocab_size)`` results are collected into a
    DataFrame that is printed, and the train/test scores are drawn as two
    lines over the experiment labels.
    """
    runner = Models(X_train, X_test, y_train, y_test)

    # (display label, experiment) pairs in execution / plotting order.
    experiments = [
        ('Multinomial Naives Bayes', runner.MNB),
        ('StopWords', runner.STW),
        ('Simple Tokenizer', runner.SIM),
        ('Stemming', runner.STEM),
        ('Lemminization', runner.LEM),
    ]
    labels = [name for name, _ in experiments]
    results = [experiment() for _, experiment in experiments]

    # Transpose the list of 3-tuples into one list per metric.
    train_scores, test_scores, vocab_sizes = (list(column) for column in zip(*results))

    df_model_performance = pd.DataFrame({
        'label': labels,
        'train_score': train_scores,
        'test_score': test_scores,
        'vocab_size': vocab_sizes,
    })
    print(df_model_performance)

    # One line per score column, drawn over the same x axis of labels.
    x_values = df_model_performance['label']
    line_specs = (
        ('train_score', 'Train Score', 'blue'),
        ('test_score', 'Test Score', 'orange'),
    )
    for column, legend_label, colour in line_specs:
        plt.plot(
            x_values,
            df_model_performance[column],
            marker='o',
            label=legend_label,
            linestyle='-',
            color=colour,
        )

    # Axis decoration, legend, then render.
    plt.xlabel('Models', fontweight='bold')
    plt.xticks(rotation=45)
    plt.ylabel('Score', fontweight='bold')
    plt.title('Train and Test Scores Across Models')
    plt.legend()
    plt.show()