# Multinomial Naive Bayes Prediction
This notebook uses a pre-trained model for predicting 8 subreddit class

## Import Libraries and Functions
- Import libraries
- Import stopwords
- Set pandas options and settings

In [8]:
# * Import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from pandas import DataFrame
import pandas as pd
import pathlib
import joblib
import re

cleaning_regex = re.compile(r"[^\w\s]")
current_path = str(pathlib.Path().resolve())

model_dir = f'{current_path}\\models\\scratch\\'
model_sklearn_dir = f'{current_path}\\models\\sklearn\\model\\'
tfidf_dir = f'{current_path}\\models\\sklearn\\tfidf\\'

# * Import stopwords
print(f'Loading English and Tagalog stopwords...')
eng_file_path = f'{current_path}\\stopwords\\english_stopwords.txt'
tag_file_path = f'{current_path}\\stopwords\\tagalog_stopwords.txt'
eng_stop = set()
tag_stop = set()

with open(eng_file_path, 'r') as f:
    for i, line in enumerate(f):
        eng_stop.add(line.replace('\n', ''))

with open(tag_file_path, 'r') as f:
    for i, line in enumerate(f):
        tag_stop.add(line.replace('\n', ''))

# * Function implementation for predicting the class given the text
def predict(token: str, classes: list[str], likelihoods_df: DataFrame) -> str:
    """ Returns the predicted class """
    max_likelihood = {}

    # Maximize the likelihood for each class
    for c in classes:
        max_likelihood[c] = 1

        # Calculate the maximum likelihood estimate for a class
        for i in token.split(' '):
            if i in likelihoods_df.index:
                # Sum the logarithm of probabilities (log-likelihood)
                max_likelihood[c] += likelihoods_df.loc[i][c]
    
    # Yield the class that has the highest log-probability score
    prediction = max(max_likelihood, key=max_likelihood.get)
    return prediction

# Prediction function using the loaded model and vectorizer
def predict_sklearn(token: str, tfidf: TfidfVectorizer, model: MultinomialNB):
    # Transform the input text using the loaded TF-IDF vectorizer
    text_transformed = tfidf.transform([token])
    # Make a prediction
    prediction = model.predict(text_transformed)
    # Return the predicted subreddit
    return prediction[0]

classes = [
    'DeepThoughts', 
    'CryptoCurrencies', 
    'askphilosophy', 
    'computerscience',
    'LawSchool', 
    'Wallstreetbetsnew', 
    'PoliticalDiscussion', 
    'geopolitics'
]


Loading English and Tagalog stopwords...


## Import Pre-trained Model

### Pre-trained Model from Scratch
Multinomial Naive Bayes model trained on a corpus from 8 subreddit class

In [None]:
print('Importing model...')
model = pd.read_csv(f'{model_dir}mnb_model_scratch2.csv', index_col='term')
model.sort_values(by=['computerscience'], ascending=False)

#### Prediction

In [6]:
# Paste the body of text and/or title from a submission
text = """
I always bomb cold calls in contracts, I’m coming to class prepared. But if the question isn’t something directly related to the brief or questions she gives us to do for each case, I usually can’t come up with an adequate answer in time. And so far majority of the time when I get cold called, it usually is a question about something else I don’t have “prepared”

Has anyone had any experience with professors doing this? In the syllabus, it says they reserve the right to increase or lower a students grade based on “class preparation”
"""

# Clean the text
tokenize = lambda x: ' '.join([token for token in word_tokenize(x) if len(token) > 2 and token not in eng_stop and token not in tag_stop])
text = tokenize(re.sub(cleaning_regex, ' ', text.lower()))

predict(text, classes, model)

'LawSchool'

### Pre-trained Model using SKLearn
Multinomial Naive Bayes model trained on a corpus from 8 subreddit class

In [10]:
print('Importing model and TF-IDF...')
# Load the saved model
model = joblib.load(f'{model_sklearn_dir}mnb_model_sklearn2.pkl')

# Load the saved TF-IDF vectorizer
tfidf = joblib.load(f'{tfidf_dir}mnb_tfidf_sklearn2.pkl')

Importing model and TF-IDF...


#### Prediction

In [19]:
# Paste the body of text and/or title from a submission
text = """
Hi everyone i just got accepted into computer science and probably not changing it i do live in a third world country so there isnt that much interest in it so i think i have a good chance of becoming something so i have 3 questions what should i try to achieve in my 4 years of computer science to be at least somewhat above average and does computer science have physics or math?(My fav subjects) And is computer science generally hard?
"""

# Clean the text
tokenize = lambda x: ' '.join([token for token in word_tokenize(x) if len(token) > 2 and token not in eng_stop and token not in tag_stop])
tokens = tokenize(re.sub(cleaning_regex, ' ', text.lower()))

predict_sklearn(tokens, tfidf, model)

np.str_('computerscience')