In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_val_predict, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction import text
from sklearn.compose import ColumnTransformer
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [19]:
# Read both CSV parts, then stack them row-wise under a fresh 0..n-1 index.
df1, df2 = (
    pd.read_csv('../data/new_data_part1.csv'),
    pd.read_csv('../data/new_data_part2.csv'),
)
df = pd.concat((df1, df2), ignore_index=True)

In [20]:
# Sanity check: (rows, columns) of the concatenated frame.
df.shape

(10053, 8)

In [21]:
# Preview the first five rows to confirm which columns were loaded.
df.head()

Unnamed: 0,title,adj_num,verb_num,noun_num,adv_num,subreddit,selftext,word_count
0,UFC Fight Pass Streaming Quality,0,0,5,0,1,0,5
1,"Will UFC 284 sell 1,000,000 PPVs?",0,1,2,0,1,0,6
2,The ONE Championship team and CEO Chatri Sityo...,0,0,12,0,0,0,18
3,Yoel look tiny,1,1,1,0,1,0,3
4,Who is your favorite prospect going into 2023?,1,2,1,0,1,0,8


In [None]:
# Target is the `subreddit` column; every other column stays in the feature frame.
y = df['subreddit']
X = df.drop(columns=['subreddit'])

# Stratify on the target so both splits keep the same class proportions.
# The split size is left at train_test_split's default; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
# Custom stop words added on top of scikit-learn's English list.
# NOTE(review): the entries look like UFC-specific names and terms plus months,
# years/event numbers and filler words -- presumably removed so the models
# cannot key on giveaway tokens; confirm the intent with the author.
# A few entries repeat ('yan', 'jandiroba'); that is harmless because the list
# is folded into a set below.
my_words_list = [
    'ufc', 'dana', 'white', 'ultimate', 'u.f.c.', 'islam', 'makhachev', 'moreno', 'edwards', 'usman', 'ngannou', 'adesanya',
    'pantoja', 'kara', 'kai', 'oliveira', 'pereira', 'sterling', 'royval', 'nicolau', 'perez', 'albazi', 'schnell', 'omalley', 'yan',
    'dvalishvili', 'vera', 'sandhagen', 'font', 'cruz', 'holloway', 'volkanovski', 'figueiredo', 'deiveson', 'aljamain', 'rodriguez',
    'ortega', 'allen', 'emmett', 'chan', 'sung', 'jung', 'kattar', 'giga', 'chikadze', 'poirier', 'jones', 'elliott', 'dvorak', 'molina', 'mokaev',
    'ulanbekov', 'yanez', 'gutierrez', 'nurmagomedov', 'simon', 'munhoz', 'shore', 'topuria', 'evloev', 'mitchell', 'yusuff', 'iga', 'barboza',
    'caceres', 'burns', 'neal', 'luque', 'fiziev', 'gamrot', 'anjos', 'tsarukyan', 'turner', 'hooker', 'ismagulov', 'gaethje', 'magny', 'whittaker',
    'vettori', 'strickland', 'costa', 'hermansson', 'covington', 'muniz', 'imavov', 'bachowicz', 'rakic', 'cannonier', 'dolidze', 'brunson', 'oezdemir',
    'spann', 'walker', 'nunes', 'weili', 'shevchenko', 'pena', 'blaydes', 'tuivasa', 'aspinall', 'andrade', 'santos', 'daukaus', 'tybura', 'lewis', 'holm',
    'vieira', 'jandiroba', 'maia', 'grasso', 'chookagian', 'murphy', 'fiorot', 'lemos', 'namajunas', 'esparza', 'jandiroba', 'blanchfield', 'barber',
    'calvillo', 'ribas', 'viana', 'ducote', 'pinheiro', 'xiaonan', 'yan', 'abdurakhimov', 'spivac', 'shamil', 'ketlen', 'pennington', 'miesha', 'kunitskaya',
    'rosa', 'avila', 'lansberg', 'paddy', 'silva', 'cormier', 'diaz', 'miocic', 'lesnar', 'penn', 'liddell', 'pierre', 'rousey', 'khabib', 'conor', 'mcgregor',
    'frevola', 'dillashaw', 'pimblett', 'helwani', 'blachowicz', 'arlovski', 'donatello', 'dec', 'december', 'jan', 'feb', 'selftext', 'says', 'did', 'does',
    'guy', 'guys', 'know', 'fc', 'vs', 'https', 'khamzat', '2022', '2023', '219', '281', '282', '283', '284', '285', 'going', 'man', 'got', 'anne', 'didnt',
    'ufc281', 'ankalaev', 'zhang', 'israel', 'johnson', 'dustin', 'krause', 'chandler', 'jiri', 'cejudo', 'march', 'february', 'gordon', 'ilia', 'florian',
    'makachov', 'beneil', 'dariush', 'jared', 'bryce', 'shavkat', 'november', 'saturday',
]

# Combined stop-word set: scikit-learn's built-in English words plus the custom ones.
stop_words_list = text.ENGLISH_STOP_WORDS | frozenset(my_words_list)

In [22]:
# Seed for the stochastic base learners (random forest, AdaBoost) so the scores
# printed by the model cells are reproducible after a kernel restart.
# Same value as the random_state passed to train_test_split.
RANDOM_STATE = 42

# Bag-of-words features built from the `title` column only. ColumnTransformer's
# default remainder='drop' discards every other column of X.
ct = ColumnTransformer([
    ('cvec', CountVectorizer(max_features=4000,   # cap on vocabulary size
                             max_df=0.1,          # ignore terms in more than 10% of titles
                             min_df=2,            # ignore terms seen in fewer than 2 titles
                             # scikit-learn >= 1.2 validates that stop_words is
                             # 'english', a list, or None; a set/frozenset raises
                             # InvalidParameterError, so convert to a list here.
                             stop_words=sorted(stop_words_list),
                             ngram_range=(1, 1)), 'title')
])

# Four candidate sets of level-1 (base) estimators for the stacking experiments.
# Each list builds its own estimator instances; nothing is shared between lists.

# Naive Bayes + random forest + AdaBoost
lvl1_est_1 = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('ada', AdaBoostClassifier(random_state=RANDOM_STATE))
]

# Naive Bayes + random forest + logistic regression
lvl1_est_2 = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('logr', LogisticRegression(max_iter=1000))
]

# Naive Bayes + logistic regression + AdaBoost
lvl1_est_3 = [
    ('nb', MultinomialNB()),
    ('logr', LogisticRegression(max_iter=1000)),
    ('ada', AdaBoostClassifier(random_state=RANDOM_STATE))
]

# Logistic regression + random forest + AdaBoost
lvl1_est_4 = [
    ('logr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('ada', AdaBoostClassifier(random_state=RANDOM_STATE))
]

In [259]:
# Stack 1: the lvl1_est_1 base learners (NB, random forest, AdaBoost) blended
# by a logistic-regression meta-learner.
stacked_1 = StackingClassifier(
    estimators=lvl1_est_1,
    final_estimator=LogisticRegression(),
    n_jobs=-1,
)
pipe01 = Pipeline(steps=[('ct', ct), ('s1', stacked_1)])

# Mean cross-validated score on the training split (cross_val_score defaults).
cv_scores_01 = cross_val_score(pipe01, X_train, y_train)
print(cv_scores_01.mean())

# Single fit on the full training split, then compare train vs. held-out score.
pipe01.fit(X_train, y_train)
train_score_01 = pipe01.score(X_train, y_train)
print(f'Training Score: {train_score_01}')
test_score_01 = pipe01.score(X_test, y_test)
print(f'Testing Score: {test_score_01}')

0.7210505703709831
Training Score: 0.901180527921475
Testing Score: 0.7235481304693715


In [23]:
# Stack 2: the lvl1_est_2 base learners (NB, random forest, logistic regression)
# blended by a logistic-regression meta-learner.
# Best stacked model so far by held-out test score in the outputs saved with
# this notebook; its saved cross-val mean is on par with the other stacks
# (stack 1's saved CV mean is marginally higher).
stacked_2 = StackingClassifier(
    estimators=lvl1_est_2,
    final_estimator=LogisticRegression(),
    n_jobs=-1,
)
pipe02 = Pipeline(steps=[('ct', ct), ('s1', stacked_2)])

# Mean cross-validated score on the training split (cross_val_score defaults).
cv_scores_02 = cross_val_score(pipe02, X_train, y_train)
print(cv_scores_02.mean())

# Single fit on the full training split, then compare train vs. held-out score.
pipe02.fit(X_train, y_train)
train_score_02 = pipe02.score(X_train, y_train)
print(f'Training Score: {train_score_02}')
test_score_02 = pipe02.score(X_test, y_test)
print(f'Testing Score: {test_score_02}')

0.7190624125434093
Training Score: 0.9121899456161294
Testing Score: 0.7287191726332538


In [261]:
# Stack 3: the lvl1_est_3 base learners (NB, logistic regression, AdaBoost)
# blended by a logistic-regression meta-learner.
stacked_3 = StackingClassifier(
    estimators=lvl1_est_3,
    final_estimator=LogisticRegression(),
    n_jobs=-1,
)
pipe03 = Pipeline(steps=[('ct', ct), ('s1', stacked_3)])

# Mean cross-validated score on the training split (cross_val_score defaults).
cv_scores_03 = cross_val_score(pipe03, X_train, y_train)
print(cv_scores_03.mean())

# Single fit on the full training split, then compare train vs. held-out score.
pipe03.fit(X_train, y_train)
train_score_03 = pipe03.score(X_train, y_train)
print(f'Training Score: {train_score_03}')
test_score_03 = pipe03.score(X_test, y_test)
print(f'Testing Score: {test_score_03}')

0.7197237823842404
Training Score: 0.8379095370738825
Testing Score: 0.7235481304693715


In [262]:
# Stack 4: the lvl1_est_4 base learners (logistic regression, random forest,
# AdaBoost) blended by a logistic-regression meta-learner.
stacked_4 = StackingClassifier(
    estimators=lvl1_est_4,
    final_estimator=LogisticRegression(),
    n_jobs=-1,
)
pipe04 = Pipeline(steps=[('ct', ct), ('s1', stacked_4)])

# Mean cross-validated score on the training split (cross_val_score defaults).
cv_scores_04 = cross_val_score(pipe04, X_train, y_train)
print(cv_scores_04.mean())

# Single fit on the full training split, then compare train vs. held-out score.
pipe04.fit(X_train, y_train)
train_score_04 = pipe04.score(X_train, y_train)
print(f'Training Score: {train_score_04}')
test_score_04 = pipe04.score(X_test, y_test)
print(f'Testing Score: {test_score_04}')

0.7144184785765455
Training Score: 0.9258522350444356
Testing Score: 0.7231503579952268


---

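Trying the best stacked model (the NB + random forest + logistic regression base estimators) with a different final estimator: k-nearest neighbors instead of logistic regression.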

In [263]:
# Stack 5: same base learners as stack 2 (lvl1_est_2), but the meta-learner is
# a k-nearest-neighbors classifier with its default settings.
stacked_5 = StackingClassifier(
    estimators=lvl1_est_2,
    final_estimator=KNeighborsClassifier(),
    n_jobs=-1,
)
pipe05 = Pipeline(steps=[('ct', ct), ('s1', stacked_5)])

# Mean cross-validated score on the training split (cross_val_score defaults).
cv_scores_05 = cross_val_score(pipe05, X_train, y_train)
print(cv_scores_05.mean())

# Single fit on the full training split, then compare train vs. held-out score.
pipe05.fit(X_train, y_train)
train_score_05 = pipe05.score(X_train, y_train)
print(f'Training Score: {train_score_05}')
test_score_05 = pipe05.score(X_test, y_test)
print(f'Testing Score: {test_score_05}')

0.6796662436481213
Training Score: 0.8263695450324977
Testing Score: 0.6992840095465394


In [264]:
# Stack 5 (variant): lvl1_est_2 base learners with a KNN meta-learner using
# 7 neighbors.
stacked_5_2 = StackingClassifier(
    estimators=lvl1_est_2,
    final_estimator=KNeighborsClassifier(n_neighbors=7),
    n_jobs=-1,
)
# NOTE: this rebinds `pipe05`, replacing whatever pipeline that name held before.
pipe05 = Pipeline(steps=[('ct', ct), ('s1', stacked_5_2)])

# Mean cross-validated score on the training split (cross_val_score defaults).
cv_scores_05_2 = cross_val_score(pipe05, X_train, y_train)
print(cv_scores_05_2.mean())

# Single fit on the full training split, then compare train vs. held-out score.
pipe05.fit(X_train, y_train)
train_score_05_2 = pipe05.score(X_train, y_train)
print(f'Training Score: {train_score_05_2}')
test_score_05_2 = pipe05.score(X_test, y_test)
print(f'Testing Score: {test_score_05_2}')

0.6908096434147277
Training Score: 0.8319405756731661
Testing Score: 0.6865552903739062


---

## Voting Classifier

In [10]:
# Hard-voting ensemble: each base model casts one vote and the majority class
# wins. 'hard' is VotingClassifier's default; it is spelled out for clarity.
vote = VotingClassifier([
    ('nb', MultinomialNB()),
    # Seeded so the printed scores are reproducible across kernel restarts
    # (same seed value as the train/test split).
    ('rf', RandomForestClassifier(n_jobs = -1, random_state = 42)),
    ('logr', LogisticRegression(max_iter=1000))
],
    voting = 'hard')

# Vectorize the titles with `ct`, then feed the counts to the ensemble.
pipe = Pipeline([
    ('ct', ct),
    ('vote', vote)
])

# Fit once on the training split and compare train vs. held-out score.
pipe.fit(X_train, y_train)
print(f'Training Score: {pipe.score(X_train, y_train)}')
print(f'Testing Score: {pipe.score(X_test, y_test)}')

Training Score: 0.8792943361188487
Testing Score: 0.7251392203659507


In [11]:
# Soft-voting ensemble: the predicted class probabilities of the base models
# are averaged and the class with the highest mean probability wins.
# NOTE: this rebinds `vote` and `pipe`, replacing whatever those names held before.
vote = VotingClassifier([
    ('nb', MultinomialNB()),
    # Seeded so the printed scores are reproducible across kernel restarts
    # (same seed value as the train/test split).
    ('rf', RandomForestClassifier(n_jobs = -1, random_state = 42)),
    ('logr', LogisticRegression(max_iter=1000))
],
    voting = 'soft')

# Vectorize the titles with `ct`, then feed the counts to the ensemble.
pipe = Pipeline([
    ('ct', ct),
    ('vote', vote)
])

# Fit once on the training split and compare train vs. held-out score.
pipe.fit(X_train, y_train)
print(f'Training Score: {pipe.score(X_train, y_train)}')
print(f'Testing Score: {pipe.score(X_test, y_test)}')

Training Score: 0.8942830614139806
Testing Score: 0.7267303102625299
