In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_val_predict, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction import text
from sklearn.compose import ColumnTransformer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [35]:
df = pd.read_csv('../data/ufc_mma_submissions.csv')

X = df['title']
y = df['subreddit']

In [28]:
my_words_list= (['ufc', 'dana', 'white', 'ultimate', 'u.f.c.', 'islam', 'makhachev', 'moreno', 'edwards', 'usman', 'ngannou', 'adesanya',
                  'pantoja', 'kara', 'kai', 'oliveira', 'pereira', 'sterling', 'royval', 'nicolau', 'perez', 'albazi', 'schnell', 'omalley', 'yan',
                  'dvalishvili', 'vera', 'sandhagen', 'font', 'cruz', 'holloway', 'volkanovski', 'figueiredo', 'deiveson', 'aljamain', 'rodriguez',
                  'ortega', 'allen', 'emmett', 'chan', 'sung', 'jung', 'kattar', 'giga', 'chikadze', 'poirier', 'jones', 'elliott', 'dvorak', 'molina', 'mokaev',
                  'ulanbekov', 'yanez', 'gutierrez', 'nurmagomedov', 'simon', 'munhoz', 'shore', 'topuria', 'evloev', 'mitchell', 'yusuff', 'iga', 'barboza',
                  'caceres', 'burns', 'neal', 'luque', 'fiziev', 'gamrot', 'anjos', 'tsarukyan', 'turner', 'hooker', 'ismagulov', 'gaethje', 'magny', 'whittaker',
                  'vettori', 'strickland', 'costa', 'hermansson', 'covington', 'muniz', 'imavov', 'bachowicz', 'rakic', 'cannonier', 'dolidze', 'brunson', 'oezdemir',
                  'spann', 'walker', 'nunes', 'weili', 'shevchenko', 'pena', 'blaydes', 'tuivasa', 'aspinall', 'andrade', 'santos', 'daukaus', 'tybura', 'lewis', 'holm',
                  'vieira', 'jandiroba', 'maia', 'grasso', 'chookagian', 'murphy', 'fiorot', 'lemos', 'namajunas', 'esparza', 'jandiroba', 'blanchfield', 'barber',
                  'calvillo', 'ribas', 'viana', 'ducote', 'pinheiro', 'xiaonan', 'yan', 'abdurakhimov', 'spivac', 'shamil', 'ketlen', 'pennington', 'miesha', 'kunitskaya',
                  'rosa', 'avila', 'lansberg', 'paddy', 'silva', 'cormier', 'diaz', 'miocic', 'lesnar', 'penn', 'liddell', 'pierre', 'rousey', 'khabib', 'conor', 'mcgregor',
                  'frevola', 'dillashaw', 'pimblett', 'helwani', 'blachowicz','arlovski', 'donatello', 'dec', 'december', 'jan', 'feb', 'selftext', 'says', 'did', 'does',
                  'guy', 'guys', 'know', 'fc', 'vs', 'https', 'khamzat', '2022', '2023', '219', '281', '282', '283', '284', '285', 'going', 'man', 'got', 'anne', 'didnt', 
                  'ufc281', 'ankalaev', 'zhang', 'israel', 'johnson', 'dustin', 'krause', 'chandler', 'jiri', 'cejudo', 'march', 'februrary', 'gordon', 'ilia', 'florian',
                  'makachov', 'beneil', 'dariush', 'jared'])
stop_words_list = text.ENGLISH_STOP_WORDS.union(my_words_list)

In [6]:
#Best Model from Previous Investigation
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                   stratify = y)

lvl1_est_1 = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier()),
    ('logr', LogisticRegression(max_iter = 1000))
]

stacked_2 = StackingClassifier(estimators=lvl1_est_1, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
pipe_cvec_2 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('s2', stacked_2)
])

print(cross_val_score(pipe_cvec_2, X_train, y_train).mean())
pipe_cvec_2.fit(X_train, y_train)
print(f'Training Score: {pipe_cvec_2.score(X_train, y_train)}')
print(f'Testing Score: {pipe_cvec_2.score(X_test, y_test)}')

0.7282135181707293
Training Score: 0.943891762833267
Testing Score: 0.7402545743834527


---

In [29]:
# Got a LOT of help from: https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/

wnl = WordNetLemmatizer()

def lem_word_wnl(string):
    '''
    This function splits a string on spaces, uses .lemmatize from wnl on each word
    of the string, and then rejoins the string. For future vectorizing and modeling.
    '''
    string = string.split()
    return ' '.join([wnl.lemmatize(i) for i in string])

In [30]:
X1 = X.map(lambda i: lem_word_wnl(i))

In [9]:
X1.head()

0                     UFC Fight Pass Streaming Quality
1                    Will UFC 284 sell 1,000,000 PPVs?
2    The ONE Championship team and CEO Chatri Sityo...
3                                       Yoel look tiny
4       Who is your favorite prospect going into 2023?
Name: title, dtype: object

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=42,
                                                   stratify = y)

lvl1_est_1 = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier()),
    ('logr', LogisticRegression(max_iter = 1000))
]

stacked_2 = StackingClassifier(estimators=lvl1_est_1, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
pipe_cvec_2 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('s2', stacked_2)
])

print(cross_val_score(pipe_cvec_2, X_train, y_train).mean())
pipe_cvec_2.fit(X_train, y_train)
print(f'Training Score: {pipe_cvec_2.score(X_train, y_train)}')
print(f'Testing Score: {pipe_cvec_2.score(X_test, y_test)}')

0.7298052941269654
Training Score: 0.9478710704337445
Testing Score: 0.7390612569610183


####
### After Running the same, Best Model:

##### It seems the lemmatized titles do not do any better (in fact, ever so slightly worse -- but easily could be b/c of randomness) than the non-lemmatized titles

---
####

### Parts of Speech Investigation:

In [10]:
def return_pos_tag(string):
    '''
    This function uses the .pos_tag to add up the different parts-of-speech totals
    in each string. Then returns those counts so they can be made into different
    columns of a new dataframe.
    '''
    string = string.split()
    adj = 0
    verb = 0
    noun = 0
    adv = 0
    for i in nltk.pos_tag(string):
        if i[1][0] == 'J':
            adj += 1
        elif i[1][0] == 'V':
            verb += 1
        elif i[1][0] == 'N':
            noun += 1
        elif i[1][0] == 'R':
            adv += 1
        else:
            pass
    return [adj, verb, noun, adv]

In [11]:
wnl_df = pd.DataFrame(data = y, index = X.index)
wnl_df['adj_num'] = X.map(lambda j: return_pos_tag(j)[0])
wnl_df['verb_num'] = X.map(lambda j: return_pos_tag(j)[1])
wnl_df['noun_num'] = X.map(lambda j: return_pos_tag(j)[2])
wnl_df['adv_num'] = X.map(lambda j: return_pos_tag(j)[3])

In [12]:
wnl_df.head()

Unnamed: 0,subreddit,adj_num,verb_num,noun_num,adv_num
0,1,0,0,5,0
1,1,0,1,2,0
2,0,0,0,12,0
3,1,1,1,1,0
4,1,1,2,1,0


##
---
#### Modeling on JUST the Parts of Speech Totals

##### Interested to see if does better than Null Model
---

In [13]:
X = wnl_df.drop(columns = 'subreddit')
y = wnl_df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
lvl1_est_1 = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier()),
    ('ada', AdaBoostClassifier())
]

stacked_1 = StackingClassifier(estimators=lvl1_est_1, final_estimator=LogisticRegression())


print(cross_val_score(stacked_1, X_train, y_train).mean())
stacked_1.fit(X_train, y_train)
print(f'Training Score: {stacked_1.score(X_train, y_train)}')
print(f'Testing Score: {stacked_1.score(X_test, y_test)}')

0.6381483228576107
Training Score: 0.6906751558562143
Testing Score: 0.6471758154335719


###
---
#### These scores reflect only using the sum of the parts of speech for each title for classification. 

##### The model is approximately 14% more accurate with just these parts of speech than the baseline accuracy. Therefore, these features will be included in future modeling.  

---
######

In [14]:
#creating a new dataframe to include the POS counts and adding crucial columns
    #of the original df
new_df= pd.DataFrame(data = X1, index = X1.index)

In [15]:
new_df['adj_num'] = wnl_df['adj_num']
new_df['verb_num'] = wnl_df['verb_num']
new_df['noun_num'] = wnl_df['noun_num']
new_df['adv_num'] = wnl_df['adv_num']
new_df['subreddit'] = df['subreddit']
new_df['selftext'] = df['selftext']
new_df['word_count'] = df['word_count']

In [16]:
new_df.head()

Unnamed: 0,title,adj_num,verb_num,noun_num,adv_num,subreddit,selftext,word_count
0,UFC Fight Pass Streaming Quality,0,0,5,0,1,0,5
1,"Will UFC 284 sell 1,000,000 PPVs?",0,1,2,0,1,0,6
2,The ONE Championship team and CEO Chatri Sityo...,0,0,12,0,0,0,18
3,Yoel look tiny,1,1,1,0,1,0,3
4,Who is your favorite prospect going into 2023?,1,2,1,0,1,0,8


#####
---
#####
#### Applying same stacked model as was found as the best from previous notebook to see if POS helps the overall model

>I have to include a column transformer so CountVectorizer is only applied to the 'title' column  
---
#####

In [21]:
X1 = new_df.drop(columns = 'subreddit')
y1 = new_df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X1, y1, stratify = y1, 
                                                    random_state=42)

In [22]:
lvl1_est_1 = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier()),
    ('logr', LogisticRegression(max_iter = 1000))
]

stacked_2 = StackingClassifier(estimators=lvl1_est_1, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
ct = ColumnTransformer([
    ('cvec', CountVectorizer(stop_words=stop_words_list), 'title')
])

pipe_cvec_2 = Pipeline([
    ('ct', ct),
    ('s2', stacked_2)
])

print(cross_val_score(pipe_cvec_2, X_train, y_train).mean())
pipe_cvec_2.fit(X_train, y_train)
print(f'Training Score: {pipe_cvec_2.score(X_train, y_train)}')
print(f'Testing Score: {pipe_cvec_2.score(X_test, y_test)}')

0.7290099781919566
Training Score: 0.9482690011937923
Testing Score: 0.7386634844868735


Adding the POS to the model seems to have made the model slightly worse. However, this could be due to randomness inherent in the random states that weren't hardcoded. The POS will remain for further analysis later.

####
---
### Sentiment Analysis

##### Using SentimentIntensityAnalyzer for the Negative, Neutral, Positive, and Compound Sentiment Scores on the 'Title' column
---
####

In [38]:
sent = SentimentIntensityAnalyzer()

In [39]:
new_df['neg_sent_score'] = df['title'].map(lambda i: list(sent.polarity_scores(i).values())[0])
new_df['neutral_sent_score'] = df['title'].map(lambda i: list(sent.polarity_scores(i).values())[1])
new_df['pos_sent_score'] = df['title'].map(lambda i: list(sent.polarity_scores(i).values())[2])
new_df['cmpd_sent_score'] = df['title'].map(lambda i: list(sent.polarity_scores(i).values())[3])

In [40]:
new_df.head()

Unnamed: 0,title,adj_num,verb_num,noun_num,adv_num,subreddit,selftext,word_count,neg_sent_score,neutral_sent_score,pos_sent_score,cmpd_sent_score
0,UFC Fight Pass Streaming Quality,0,0,5,0,1,0,5,0.394,0.606,0.0,-0.3818
1,"Will UFC 284 sell 1,000,000 PPVs?",0,1,2,0,1,0,6,0.0,1.0,0.0,0.0
2,The ONE Championship team and CEO Chatri Sityo...,0,0,12,0,0,0,18,0.0,0.854,0.146,0.4404
3,Yoel look tiny,1,1,1,0,1,0,3,0.0,1.0,0.0,0.0
4,Who is your favorite prospect going into 2023?,1,2,1,0,1,0,8,0.0,0.536,0.464,0.6369


In [41]:
new_df.shape

(10053, 12)

###
Using the same model as before but now with the Sentiment Analysis columns added to the dataframe. 

###

In [42]:
X1 = new_df.drop(columns = 'subreddit')
y1 = new_df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X1, y1, stratify = y1, 
                                                    random_state=42)

lvl1_est_1 = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier()),
    ('logr', LogisticRegression(max_iter = 1000))
]

stacked_2 = StackingClassifier(estimators=lvl1_est_1, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
ct = ColumnTransformer([
    ('cvec', CountVectorizer(stop_words=stop_words_list), 'title')
])

pipe_cvec_2 = Pipeline([
    ('ct', ct),
    ('s2', stacked_2)
])

print(cross_val_score(pipe_cvec_2, X_train, y_train).mean())
pipe_cvec_2.fit(X_train, y_train)
print(f'Training Score: {pipe_cvec_2.score(X_train, y_train)}')
print(f'Testing Score: {pipe_cvec_2.score(X_test, y_test)}')

0.7304686001137046
Training Score: 0.9494627934739356
Testing Score: 0.7402545743834527


####
Again, adding the Sentiment Analysis to the model seems to have made the model slightly worse. However, this still could be due to randomness inherent in the random states that weren't hardcoded. Plus, no parameters were tuned to potentially make this model better (specifically cvec).  

####

In [43]:
# I was having trouble with git due to a size issue. So to make sure, I am splitting
    # this dataframe in half when exporting
first_half = new_df.loc[0:5000]
second_half = new_df.loc[5001:]

In [44]:
first_half.to_csv('../data/new_data_part1.csv', index = False)
second_half.to_csv('../data/new_data_part2.csv', index = False)