## PushShift
#### Pulling Data for r/MMA and r/UFC
---


In [1]:
import requests
import calendar
import time
import pandas as pd
from bs4 import BeautifulSoup
import re

In [2]:
url = 'https://api.pushshift.io/reddit/search/submission'

In [3]:
# Preliminary pull: grab a handful of submissions from both subreddits so the
# returned fields can be inspected before choosing what to filter on in the
# larger pulls and helper functions below.
params = dict(subreddit='UFC, MMA', limit=10)
res = requests.get(url, params=params)
print(res.status_code)
payload = res.json()
df = pd.DataFrame(payload['data'])


200


In [4]:
# Checking out all the data provided to narrow down the filter param.
# The transposed sample has one row per returned field, so the row limit is
# lifted -- but only for this display, so later cells keep pandas' default
# truncation instead of inheriting a notebook-wide setting.
with pd.option_context('display.max_rows', None):
    display(df.tail().T)

Unnamed: 0,5,6,7,8,9
subreddit,ufc,MMA,MMA,MMA,ufc
selftext,,,,,
author_fullname,t2_w4cesjd,t2_k8vn780f,t2_cvm3whwy,t2_60j1e7n6,t2_6fb92ovz
gilded,0,0,0,0,0
title,Afro burns 👀,What the fuc…….?,. @RazorBlaydes265 has no hard feelings toward...,Blaydes believes he'll have an advantage over ...,Who deserves the next title shot if a winner i...
link_flair_richtext,[],[],[],[],[]
subreddit_name_prefixed,r/ufc,r/MMA,r/MMA,r/MMA,r/ufc
hidden,False,False,False,False,False
pwls,6,6,6,6,6
link_flair_css_class,,img,,img,


In [5]:
# pulling different features of interest through each call

def get_subreddit_mma_ufc_data(until):
    '''
    Pull one page of r/UFC and r/MMA submissions from Pushshift.

    The request asks for at most 1000 submissions and only for the fields
    listed in `filter`. Passing the last `created_utc` of one pull as the
    UNTIL of the next is what keeps successive pulls from overlapping.

    Parameters
    ----------
    until : int
        UTC timestamp sent as Pushshift's `until` parameter.

    Returns
    -------
    pandas.DataFrame
        Built from the 'data' list of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the response status is anything other than 200.
    '''

    params = {
        'subreddit': 'UFC, MMA',
        'limit': 1000,
        'filter': 'subreddit, id, selftext, title, media, media_embed, num_comments, created_utc',
        'until': until
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    res = requests.get(url, params=params, timeout=60)

    # Fail loudly: silently returning None here would let pd.concat drop the
    # missing page later and quietly shrink the dataset.
    if res.status_code != 200:
        raise requests.HTTPError(
            f'Pushshift request failed with status {res.status_code} (until={until})',
            response=res,
        )

    return pd.DataFrame(res.json()['data'])

In [6]:
# This first UNTIL value is the time (in UTC) at which the initial pull was made
df01 = get_subreddit_mma_ufc_data(1673205797)
df01.tail(1)

Unnamed: 0,subreddit,selftext,title,media_embed,id,num_comments,created_utc,media
999,MMA,,Phil Baroni arrested for murder of his girlfri...,{},102nfz8,0,1672789239,


In [7]:
# Each UNTIL value below is the created_utc of the last row (.tail(1)) of the
# previous pull; they were collected by running the pulls one at a time.
df02, df03, df04, df05, df06, df07, df08 = (
    get_subreddit_mma_ufc_data(until)
    for until in (
        1672789239,
        1672318448,
        1671808988,
        1671484146,
        1671116165,
        1670800057,
        1670737008,
    )
)

In [8]:
#combining into a comprehensive df

df_list = [df01, df02, df03, df04, df05, df06, df07, df08]

df_total = pd.concat(df_list, ignore_index = True)

In [9]:
#looking at the proportions of data for each subreddit
df_total['subreddit'].value_counts(normalize = True)

ufc    0.67975
MMA    0.32025
Name: subreddit, dtype: float64

In [10]:
#trying to even out the proportions. Writing a similar function to the previous,
# but only for data from r/MMA

def just_mma_data(until):
    '''
    Pull one page of r/MMA-only submissions from Pushshift.

    Used to even out the class balance: same request as the combined pull
    (at most 1000 submissions, same `filter` fields) but restricted to the
    MMA subreddit.

    Parameters
    ----------
    until : int
        UTC timestamp sent as Pushshift's `until` parameter.

    Returns
    -------
    pandas.DataFrame
        Built from the 'data' list of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the response status is anything other than 200.
    '''
    params2 = {
        'subreddit': 'MMA',
        'limit': 1000,
        'filter': 'subreddit, id, selftext, title, media, media_embed, num_comments, created_utc',
        'until': until
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    res = requests.get(url, params=params2, timeout=60)

    # Fail loudly: silently returning None here would let pd.concat drop the
    # missing page later and quietly skew the class balance.
    if res.status_code != 200:
        raise requests.HTTPError(
            f'Pushshift request failed with status {res.status_code} (until={until})',
            response=res,
        )

    return pd.DataFrame(res.json()['data'])

In [11]:
df09 = just_mma_data(1670598695)
df10 = just_mma_data(1669478929)
df11 = just_mma_data(1668574869)
df = pd.concat([df_total, df09, df10, df11], ignore_index = True)

In [12]:
# ensuring similar counts from each subreddit
df['subreddit'].value_counts(normalize = True)

MMA    0.505636
ufc    0.494364
Name: subreddit, dtype: float64

In [13]:
#checking for reposts
    #a repost reuses the same title, therefore only checking title duplicates
    
df['title'].duplicated().sum()

947

In [14]:
df.drop_duplicates(subset = 'title', inplace = True)

In [15]:
df.shape

(10053, 8)

In [16]:
df.reset_index(inplace = True, drop = True)

In [17]:
# Trying to figure out if any of the retrieved posts are ads or promotions
# by counting how many titles match each ad-related pattern.

# Raw strings so `\s` reaches the regex engine untouched; in a plain string
# literal '\s' is an invalid escape sequence that Python warns about.
words = [r'[Ss]ponsor', r'\s[Aa]d\s', r'[Pp]romoted']

# str.contains runs a regex search on every title, so summing the boolean
# result gives the number of titles with at least one match.
counters = [int(df['title'].str.contains(word).sum()) for word in words]
pd.DataFrame(data = counters, index = words)

Unnamed: 0,0
[Ss]ponsor,7
\s[Aa]d\s,7
[Pp]romoted,3


In [18]:
# Making a quick df to be able to see the indexes for my main df:
# one column per pattern, holding the df row labels whose title matches it.

# Raw strings so `\s` reaches the regex engine untouched; in a plain string
# literal '\s' is an invalid escape sequence that Python warns about.
words = [r'[Ss]ponsor', r'\s[Aa]d\s', r'[Pp]romoted']

# Scan every title, including the last row.
indexes = [df.index[df['title'].str.contains(word)].tolist() for word in words]

ad_eda = pd.DataFrame(data = indexes, index = words).T
# Columns are padded to equal length; the padding is filled with 0 and the
# result is stored as int so the values can be used directly as row labels.
# NOTE: 0 doubles as the padding marker, so a genuine match at row 0 cannot
# be told apart from padding.
ad_eda = ad_eda.fillna(0).astype(int)
ad_eda

Unnamed: 0,[Ss]ponsor,\s[Aa]d\s,[Pp]romoted
0,4283,577,436
1,4393,580,1637
2,4415,589,9253
3,5624,3721,0
4,6701,4253,0
5,7642,5673,0
6,7766,6531,0


In [19]:
# Spot-check two matches per pattern (rows 0 and 3 of ad_eda) to see whether
# the titles found by the word searches are actually ads.

for pattern in ad_eda.columns:
    matched_rows = ad_eda[pattern]
    for pos in (0, 3):
        row_label = matched_rows[pos]
        if row_label == 0:
            # zero entries are skipped
            continue
        print(df['title'][row_label] + '\n')

Any fighters here have an upcoming fight? I just won $200 in a bet with a friend but the bet had a stipulation: It has to be donated somewhere. I'll spend it to sponsor a fighter if they agree to give a shoutout to whatever local animal shelter they choose.

How will this affect betting sponsors in UFC?

Jeremy Botter on Twitter: A Warner Media source tells me Dana White’s Power Slap series is no longer listed on any internal programming schedules and all ad spots have been dropped. The death of the deal, I’m told, is basically just paperwork at this point.

Not sure if you guys saw this ad but I couldn’t stop laughing 🤣

UFC Champ Zhang Weili promoted to BJJ Brown belt



> It doesn't look like these are ads or promotions, which is excellent

In [20]:
df[df['selftext'] == ''].count()

subreddit       7408
selftext        7408
title           7408
media_embed     7408
id              7408
num_comments    7408
created_utc     7408
media           1981
dtype: int64

Approximately 74% of the observations (7,408 of 10,053) have nothing in the 'selftext' column

In [21]:
# Interesting that subreddit MMA has many more [removed] selftexts. Which could be used in
#    a model. So instead of replacing with an empty string, I will keep [removed] in.
df['selftext'][df['selftext'] == '[removed]'].groupby(df['subreddit']).count()

subreddit
MMA    928
ufc    296
Name: selftext, dtype: int64

In [22]:
# Turn selftext into a removed-post flag: 1 where the text is exactly
# '[removed]', 0 for everything else. This overwrites the text column, so
# running the cell a second time would set every value to 0.
is_removed = df['selftext'] == '[removed]'
df['selftext'] = is_removed.astype('int64')
df['selftext'].value_counts(normalize=True)

0    0.878245
1    0.121755
Name: selftext, dtype: float64

######
---
######
#### Checking for UFC-Specific words as well as the other MMA league, Bellator
######

In [23]:
# One column per UFC-related pattern, holding the df row labels whose title
# matches it.
words = ['[Dd]ana', '[Ww]hite', '[Uu][Ff][Cc]', '[Uu]ltimate', '[Bb]ellator', '[Mm]c[Gg]regor', '[Aa]desanya', '[Pp]addy']

# Scan every title, including the last row.
indexes = [df.index[df['title'].str.contains(word)].tolist() for word in words]

ufc_eda = pd.DataFrame(data = indexes, index = words).T
# Columns are padded to equal length; the padding is filled with 0.
# NOTE: 0 doubles as the padding marker, so a genuine match at row 0 cannot
# be told apart from padding.
ufc_eda = ufc_eda.fillna(0).astype(int)


In [24]:
ufc_eda[ufc_eda != 0].count()

[Dd]ana            434
[Ww]hite           232
[Uu][Ff][Cc]      1992
[Uu]ltimate         13
[Bb]ellator        148
[Mm]c[Gg]regor     188
[Aa]desanya        186
[Pp]addy           506
dtype: int64

In [25]:
# Print every 150th match per pattern, with its subreddit, as a spot check.
# The upper bound comes from the table itself, so the loop cannot index past
# the last row if a re-pull changes the number of matches.
for pattern in ufc_eda.columns:
    for pos in range(0, len(ufc_eda), 150):
        row_label = ufc_eda[pattern][pos]
        if row_label != 0:  # 0 marks padding in the shorter columns
            print(df['title'][row_label]+'|'+df['subreddit'][row_label]+'\n')

Sean O'Malley claims Dana White's wife 'DESERVED a slapping back' after domestic violence incident|MMA

Dana white slaps his wife in the face|ufc

Dana White and the barstool crew react to the co-main event decision|ufc

Sean O'Malley claims Dana White's wife 'DESERVED a slapping back' after domestic violence incident|MMA

Ilia Topuria when asking Dana White to give him Paddy Pimblett.|ufc

Jon Jones declares he will become UFC champion in 2023|MMA

How is the UFC not showing a megacard tonight?|ufc

Report: UFC’s Jeff Molina the latest suspension amid betting scandal|MMA

UFC Roster Watch: Fighter Removed: Deron Winn|MMA

Why was Mustafaev cut from the UFC? He KOed Fiziev and all the fights that fell through weren't his fault.|ufc

Cody “No Love” Garbrandt makes his return at UFC 285 on March 4th against Julio Arce|ufc

This Fight was supposed to happen at UFC 270. How would it have gone and will we still get to see it down the line?|ufc

First Suga Sean and now Paddy? Is UFC rigged?|

In [26]:
# Same pattern search as before, restricted to posts from r/MMA: one column
# per pattern, holding the df row labels of the matching r/MMA titles.
words = ['[Dd]ana', '[Ww]hite', '[Uu][Ff][Cc]', '[Uu]ltimate', '[Bb]ellator', '[Mm]c[Gg]regor', '[Aa]desanya', '[Pp]addy']

is_mma = df['subreddit'] == 'MMA'

# Scan every title, including the last row.
indexes = [
    df.index[df['title'].str.contains(word) & is_mma].tolist()
    for word in words
]

mma_eda = pd.DataFrame(data = indexes, index = words).T
# Columns are padded to equal length; the padding is filled with 0.
# NOTE: 0 doubles as the padding marker, so a genuine match at row 0 cannot
# be told apart from padding.
mma_eda = mma_eda.fillna(0).astype(int)

In [27]:
mma_eda[mma_eda != 0].count()

[Dd]ana            129
[Ww]hite           108
[Uu][Ff][Cc]      1126
[Uu]ltimate          7
[Bb]ellator        140
[Mm]c[Gg]regor     116
[Aa]desanya        169
[Pp]addy           130
dtype: int64

In [29]:
df['subreddit'].value_counts()

MMA    5057
ufc    4996
Name: subreddit, dtype: int64

In [29]:
# Share of r/MMA titles that match the 'ufc' pattern, computed from the data
# rather than from numbers copied out of earlier outputs, so it stays correct
# if the pull or the cleaning steps change.
mma_eda['[Uu][Ff][Cc]'].ne(0).sum() / df['subreddit'].eq('MMA').sum()

0.22266165710895788

22.3% of ALL r/MMA titles include 'ufc' (in any letter case)

---

In [28]:
df['subreddit'].value_counts(normalize = True)

MMA    0.503034
ufc    0.496966
Name: subreddit, dtype: float64

###
#### This represents the Null Model Accuracy, which is 50.3%
###

##### In 22.27% of all MMA subreddit posts, 'UFC' is present

In [30]:
# Making the word count of each title a separate column.
# split() with no separator splits on any run of whitespace, so repeated,
# leading or trailing spaces are not counted as extra (empty) words.
df['word_count'] = df['title'].str.split().str.len()

In [31]:
# Encode the target as a binary label: r/ufc -> 1, r/MMA -> 0
subreddit_codes = {'MMA': 0, 'ufc': 1}
df['subreddit'] = df['subreddit'].map(subreddit_codes)

In [32]:
#exporting to csv for further preprocessing and modeling

df.to_csv('../data/ufc_mma_submissions.csv', index = False)