In [6]:
import re
import time

import numpy as np
import pandas as pd
import requests
from progressbar import progressbar

In [None]:
# !pip3 install progressbar2

## Scraping

### Scraping Functions

In [1]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts 

headers = {'User-Agent' : 'override this bad boy!'}

def scraper_bike(url):
    posts = []
    after = {}

    for page in progressbar(range(40)):
        params = {'after' : after}
        url = url
        pagepull = requests.get(url = url, params = params, headers = headers)
        page_dict = pagepull.json()
        posts.extend(page_dict['data']['children'])
        after = page_dict['data']['after']
        time.sleep(.2)
        
    return posts

In [2]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)

def posts_to_df(post_list):
    i = 0
    post_dict = {}
    
    for post in post_list:
        ind = post_list[i]['data']
        post_dict[ind['name']] = [ind['subreddit'], ind['title'], ind['selftext']]
        i += 1

    df_name = pd.DataFrame(post_dict)
    df_name = df_name.T
    df_name.columns = ['subreddit', 'title', 'selftext'] #'selftext'
    
    return df_name

In [3]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    
    return posts_to_df(scrape_func(url))

In [4]:
#### If you want to scrape repeatedly over time and add to a csv
# scrape, import csv, concat, drop duplicate, and output to csv
# takes in scraper function, url, csv filename to import, csv filename to output
# Outputs - Concatenated DataFrame as csv

def scrape_add(scrape_func, url, import_file, export_file):
    scrape_df = posts_to_df(scrape_func(url))
    imported_df = pd.read_csv(import_file, index_col = 'Unnamed: 0')
    concat_df = pd.concat([imported_df, scrape_df])
    concat_df = concat_df[~concat_df.index.duplicated(keep='first')]
    concat_df.to_csv(export_file)

### Run Scrape

In [7]:
# Run this and comment out pd.read_csv lines in data cleaning / preprocessing to use freshly scraped data
# You can also put in any 2 subreddits in as the URL and get results for those

nfltest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nfl.json')
nbatest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nba.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:20 Time:  0:00:20
100% (40 of 40) |########################| Elapsed Time: 0:00:20 Time:  0:00:20


In [10]:
politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:22 Time:  0:00:22
100% (40 of 40) |########################| Elapsed Time: 0:00:20 Time:  0:00:20


In [8]:
nbatest.shape

(733, 3)

In [9]:
nfltest.shape

(926, 3)

In [11]:
nfltest.head()

Unnamed: 0,subreddit,title,selftext
t3_dbwb1y,nfl,"I'm Kimberley Martin, senior NFL writer and Co...","Hi r/NFL, I'm Kimberley Martin, senior NFL wri..."
t3_dbxfyg,nfl,Official Week 4 /r/NFL Power Rankings,"Good afternoon, r/nfl! We're through the first..."
t3_dcajbu,nfl,Tom Brady has been wearing the same shoulder p...,
t3_dc44f6,nfl,[Jaguars] The Jaguars are giving out a bandana...,
t3_dca6d5,nfl,Percy Harvin Says He Was High Every Game He Pl...,


##### These scrape_add functions add to already built csvs

In [None]:
# scrape_add(scraper_bike, 'https://www.reddit.com/r/CollegeBasketball/new.json', 'NCAA_Posts_Update2.csv', 'NCAA_Posts_Update3.csv')
# scrape_add(scraper_bike, 'https://www.reddit.com/r/AskScience/new.json', 'AskSci_Posts_Update2.csv', 'AskSci_Posts_Update3.csv')
# scrape_add(scraper_bike, 'https://www.reddit.com/r/nba/new.json', 'NBA_Posts_Update2.csv', 'NBA_Posts_Update3.csv')
# scrape_add(scraper_bike, 'https://www.reddit.com/r/nfl/new.json', 'NFL_Posts_Update2.csv', 'NFL_Posts_Update3.csv')

### Data Cleaning / Preprocessing

In [12]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

pd.set_option('max_colwidth', 300)

In [13]:
# drop column

nfltest = nfltest.drop(columns = 'selftext')
nbatest = nbatest.drop(columns = 'selftext')

In [22]:
# merge subreddit data

train = pd.concat([nfltest, nbatest])

In [23]:
train.head()

Unnamed: 0,subreddit,title
t3_dbwb1y,nfl,"I'm Kimberley Martin, senior NFL writer and Cover 3 analyst for Yahoo! Sports. AMA!"
t3_dbxfyg,nfl,Official Week 4 /r/NFL Power Rankings
t3_dcajbu,nfl,Tom Brady has been wearing the same shoulder pads since his freshman year at Michigan in 1995. They're older than 5 of his current team-mates.
t3_dc44f6,nfl,[Jaguars] The Jaguars are giving out a bandana and a mustache to any fan who purchases tickets to the team's 2 home games this month
t3_dca6d5,nfl,Percy Harvin Says He Was High Every Game He Played


##### Tokenize (grab only word characters)

In [24]:
word_tokenizer = RegexpTokenizer(r'\w+')

In [25]:
train['title'] = train['title'].map(lambda x: word_tokenizer.tokenize(x.lower()))

In [26]:
# rejoin list of tokenized words into single string for each row

train['title'] = train['title'].map(lambda x: ' '.join(x))

In [27]:
train['title'][0:5]

t3_dbwb1y                                                                 i m kimberley martin senior nfl writer and cover 3 analyst for yahoo sports ama
t3_dbxfyg                                                                                                            official week 4 r nfl power rankings
t3_dcajbu    tom brady has been wearing the same shoulder pads since his freshman year at michigan in 1995 they re older than 5 of his current team mates
t3_dc44f6              jaguars the jaguars are giving out a bandana and a mustache to any fan who purchases tickets to the team s 2 home games this month
t3_dca6d5                                                                                              percy harvin says he was high every game he played
Name: title, dtype: object

### Train test split and converting series to list of strings then to array

In [29]:
X = train[['title']]
y = train['subreddit']

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=42,
                                                    stratify=y)

In [35]:
# baseline is

y.value_counts(normalize=True)

nfl    0.558168
nba    0.441832
Name: subreddit, dtype: float64

In [36]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)

In [38]:
clean_train_data[:2]

['sponsors feel nba india may be a slam dunk',
 'los angeles clippers media day press conference live']

In [37]:
len(clean_train_data)

1244

In [39]:
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [40]:
len(clean_test_data)

415

### Count Vectorizer

In [41]:
# instantiate our CountVectorizer. This counts the number of appearances of all the words in our training data and
# eliminates common english stop words. 5000 max features works well for our purposes (tested various numbers). Our
# data is already preprocessed and tokenized manually earlier. ngram_range is 1,3, although all or nearly all our
# features are single words

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
                             ngram_range=(1, 3))

In [42]:
# fit our training data and test data lists to our count_vectorizer

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

In [43]:
# convert to array

train_data_features = train_data_features.toarray()

In [44]:
# check shapes

train_data_features.shape, test_data_features.shape

((1244, 5000), (415, 5000))

In [45]:
# I wanted check that the features corpus was as expected - removed print statement for readability

vocab = vectorizer.get_feature_names()

In [46]:
vocab[0:200]

['00',
 '000',
 '04',
 '10',
 '10 2014',
 '10 points',
 '10 reb',
 '10 yards',
 '100',
 '100 greatest',
 '100 greatest games',
 '1000',
 '101',
 '101 assists',
 '101 assists passing',
 '106',
 '11',
 '12',
 '12 games',
 '13',
 '13 weeks',
 '14',
 '14 17',
 '1400',
 '15',
 '150',
 '150 yard',
 '150 yard receiver',
 '150 yard rusher',
 '16',
 '16 carries',
 '16 games',
 '16 tds',
 '16m',
 '17',
 '17 straight',
 '17 straight games',
 '17 year',
 '179',
 '18',
 '18 65',
 '18 65 mph',
 '18 mph',
 '18 mph vs',
 '18 snapped',
 '18 snapped streaks',
 '19',
 '19 matt',
 '19 matt ryan',
 '19 season',
 '1950',
 '1969',
 '1970',
 '1970 merger',
 '1998',
 '1999',
 '1st',
 '1st 4th',
 '1st place',
 '20',
 '20 years',
 '200',
 '200 snaps',
 '2000',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2008',
 '2010',
 '2010 finals',
 '2010s',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2018 19',
 '2018 2019',
 '2019',
 '2019 20',
 '2019 nba',
 '2019 nba media',
 '2019 nfl',
 '2

## MODELING

### Logistic Regression

In [113]:
from sklearn.linear_model import LogisticRegression

In [114]:
# fit logistic regression model

lr = LogisticRegression(penalty='l2')

In [115]:
# shape check

train_data_features.shape, y_train.shape

((1244, 5000), (1244,))

In [116]:
lr.fit(train_data_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [117]:
lr.score(train_data_features, y_train)

0.9887459807073955

In [118]:
lr.score(test_data_features, y_test)

0.8843373493975903

### Feature comparison

Creates a dataframe that matches features to coefficients

In [53]:
coef_list = lr.coef_.tolist()

In [54]:
coef_list = coef_list[0]

In [55]:
coef_df = pd.DataFrame({'features': vectorizer.get_feature_names(),
                        'coefs': coef_list})

In [57]:
coef_df.sort_values(by = ['coefs']).head()

Unnamed: 0,features,coefs
1975,nba,-2.585113
1385,kawhi,-1.399729
1444,lakers,-1.279566
340,basketball,-1.212108
929,finals,-1.144695


### Let's throw out these unfair words and rerun

In [58]:
stopwords = set(stopwords.words('english'))

extra_stopwords = ['nba', 'basketball', 'football', 'nfl']

stopwords.update(extra_stopwords)

In [59]:
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=stopwords,
                             max_features=5000,
                             ngram_range=(1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1244, 5000), (415, 5000))

In [60]:
lr.fit(train_data_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [61]:
lr.score(train_data_features, y_train)

0.9887459807073955

In [62]:
lr.score(test_data_features, y_test)

0.8843373493975903

In [65]:
coef_list = lr.coef_.tolist()
coef_list = coef_list[0]

coef_df = pd.DataFrame({'features': vectorizer.get_feature_names(),
                        'coefs': coef_list})

coef_df.sort_values(by=['coefs']).head()

Unnamed: 0,features,coefs
1449,kawhi,-1.354888
1509,lakers,-1.288586
961,finals,-1.273403
1559,lebron,-1.138784
4843,warriors,-1.064839


### Decision Tree

In [66]:
from sklearn.tree import DecisionTreeClassifier

In [67]:
tree = DecisionTreeClassifier()

In [68]:
tree.fit(train_data_features, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [69]:
tree.score(train_data_features, y_train)

1.0

In [70]:
tree.score(test_data_features, y_test)

0.7807228915662651

### Random Forest

In [71]:
from sklearn.ensemble import RandomForestClassifier

In [81]:
forest = RandomForestClassifier(n_estimators=100)

In [82]:
forest.fit(train_data_features, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [83]:
forest.score(train_data_features, y_train)

1.0

In [84]:
forest.score(test_data_features, y_test)

0.8843373493975903

###  Matrix on Logistic Regression

In [119]:
from sklearn.metrics import confusion_matrix

In [120]:
y_pred = lr.predict(test_data_features)

In [121]:
cm = confusion_matrix(y_test, y_pred)

In [122]:
cm_df = pd.DataFrame(cm,
                    columns=['predict_neg', 'predict_pos'],
                    index = ['actual_neg', 'actual_pos'])

In [123]:
cm_df

Unnamed: 0,predict_neg,predict_pos
actual_neg,161,22
actual_pos,26,206


## Checking where our model failed

In [124]:
comparison_df = pd.DataFrame({'y_actual' : y_test,
             'y_predicted' : y_pred})

In [125]:
mismatch_df = comparison_df[comparison_df['y_actual'] != comparison_df['y_predicted']]

In [126]:
mismatch2_df = pd.concat([mismatch_df, X_test], axis = 1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [127]:
# All incorrect predictions with titles

mismatches = mismatch2_df.dropna()

In [128]:
mismatches

Unnamed: 0,y_actual,y_predicted,title
t3_d9o0q3,nba,nfl,did you know in 1954 the lakers and hawks played in an experimental regular season games where free throws weren t taken during the 1st and 3rd quarters but accumulated and taken at the end of the quarters
t3_d9yxo4,nba,nfl,regular season all time scoring leader board offensive goats 1946 2019
t3_da2ict,nba,nfl,kyrie explains in depth what went wrong in boston says he failed them as a leader
t3_da2vkd,nba,nfl,feigen asked why he sought to get d antoni an extension but does not consider it necessary now fertitta said mike has representation i have my representation in daryl morey they could not come to terms
t3_da3j4w,nba,nfl,highlight pj tucker drops doncic with a between the legs crossover
t3_dakzg1,nba,nfl,who hit the most game winners at the buzzer regular season and playoffs combined
t3_daotbc,nba,nfl,alex caruso on catching people off guard with athletic plays it s sneaky people say it s sneaky athleticism for the white guy it s all about sneaking up on them which makes it even more special
t3_davzae,nfl,nba,kempski nfl should discipline andrew sendejo for reckless friendly fire shot on avonte maddox
t3_dazc5e,nba,nfl,compared the nfl regular season the nba reg season feels more like one long pre season what can be done to change that
t3_db3x9a,nba,nfl,no stupid questions can a player receive a tech after the game ends


### Let's try TF-IDF

Term Frequency / Inverse Document Frequency

TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)

IDF(w) = log_e(Total number of documents / Number of documents with term w in it)

In [141]:
tfidf_vec = TfidfVectorizer(analyzer="word",
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=list(s_words).extend(['nba', 'nfl', 'basketball', 'football']),
                            max_features=5000,
                            ngram_range=(1, 3))

In [142]:
train_data_features = tfidf_vec.fit_transform(clean_train_data)

test_data_features = tfidf_vec.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1244, 5000), (415, 5000))

In [144]:
train_data_features[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [145]:
lr.fit(train_data_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [146]:
lr.score(train_data_features, y_train)

0.9887459807073955

In [147]:
lr.score(test_data_features, y_test)

0.8867469879518072

### Let's try on some other subreddits

In [148]:
train = pd.concat([politics_test, conservative_test])

In [149]:
X = train[['title']]
y = train['subreddit']

In [150]:
# politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
# conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

In [151]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [152]:
politics_test = politics_test.drop(columns = 'selftext')
conservative_test = conservative_test.drop(columns = 'selftext')

train = pd.concat([politics_test, conservative_test])
tokenizer = RegexpTokenizer(r'\w+')

train['title'] = train['title'].map(lambda x: tokenizer.tokenize(x.lower()))
train['title'] = train['title'].map(lambda x: ' '.join(x))

In [153]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)
    
    
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [154]:
from nltk.corpus import stopwords
s_words = set(stopwords.words('english') + stopwords.words('spanish'))

In [155]:
s_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'al',
 'algo',
 'algunas',
 'algunos',
 'all',
 'am',
 'an',
 'and',
 'ante',
 'antes',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'como',
 'con',
 'contra',
 'couldn',
 "couldn't",
 'cual',
 'cuando',
 'd',
 'de',
 'del',
 'desde',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'donde',
 'down',
 'durante',
 'during',
 'e',
 'each',
 'el',
 'ella',
 'ellas',
 'ellos',
 'en',
 'entre',
 'era',
 'erais',
 'eran',
 'eras',
 'eres',
 'es',
 'esa',
 'esas',
 'ese',
 'eso',
 'esos',
 'esta',
 'estaba',
 'estabais',
 'estaban',
 'estabas',
 'estad',
 'estada',
 'estadas',
 'estado',
 'estados',
 'estamos',
 'estando',
 'estar',
 'estaremos',
 'estará',
 'estarán',
 'estarás',
 'estaré',
 'estaréis',
 'estaría',
 'estaríais',
 'estaríamos',
 'estarían',
 'estarías',
 'es

In [156]:
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = s_words,
                             max_features = 5000,
                             ngram_range = (1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

vocab = vectorizer.get_feature_names()

### Modeling

In [157]:
lr = LogisticRegression(penalty='l2')

In [158]:
train_data_features.shape, y_train.shape

((1387, 5000), (1387,))

In [159]:
lr.fit(train_data_features, y_train)

lr.score(train_data_features, y_train)



0.9798125450612833

In [160]:
lr.score(test_data_features, y_test)

0.7883369330453563

In [161]:
coef_list = lr.coef_.tolist()

coef_list = coef_list[0]

In [162]:
coef_df = pd.DataFrame({'features': vectorizer.get_feature_names(),
                        'coefs': coef_list})

coef_df.sort_values(by=['coefs'])

Unnamed: 0,features,coefs
304,biden,-1.224174
750,dems,-1.110635
1217,hillary,-1.032156
4261,schiff,-0.979207
2524,pelosi,-0.957020
743,democrats subpoena,-0.880528
1284,illegal,-0.857690
596,complaint,-0.857276
410,california,-0.854052
704,deduction,-0.849851


In [179]:
lr.classes_

array(['Conservative', 'politics'], dtype=object)

In [190]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1000)
pca.fit(train_data_features)
print(sum(pca.explained_variance_ratio_))
# print(pca.singular_values_)  


0.9691784473039958


In [191]:
x_pred = pca.transform(train_data_features)

In [195]:
x_pred_test = pca.transform(test_data_features.toarray())

In [192]:
lr.fit(x_pred, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [197]:
lr.score(x_pred, y_train)

0.9509733237202596

In [198]:
lr.score(x_pred_test, y_test)

0.7883369330453563