In [1]:
import re
import time

import numpy as np
import pandas as pd
import requests

In [8]:
!pip install progressbar2
from progressbar import progressbar



## Scraping

### Scraping Functions

In [3]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts 

def scraper_bike(url):
    headers = {'User-Agent' : 'override this bad boy!'}
    posts = []
    after = {}

    for page in progressbar(range(40)):
        params = {'after': after}
        pagepull = requests.get(url=url, params=params, headers=headers)
        page_dict = pagepull.json()
        posts.extend(page_dict['data']['children'])
        after = page_dict['data']['after']
        time.sleep(.2)
        
    return posts

In [4]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)


def posts_to_df(post_list):
    i = 0
    post_dict = {}

    for post in post_list:
        ind = post_list[i]['data']
        post_dict[ind['name']] = [ind['subreddit'], ind['title'], ind['selftext']]
        i += 1

    df_name = pd.DataFrame(post_dict)
    df_name = df_name.T
    df_name.columns = ['subreddit', 'title', 'selftext']

    return df_name

In [5]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    
    return posts_to_df(scrape_func(url))

## Run Scrape

In [6]:
# You can also put in any 2 subreddits in as the URL and get results for those

nfltest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nfl.json')
nbatest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nba.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:27 Time:  0:00:27
100% (40 of 40) |########################| Elapsed Time: 0:00:27 Time:  0:00:27


In [7]:
politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:29 Time:  0:00:29
100% (40 of 40) |########################| Elapsed Time: 0:00:27 Time:  0:00:27


In [9]:
nbatest.shape

(734, 3)

In [11]:
nfltest.head()

Unnamed: 0,subreddit,title,selftext
t3_i77jh7,nfl,Weekend Wrapup,"Welcome to today's open thread, where /r/nfl users can discuss anything they wish not related directly to the NFL.\n\nWant to talk about personal life? Cool things about your fandom? Whatever happens to be dominating today's news cycle? Do you have something to talk about that didn't warrant its..."
t3_i75e2m,nfl,2020 Offseason Review Series Day 26: The New England Patriots,"#New England Patriots \nDivision: AFC East \n2019: 12-4, Division Win\n \nBefore I dive in, I want to give a massive, massive thanks to /u/timnog - who is a national treasure and the resident .gif queen of the Patriots - and /u/arbrown83 - who provides excellent high-quality OC and manages pa..."
t3_i7611z,nfl,[Dan Patrick Show] DP was told an hour ago that the Big 10 and Pac 12 will cancel their football seasons tomorrow... The ACC and the Big 12 are on the fence.. And the SEC is trying to get teams to join them for a season.,
t3_i73kp0,nfl,"[Highlight] NFL's Greatest Moments of the 2010s: Josh Gordon becomes the first player in NFL history with over 200 receiving yards in consecutive games, with 498 total against the Steelers and Jaguars (2013 Week 13)",
t3_i72dt6,nfl,"[AdamSchefter] Chiefs and Texans open the NFL season Thursday, Sept. 10, one month from tonight.",


In [12]:
nfltest.shape

(897, 3)

### Data Cleaning / Preprocessing

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

pd.set_option('max_colwidth', 300)

In [13]:
# drop column

nfltest = nfltest.drop(columns='selftext')
nbatest = nbatest.drop(columns='selftext')

In [14]:
# merge subreddit data

train = pd.concat([nfltest, nbatest])

In [15]:
train

Unnamed: 0,subreddit,title
t3_i77jh7,nfl,Weekend Wrapup
t3_i75e2m,nfl,2020 Offseason Review Series Day 26: The New England Patriots
t3_i7611z,nfl,[Dan Patrick Show] DP was told an hour ago that the Big 10 and Pac 12 will cancel their football seasons tomorrow... The ACC and the Big 12 are on the fence.. And the SEC is trying to get teams to join them for a season.
t3_i73kp0,nfl,"[Highlight] NFL's Greatest Moments of the 2010s: Josh Gordon becomes the first player in NFL history with over 200 receiving yards in consecutive games, with 498 total against the Steelers and Jaguars (2013 Week 13)"
t3_i72dt6,nfl,"[AdamSchefter] Chiefs and Texans open the NFL season Thursday, Sept. 10, one month from tonight."
...,...,...
t3_i5b1g8,nba,8th seed West probabilities from Basketball Reference's simulations
t3_i5h3l7,nba,Coolest NBA related artwork/portraits for an office wall?
t3_i5fed7,nba,"For Next Season, If You Could Add, Eliminate or Modify One NBA Rule/Regulation To Help Improve The League, What Would That Be?"
t3_i79f8j,nba,James Harden is better than Steph Curry.


##### Tokenize (grab only word characters)

In [16]:
word_tokenizer = RegexpTokenizer(r'\w+')

In [None]:
print(r'Hello\nWorld')
print('Hello\nWorld')

Word tokenize

In [22]:
train = pd.concat([nfltest, nbatest])


In [None]:
train['title'] = train['title'].map(lambda x: word_tokenizer.tokenize(x.lower()))

In [18]:
train['title'][0:5]

t3_i77jh7                                                                                                                                                                                                                                                      [weekend, wrapup]
t3_i75e2m                                                                                                                                                                                                [2020, offseason, review, series, day, 26, the, new, england, patriots]
t3_i7611z    [dan, patrick, show, dp, was, told, an, hour, ago, that, the, big, 10, and, pac, 12, will, cancel, their, football, seasons, tomorrow, the, acc, and, the, big, 12, are, on, the, fence, and, the, sec, is, trying, to, get, teams, to, join, them, for, a, season]
t3_i73kp0                 [highlight, nfl, s, greatest, moments, of, the, 2010s, josh, gordon, becomes, the, first, player, in, nfl, history, with, over, 200, receiving, yards, in, 

With TweetTokenizer

In [19]:
tknzr = TweetTokenizer()

In [23]:
train['title'] = train['title'].map(lambda x: tknzr.tokenize(x.lower()))

In [24]:
train['title']

t3_i77jh7                                                                                                                                                                                                                                                                        [weekend, wrapup]
t3_i75e2m                                                                                                                                                                                                               [2020, offseason, review, series, day, 26, :, the, new, england, patriots]
t3_i7611z    [[, dan, patrick, show, ], dp, was, told, an, hour, ago, that, the, big, 10, and, pac, 12, will, cancel, their, football, seasons, tomorrow, ..., the, acc, and, the, big, 12, are, on, the, fence, .., and, the, sec, is, trying, to, get, teams, to, join, them, for, a, season, .]
t3_i73kp0                  [[, highlight, ], nfl's, greatest, moments, of, the, 2010s, :, josh, gordon, becomes, the, first, pl

In [25]:
# rejoin list of tokenized words into single string for each row

train['title'] = train['title'].map(lambda x: ' '.join(x))

In [26]:
train['title'][0:5]

t3_i77jh7                                                                                                                                                                                                                       weekend wrapup
t3_i75e2m                                                                                                                                                                       2020 offseason review series day 26 : the new england patriots
t3_i7611z    [ dan patrick show ] dp was told an hour ago that the big 10 and pac 12 will cancel their football seasons tomorrow ... the acc and the big 12 are on the fence .. and the sec is trying to get teams to join them for a season .
t3_i73kp0        [ highlight ] nfl's greatest moments of the 2010s : josh gordon becomes the first player in nfl history with over 200 receiving yards in consecutive games , with 498 total against the steelers and jaguars ( 2013 week 13 )
t3_i72dt6                                   

### Train test split and converting series to list of strings then to array

In [27]:
X = train[['title']]
y = train['subreddit']

In [31]:
X

Unnamed: 0,title
t3_i77jh7,weekend wrapup
t3_i75e2m,2020 offseason review series day 26 : the new england patriots
t3_i7611z,[ dan patrick show ] dp was told an hour ago that the big 10 and pac 12 will cancel their football seasons tomorrow ... the acc and the big 12 are on the fence .. and the sec is trying to get teams to join them for a season .
t3_i73kp0,"[ highlight ] nfl's greatest moments of the 2010s : josh gordon becomes the first player in nfl history with over 200 receiving yards in consecutive games , with 498 total against the steelers and jaguars ( 2013 week 13 )"
t3_i72dt6,"[ adamschefter ] chiefs and texans open the nfl season thursday , sept . 10 , one month from tonight ."
...,...
t3_i5b1g8,8th seed west probabilities from basketball reference's simulations
t3_i5h3l7,coolest nba related artwork / portraits for an office wall ?
t3_i5fed7,"for next season , if you could add , eliminate or modify one nba rule / regulation to help improve the league , what would that be ?"
t3_i79f8j,james harden is better than steph curry .


In [33]:
y.value_counts(normalize=True)

nfl    0.549969
nba    0.450031
Name: subreddit, dtype: float64

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=42,
                                                    stratify=y)

In [35]:
# baseline is

y.value_counts(normalize=True)

nfl    0.549969
nba    0.450031
Name: subreddit, dtype: float64

In [36]:
y_train.value_counts(normalize=True)

nfl    0.550286
nba    0.449714
Name: subreddit, dtype: float64

In [38]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)

In [39]:
len(clean_train_data)

1223

In [40]:
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [41]:
len(clean_test_data)

408

In [42]:
clean_train_data

['[ yates ] the 49ers have signed former bills ol spencer long , per his agent @jlsports3 . some interior depth and versatility .',
 'gary trent jr . finishes with 27 points on 10/15 shooting , 7/10 from downtown',
 '[ rapoport ] #bucs coach bruce arians tells reporters that “ rojo ’ s the main guy . he ’ ll carry the load . ” arians added that lesean mccoy should contribute in the passing game .',
 'weekend wrapup',
 '[ rap ] washington rb derrius guice came into the league with serious character concerns from his time at lsu . there was an incident teams were scurrying to research right before the draft , leading to his fall out of the first round . after these charges , will guice play football again ?',
 "joey bosa's deal is a reminder that danielle hunter is a massive bargain",
 '[ highlight ] luka somehow finds delon wright on the cut for the easy basket',
 "[ keim ] according to the loudoun county sheriff's dept the charges against guice stemmed from 3 separate domestic related 

### Count Vectorizer

In [55]:
# instantiate our CountVectorizer. This counts the number of appearances of all the words in our training data and
# eliminates common english stop words. 5000 max features works well for our purposes (tested various numbers). Our
# data is already preprocessed and tokenized manually earlier. ngram_range is 1,3, although all or nearly all our
# features are single words

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
#                              max_df= 0.9,
#                              min_df= 0.001,
                             ngram_range=(1, 3))

In [56]:
# fit our training data and test data lists to our count_vectorizer

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

In [57]:
train_data_features

<1223x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 16670 stored elements in Compressed Sparse Row format>

In [59]:
16670 / (1223*5000)

0.0027260834014717905

In [60]:
# convert to array

train_data_features = train_data_features.toarray()

In [61]:
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [63]:
pd.DataFrame(train_data_features, columns=vectorizer.get_feature_names())

Unnamed: 0,000,000 refusing,000 refusing covid,03,07,07 2020,08,08 2020,09,09 2020,...,zero,zhao,zhao packers,zhao packers matt,ziggy,ziggy ansah,zimmer,zion,zoom,zubac
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1218,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1219,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1220,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
train_data_features.shape

(1223, 5000)

In [65]:
# check shapes

train_data_features.shape, test_data_features.shape

((1223, 5000), (408, 5000))

In [66]:
# I wanted check that the features corpus was as expected - removed print statement for readability

vocab = vectorizer.get_feature_names()

In [67]:
vocab

['000',
 '000 refusing',
 '000 refusing covid',
 '03',
 '07',
 '07 2020',
 '08',
 '08 2020',
 '09',
 '09 2020',
 '10',
 '10 2020',
 '10 games',
 '10 million',
 '10 million lawsuit',
 '10 years',
 '100',
 '100 nfl',
 '100 nfl players',
 '100 time',
 '100 time team',
 '105',
 '108',
 '10th',
 '11',
 '11 57',
 '11 57 contract',
 '111',
 '112',
 '113',
 '119',
 '11m',
 '11m reduced',
 '11m reduced 2022',
 '12',
 '12 24',
 '12 24 shooting',
 '12 assists',
 '12 vinny',
 '12 vinny testaverde',
 '121',
 '121 92',
 '125',
 '13',
 '13 million',
 '13 times',
 '132',
 '14',
 '14 19',
 '14 rebounds',
 '14 rebounds 12',
 '14 reduced',
 '14 reduced 2021',
 '145',
 '15',
 '157',
 '157 pounds',
 '157 pounds 180',
 '15th',
 '16',
 '16 17',
 '16 17 fgs',
 '16 games',
 '17',
 '17 deficit',
 '17 deficit cowboys',
 '17 fgs',
 '17 fgs 2018',
 '17 unanswered',
 '17 unanswered 4th',
 '18',
 '180',
 '180 pounds',
 '19',
 '19 26',
 '19 26 missed',
 '19 assists',
 '19 assists 14',
 '19 deaths',
 '19 deaths inevit

## MODELING

### Logistic Regression

In [68]:
from sklearn.linear_model import LogisticRegression

In [69]:
# fit logistic regression model

lr = LogisticRegression()

In [70]:
lr.fit(train_data_features, y_train)

LogisticRegression()

In [72]:
lr.score(train_data_features, y_train)

0.9950940310711366

In [73]:
lr.score(test_data_features, y_test)

0.9411764705882353

### Feature comparison

Creates a dataframe that matches features to coefficients

In [74]:
coef_list = lr.coef_.tolist()

In [75]:
coef_list = coef_list[0]

In [76]:
coef_df = pd.DataFrame({'features': vectorizer.get_feature_names(),
                        'coefs': coef_list})

In [77]:
coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
2853,nba,-2.695871
878,bubble,-1.462409
2635,luka,-1.262418
2495,lebron,-1.187575
1276,dame,-1.119529
...,...,...
3053,opt,1.162981
3515,raiders,1.249284
3835,saints,1.392999
1753,football,1.496035


### Let's throw out these unfair words and rerun

In [91]:
stop_words = set(stopwords.words('english'))

extra_stopwords = ['nba', 'basketball', 'football', 'nfl']

stop_words.update(extra_stopwords)

In [80]:
clean_train_data[:10]

['[ yates ] the 49ers have signed former bills ol spencer long , per his agent @jlsports3 . some interior depth and versatility .',
 'gary trent jr . finishes with 27 points on 10/15 shooting , 7/10 from downtown',
 '[ rapoport ] #bucs coach bruce arians tells reporters that “ rojo ’ s the main guy . he ’ ll carry the load . ” arians added that lesean mccoy should contribute in the passing game .',
 'weekend wrapup',
 '[ rap ] washington rb derrius guice came into the league with serious character concerns from his time at lsu . there was an incident teams were scurrying to research right before the draft , leading to his fall out of the first round . after these charges , will guice play football again ?',
 "joey bosa's deal is a reminder that danielle hunter is a massive bargain",
 '[ highlight ] luka somehow finds delon wright on the cut for the easy basket',
 "[ keim ] according to the loudoun county sheriff's dept the charges against guice stemmed from 3 separate domestic related 

In [88]:
clean_test_data[0]

'[ nfl ] this @ravens rookie skit of @shannonsharpe from the first hard knocks never gets old . 🤣 #hardknockshof ( via @nflfilms ) 📺 : hard knocks : los angeles starts august 11 on @hbo'

In [89]:
type(_)

str

In [90]:
type(stopwords)

nltk.corpus.util.LazyCorpusLoader

In [92]:
vectorizer = CountVectorizer(stop_words=stop_words,
                             max_features=5000,
                             ngram_range=(1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1223, 5000), (408, 5000))

In [93]:
lr.fit(train_data_features, y_train)

LogisticRegression()

In [94]:
lr.score(train_data_features, y_train)

0.9950940310711366

In [95]:
lr.score(test_data_features, y_test)

0.9068627450980392

In [96]:
coef_list = lr.coef_.tolist()
coef_list = coef_list[0]

coef_df = pd.DataFrame({'features' : vectorizer.get_feature_names(),
                       'coefs' : coef_list})

coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
863,bubble,-1.403742
2683,luka,-1.396669
2537,lebron,-1.305326
1267,dame,-1.178183
1935,giannis,-1.039678
...,...,...
3520,qb,1.084723
4776,week,1.098247
3554,raiders,1.162668
3104,opt,1.367016


### Decision Tree

In [97]:
from sklearn.tree import DecisionTreeClassifier

In [98]:
tree = DecisionTreeClassifier()

In [99]:
tree.fit(train_data_features, y_train)

DecisionTreeClassifier()

In [100]:
tree.score(train_data_features, y_train)

1.0

In [101]:
tree.score(test_data_features, y_test)

0.7622549019607843

### Random Forest

In [102]:
from sklearn.ensemble import RandomForestClassifier

In [111]:
forest = RandomForestClassifier(n_estimators=100)

In [112]:
forest.fit(train_data_features, y_train)

RandomForestClassifier()

In [113]:
forest.score(train_data_features, y_train)

1.0

In [114]:
forest.score(test_data_features, y_test)

0.8578431372549019

###  Matrix on Logistic Regression

In [115]:
from sklearn.metrics import confusion_matrix

In [116]:
y_pred = lr.predict(test_data_features)

In [117]:
cm = confusion_matrix(y_test, y_pred)

In [118]:
cm_df = pd.DataFrame(cm,
                    columns=['predict_neg', 'predict_pos'],
                    index = ['actual_neg', 'actual_pos'])

In [119]:
cm_df

Unnamed: 0,predict_neg,predict_pos
actual_neg,168,16
actual_pos,22,202


## Checking where our model failed

In [120]:
comparison_df = pd.DataFrame({'y_actual' : y_test,
             'y_predicted' : y_pred})

In [121]:
mismatch_df = comparison_df[comparison_df['y_actual'] != comparison_df['y_predicted']]

In [122]:
mismatch2_df = pd.concat([mismatch_df, X_test], axis = 1)

In [123]:
# All incorrect predictions with titles

mismatches = mismatch2_df.dropna()

In [124]:
mismatches

Unnamed: 0,y_actual,y_predicted,title
t3_i5hs5b,nba,nfl,"[ walden ] jazz starting lineup : ed davis , georges niang , joe ingles , miye oni , emmanuel mudiay"
t3_i6oud4,nfl,nba,[ highlight ] ball sticks the landing during rainy mnf game . dolphins vs steelers
t3_i3l4fu,nfl,nba,"jack del rio keeps opinion on nfl players opting out to himself "" i have personal views that would probably not sit well with my professional occupation right now , """
t3_i4dewr,nfl,nba,"[ odegard ] kliff kingsbury admits he staged his draft night photo . "" i do think the fire - - it was 100 degrees and sunny outside - - was a bit much . """
t3_i5m8sf,nba,nfl,which nba superstar ’ s ascension shocked you the most ? unexpected superstars ?
t3_i6u9oa,nba,nfl,toronto raptors is the third team to crack 50 wins in 2020 season .
t3_i5fyoa,nfl,nba,[ highlight ] julian edelman makes ridiculous catch !
t3_i4uy3k,nba,nfl,"kemba , when asked whether he considered the knicks in free agency : "" yes . very serious . very , "" on the ringer pod . "" before boston came along the knicks were one of my top priorities because i was thinking they were going to get another player . but it didn't work out . """
t3_i6mn3e,nfl,nba,who are the most physical cornerbacks currently in the nfl ?
t3_i1h3ou,nfl,nba,bubble not off the table says nfl top doc


### Let's try TF-IDF

Term Frequency / Inverse Document Frequency

TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)

IDF(w) = log_e(Total number of documents / Number of documents with term w in it)

In [125]:
tfidf_vec = TfidfVectorizer(stop_words=stop_words,
                            max_features=5000,
                            ngram_range=(1, 3))

In [126]:
train_data_features = tfidf_vec.fit_transform(clean_train_data)

test_data_features = tfidf_vec.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1223, 5000), (408, 5000))

In [127]:
lr.fit(train_data_features, y_train)

LogisticRegression()

In [128]:
lr.score(train_data_features, y_train)

0.9811937857726901

In [129]:
lr.score(test_data_features, y_test)

0.8848039215686274

### Let's try on some other subreddits

In [130]:
train = pd.concat([politics_test, conservative_test])

In [131]:
X = train[['title']]
y = train['subreddit']

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [133]:
politics_test = politics_test.drop(columns='selftext')
conservative_test = conservative_test.drop(columns='selftext')

train = pd.concat([politics_test, conservative_test])
# tokenizer = RegexpTokenizer(r'\w+')
tokenizer = TweetTokenizer()

train['title'] = train['title'].map(lambda x: tokenizer.tokenize(x.lower()))
train['title'] = train['title'].map(lambda x: ' '.join(x))

In [134]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)
    
    
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [135]:
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
                             ngram_range=(1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

vocab = vectorizer.get_feature_names()

### Modeling

In [136]:
lr = LogisticRegression(penalty = 'l2')

In [137]:
train_data_features.shape, y_train.shape

((1193, 5000), (1193,))

In [138]:
lr.fit(train_data_features, y_train)

lr.score(train_data_features, y_train)

0.9731768650461022

In [139]:
lr.score(test_data_features, y_test)

0.7412060301507538

In [140]:
coef_list = lr.coef_.tolist()

coef_list = coef_list[0]

In [144]:
coef_df = pd.DataFrame({'features' : vectorizer.get_feature_names(),
                       'coefs' : coef_list})

coef_df = coef_df.sort_values(by = ['coefs'])
coef_df

Unnamed: 0,features,coefs
3252,police,-1.231824
3370,president trump,-1.082000
4927,woke,-0.995189
790,chicago,-0.983033
4797,want,-0.978882
...,...,...
3970,sheriff,1.062440
3106,pandemic,1.075234
2023,help,1.096944
3669,republican,1.192114


In [145]:
coef_df.iloc[2400:2500]

Unnamed: 0,features,coefs
3736,riots looting hit,-0.024914
2613,magnificent mile riots,-0.024914
2939,nolte magnificent mile,-0.024914
2938,nolte magnificent,-0.024914
1259,democrat run,-0.024914
...,...,...
3260,police involved,-0.014149
2586,looting caught video,-0.014149
4901,widespread looting caught,-0.014149
4712,video police involved,-0.014149


In [146]:
coef_df.tail(20)

Unnamed: 0,features,coefs
2411,kudlow,0.800677
3274,political,0.802059
2816,money,0.805845
1887,gop,0.808102
1604,expected,0.827791
2754,michigan,0.837977
1940,groups,0.848029
4922,wisconsin,0.848693
3373,presidential,0.851468
863,claim,0.868111


In [147]:
coef_df.head(20)

Unnamed: 0,features,coefs
3252,police,-1.231824
3370,president trump,-1.082
4927,woke,-0.995189
790,chicago,-0.983033
4797,want,-0.978882
2980,obama,-0.96957
257,amp,-0.949691
4332,term,-0.912016
274,antifa,-0.909629
4665,use,-0.897142
