In [None]:
import re
import time

import numpy as np
import pandas as pd
import requests

In [None]:
!pip3 install progressbar2
from progressbar import progressbar

## Scraping

### Scraping Functions

In [None]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts 

def scraper_bike(url, pages=40, pause=.2):
    """Page through a Reddit listing and collect the raw post dicts.

    Parameters
    ----------
    url : str
        A Reddit listing URL ending in `.json`.
    pages : int, optional
        Maximum number of listing pages to request (default 40).
    pause : float, optional
        Seconds to wait between consecutive requests (default 0.2).

    Returns
    -------
    list of dict
        The items of every page's `data.children`, in the order they were fetched.

    Raises
    ------
    requests.HTTPError
        If any page comes back with a 4xx/5xx status (e.g. when rate limited).
    """
    headers = {'User-Agent' : 'override this bad boy!'}
    posts = []
    after = None

    for _ in progressbar(range(pages)):
        # The first request has no cursor yet, so send no `after` parameter at all.
        params = {'after': after} if after else {}
        pagepull = requests.get(url=url, params=params, headers=headers)
        # Surface HTTP errors here instead of failing later inside .json() or on a missing key.
        pagepull.raise_for_status()

        listing = pagepull.json()['data']
        posts.extend(listing['children'])
        after = listing['after']

        if after is None:
            # A null cursor means there is no next page. Requesting again without a cursor
            # would restart from the first page and only collect duplicates.
            break

        time.sleep(pause)

    return posts

In [None]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)


def posts_to_df(post_list):
    """Convert raw Reddit post dicts into a DataFrame indexed by post name.

    Parameters
    ----------
    post_list : list of dict
        Each item must carry a 'data' dict with the keys 'name', 'subreddit',
        'title' and 'selftext'.

    Returns
    -------
    pandas.DataFrame
        One row per unique 'name' (used as the index) with the columns
        ['subreddit', 'title', 'selftext']. An empty `post_list` yields an empty
        frame that still has those three columns.
    """
    columns = ['subreddit', 'title', 'selftext']

    # Keyed by the post's unique 'name': a post that was scraped more than once
    # collapses into a single row, and the last occurrence wins.
    rows_by_name = {}
    for post in post_list:
        data = post['data']
        rows_by_name[data['name']] = [data[col] for col in columns]

    # Passing index and columns explicitly (instead of transposing and renaming)
    # keeps the column labels in place even when there are no rows.
    return pd.DataFrame(list(rows_by_name.values()),
                        index=list(rows_by_name.keys()),
                        columns=columns)

In [None]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    """Scrape `url` with `scrape_func` and return the scraped posts as a DataFrame."""
    scraped_posts = scrape_func(url)
    return posts_to_df(scraped_posts)

## Run Scrape

In [None]:
# You can also pass the .json URLs of any two other subreddits here and get results for those

nfltest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nfl.json')
nbatest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nba.json')

In [None]:
politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

In [None]:
nbatest.shape

In [None]:
nfltest.head()

In [None]:
nfltest.shape

### Data Cleaning / Preprocessing

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

pd.set_option('max_colwidth', 300)

In [None]:
# drop column

# errors='ignore' keeps this cell re-runnable: on a second run the 'selftext'
# column is already gone and the drop is simply a no-op instead of a KeyError
nfltest = nfltest.drop(columns='selftext', errors='ignore')
nbatest = nbatest.drop(columns='selftext', errors='ignore')

In [None]:
# merge subreddit data

train = pd.concat([nfltest, nbatest])

In [None]:
train

##### Tokenize (grab only word characters)

In [None]:
word_tokenizer = RegexpTokenizer(r'\w+')

In [None]:
print(r'Hello\nWorld')
print('Hello\nWorld')

Word tokenize

In [None]:
train = pd.concat([nfltest, nbatest])
train['title'] = train['title'].map(lambda x: word_tokenizer.tokenize(x.lower()))

In [None]:
train['title'][0:5]

With TweetTokenizer

In [None]:
tknzr = TweetTokenizer()

In [None]:
# Start again from the raw titles: the RegexpTokenizer cell above already replaced
# each title with a list of tokens, and a list has no .lower(), so tokenizing
# train['title'] a second time would raise AttributeError.
train = pd.concat([nfltest, nbatest])
train['title'] = train['title'].map(lambda x: tknzr.tokenize(x.lower()))

In [None]:
train['title']

In [None]:
# rejoin list of tokenized words into single string for each row

train['title'] = train['title'].map(lambda x: ' '.join(x))

In [None]:
train['title'][0:5]

### Train test split and converting series to list of strings then to array

In [None]:
X = train[['title']]
y = train['subreddit']

In [None]:
X

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=42,
                                                    stratify=y)

In [None]:
# baseline accuracy is the share of the larger class (what always guessing that subreddit would score)

y.value_counts(normalize=True)

In [None]:
# create our training data list - this is a list of strings, with each string being a post title

# every title in the training split, in split order, as a plain Python list
clean_train_data = list(X_train['title'])

In [None]:
len(clean_train_data)

In [None]:
# create test data list

# every title in the test split, in split order, as a plain Python list
clean_test_data = list(X_test['title'])

In [None]:
len(clean_test_data)

In [None]:
clean_train_data

### Count Vectorizer

In [None]:
# instantiate our CountVectorizer. This counts the number of appearances of all the words in our training data and
# eliminates common english stop words. 5000 max features works well for our purposes (tested various numbers). Our
# data is already preprocessed and tokenized manually earlier. ngram_range is 1,3, although all or nearly all our
# features are single words

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
                             max_df= 0.7,
                             min_df= 0.001,
                             ngram_range=(1, 3))

In [None]:
# fit the vectorizer on the training titles only, then transform both the training and test lists with it

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

In [None]:
train_data_features

In [None]:
# convert to array

train_data_features = train_data_features.toarray()

In [None]:
train_data_features

In [None]:
train_data_features.shape

In [None]:
# check shapes

train_data_features.shape, test_data_features.shape

In [None]:
# I wanted to check that the features corpus was as expected - removed print statement for readability

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

## MODELING

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# instantiate the logistic regression model (L2 penalty) - it is fitted a few cells below

lr = LogisticRegression(penalty='l2')

In [None]:
# shape check

train_data_features.shape, y_train.shape

In [None]:
lr.fit(train_data_features, y_train)

In [None]:
train_data_features

In [None]:
lr.score(train_data_features, y_train)

In [None]:
lr.score(test_data_features, y_test)

### Feature comparison

Creates a dataframe that matches features to coefficients

In [None]:
coef_list = lr.coef_.tolist()

In [None]:
coef_list = coef_list[0]

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# one row per vocabulary term, paired with its logistic-regression coefficient
coef_df = pd.DataFrame({'features': feature_names,
                        'coefs': coef_list})

In [None]:
coef_df.sort_values(by = ['coefs'])

### Let's throw out these unfair words and rerun

In [None]:
# Import the corpus reader under an alias. Reading from `stopwords.words(...)` and
# binding the result back to `stopwords` would replace the reader with the result,
# so running this cell a second time would raise AttributeError.
# NOTE(review): needs the nltk 'stopwords' corpus to be downloaded already - confirm.
from nltk.corpus import stopwords as nltk_stopwords

# words that name the sports/subreddits themselves and so give the answer away
extra_stopwords = ['nba', 'basketball', 'football', 'nfl']

# A sorted list rather than a set: recent scikit-learn releases validate
# `stop_words` and accept only 'english', a list, or None.
stopwords = sorted(set(nltk_stopwords.words('english')).union(extra_stopwords))

In [None]:
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = stopwords,
                             max_features = 5000,
                             ngram_range = (1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

In [None]:
lr.fit(train_data_features, y_train)

In [None]:
lr.score(train_data_features, y_train)

In [None]:
lr.score(test_data_features, y_test)

In [None]:
coef_list = lr.coef_.tolist()
coef_list = coef_list[0]

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# one row per vocabulary term, paired with its logistic-regression coefficient
coef_df = pd.DataFrame({'features' : feature_names,
                       'coefs' : coef_list})

coef_df.sort_values(by = ['coefs'])

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
tree = DecisionTreeClassifier()

In [None]:
tree.fit(train_data_features, y_train)

In [None]:
tree.score(train_data_features, y_train)

In [None]:
tree.score(test_data_features, y_test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
forest = RandomForestClassifier(n_estimators = 100)

In [None]:
forest.fit(train_data_features, y_train)

In [None]:
forest.score(train_data_features, y_train)

In [None]:
forest.score(test_data_features, y_test)

### Confusion Matrix on Logistic Regression

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
y_pred = lr.predict(test_data_features)

In [None]:
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Label the raw confusion matrix: rows are the actual class, columns the predicted class.
# NOTE(review): confusion_matrix orders classes by sorted label, so 'neg' is presumably
# the alphabetically first subreddit and 'pos' the second - confirm against lr.classes_.
cm_df = pd.DataFrame(data=cm,
                     index=['actual_neg', 'actual_pos'],
                     columns=['predict_neg', 'predict_pos'])

In [None]:
cm_df

## Checking where our model failed

In [None]:
comparison_df = pd.DataFrame({'y_actual' : y_test,
             'y_predicted' : y_pred})

In [None]:
mismatch_df = comparison_df[comparison_df['y_actual'] != comparison_df['y_predicted']]

In [None]:
mismatch2_df = pd.concat([mismatch_df, X_test], axis = 1)

In [None]:
# All incorrect predictions with titles

mismatches = mismatch2_df.dropna()

In [None]:
mismatches

### Let's try TF-IDF

Term Frequency / Inverse Document Frequency

TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)

IDF(w) = log_e(Total number of documents / Number of documents with term w in it)

In [None]:
tfidf_vec = TfidfVectorizer(analyzer="word",
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=['nba', 'nfl', 'football', 'basketball'],
                            max_features=5000,
                            ngram_range=(1, 3))

In [None]:
train_data_features = tfidf_vec.fit_transform(clean_train_data)

test_data_features = tfidf_vec.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

In [None]:
lr.fit(train_data_features, y_train)

In [None]:
lr.score(train_data_features, y_train)

In [None]:
lr.score(test_data_features, y_test)

### Let's try on some other subreddits

In [None]:
train = pd.concat([politics_test, conservative_test])

In [None]:
X = train[['title']]
y = train['subreddit']

In [None]:
# politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
# conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [None]:
# errors='ignore' keeps the cell re-runnable once 'selftext' has already been dropped
politics_test = politics_test.drop(columns='selftext', errors='ignore')
conservative_test = conservative_test.drop(columns='selftext', errors='ignore')

train = pd.concat([politics_test, conservative_test])
tokenizer = RegexpTokenizer(r'\w+')

# lowercase, keep only runs of word characters, then rejoin into one string per title
train['title'] = train['title'].map(lambda x: ' '.join(tokenizer.tokenize(x.lower())))

# Re-derive X / y and redo the split from the cleaned frame. The split in the cell
# above was taken before this cleaning ran, so without this step X_train / X_test
# would still hold the raw, uncleaned titles. Same random_state and stratification,
# so the same rows land in each split.
X = train[['title']]
y = train['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [None]:
# create our training data list - this is a list of strings, with each string being a post title

# training titles, in split order, as a plain Python list
clean_train_data = list(X_train['title'])

# test titles, built the same way
clean_test_data = list(X_test['title'])

In [None]:
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
                             ngram_range=(1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

### Modeling

In [None]:
lr = LogisticRegression(penalty = 'l2')

In [None]:
train_data_features.shape, y_train.shape

In [None]:
lr.fit(train_data_features, y_train)

lr.score(train_data_features, y_train)

In [None]:
lr.score(test_data_features, y_test)

In [None]:
coef_list = lr.coef_.tolist()

coef_list = coef_list[0]

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# one row per vocabulary term, paired with its logistic-regression coefficient
coef_df = pd.DataFrame({'features' : feature_names,
                       'coefs' : coef_list})

coef_df.sort_values(by = ['coefs'])