# Reddit Classifier

<h1>Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Problem-Statement:" data-toc-modified-id="Problem-Statement:-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Problem Statement:</a></span></li><li><span><a href="#Scraping-for-data" data-toc-modified-id="Scraping-for-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Scraping for data</a></span></li><li><span><a href="#Read-the-csv-file" data-toc-modified-id="Read-the-csv-file-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Read the csv file</a></span></li><li><span><a href="#Baseline-accuracy" data-toc-modified-id="Baseline-accuracy-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Baseline accuracy</a></span><ul class="toc-item"><li><span><a href="#Our-baseline-accuracy-is-as-above-for-each-class" data-toc-modified-id="Our-baseline-accuracy-is-as-above-for-each-class-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Our baseline accuracy is as above for each class</a></span></li></ul></li><li><span><a href="#NLP-Pipeline" data-toc-modified-id="NLP-Pipeline-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>NLP Pipeline</a></span><ul class="toc-item"><li><span><a href="#Train-test-split" data-toc-modified-id="Train-test-split-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Train test split</a></span></li><li><span><a href="#Count-Vectorizer-pipeline" data-toc-modified-id="Count-Vectorizer-pipeline-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Count Vectorizer pipeline</a></span></li></ul></li></ul></div>

## Problem Statement:
Scape the Reddit website for two topics, and build a binary classifier that will classify which topic the text originate from

## Scraping for data

In [4]:
import requests
import pandas as pd
import numpy as np

In [48]:
from collections import defaultdict

def extract_content(response, topic):
    # 0th index refers to title appended with selftext
    rslt = defaultdict(list)
    for post in response.json()['data']['children']:
        rslt['content'].append(post['data']['title'] + post['data']['selftext'])
        rslt['topic'].append(topic)
        rslt['id'].append(post['data']['name'])
        
    return rslt

def scrape_topic(topic,depth=12):
    '''
    Scrape the reddit topic for all the posts
    
    '''
    base_url = 'https://www.reddit.com/r/'
    url = base_url + topic + '.json?limit=100'
    last_entry_name = ''
    payload = None
    
    
    for i in range(depth):
        if last_entry_name == None:
            break
        if last_entry_name != '':
            url = base_url + topic + '.json?limit=100&after=' + last_entry_name
    
        res = requests.get(url, headers={'User-agent': 'Pony Inc 1.0'})
        print(f"request url: {url} for {topic} at depth:{i}")
        
        last_entry_name = res.json()['data']['after']
        print(f"Last entry name : {last_entry_name} ")
        content = extract_content(res,topic)
        if not payload:
            payload = content
        else:
            for k,v in payload.items():
                payload[k] += content[k]
        if i < depth-1:
            sleep_duration = random.randint(2,30)
            time.sleep(sleep_duration)
        print(f'Scraping {topic}, {i} pages in..')
    return {'topic': topic, 'data': payload }
    

In [28]:
import concurrent.futures as cf
import random, time


def get_reddit_data(topics):
    '''
    Main function to scrape reddit data into a pandas dataframe.
    Args: topics
     - An array object of topics in reddit to scrape from
    '''
    rdf = pd.DataFrame()
    with cf.ThreadPoolExecutor(max_workers=5) as executor:
        for future in executor.map(scrape_topic, topics):
            if rdf.empty:
                rdf = pd.DataFrame(future['data'])
            else:
                new = pd.DataFrame(future['data'])
                rdf = pd.concat([rdf,new],ignore_index=True)
            
    return rdf
        
      

In [61]:
import os.path

rdf = get_reddit_data(['AskDocs','legaladvice'])
print(rdf.groupby(by='topic').count())

# filename = 'reddit_scrape_bio_chem.csv'
filename = 'reddit_scrape_docs_legal.csv'

use_header=True

if os.path.exists(filename):
    use_header = False

# Save the data into a csv file
rdf.to_csv(filename,mode='a',header=use_header, index=False)

request url: https://www.reddit.com/r/legaladvice.json?limit=100 for legaladvice at depth:0
Last entry name : t3_dkz2et 
request url: https://www.reddit.com/r/AskDocs.json?limit=100 for AskDocs at depth:0
Last entry name : t3_dkv1dd 
Scraping legaladvice, 0 pages in..
request url: https://www.reddit.com/r/legaladvice.json?limit=100&after=t3_dkz2et for legaladvice at depth:1
Last entry name : t3_dklq67 
Scraping legaladvice, 1 pages in..
request url: https://www.reddit.com/r/legaladvice.json?limit=100&after=t3_dklq67 for legaladvice at depth:2
Last entry name : t3_dkqwo4 
Scraping AskDocs, 0 pages in..
request url: https://www.reddit.com/r/AskDocs.json?limit=100&after=t3_dkv1dd for AskDocs at depth:1
Last entry name : t3_dkrf1i 
Scraping legaladvice, 2 pages in..
request url: https://www.reddit.com/r/legaladvice.json?limit=100&after=t3_dkqwo4 for legaladvice at depth:3
Last entry name : t3_dkbfqt 
Scraping AskDocs, 1 pages in..
request url: https://www.reddit.com/r/AskDocs.json?limit=10

## Read the csv file

In [84]:
reddit_df = pd.read_csv(filename)

In [85]:
reddit_df

Unnamed: 0,content,topic,id
0,Weekly Discussion/General Questions Thread - O...,AskDocs,t3_dhnk4v
1,(F18) Hit my head off a nail in the wall repea...,AskDocs,t3_dktyay
2,"[29F] Fell on ribs a month ago, xray showed no...",AskDocs,t3_dl06ul
3,What happened to me last night ?25M\nWeight 73...,AskDocs,t3_dkudly
4,"Back injury advise.26\nMale\n5"" 11\n85kg\nCauc...",AskDocs,t3_dl0nns
...,...,...,...
1755,[FL] In June my wife got a speeding ticket and...,legaladvice,t3_djt05l
1756,Domestic Violence MichiganI’m seeking advice f...,legaladvice,t3_djvyvt
1757,My future ex wife is trying to get our dogs ce...,legaladvice,t3_djblj9
1758,Rented apartment is at 85% humidity. Building ...,legaladvice,t3_djsw8k


In [86]:
# Remove any duplicates if there are any
reddit_df = reddit_df[~reddit_df.duplicated(subset='id')]

In [87]:
# Remove id column
reddit_df = reddit_df[['content','topic']]

In [88]:
# Get the shape of the dataframe
reddit_df.shape

(1760, 2)

In [91]:
# Show data from each class
reddit_df.groupby(by='topic').count()

Unnamed: 0_level_0,content
topic,Unnamed: 1_level_1
AskDocs,770
legaladvice,990


## Baseline accuracy

In [94]:
reddit_df['topic'].value_counts(normalize=True)

legaladvice    0.5625
AskDocs        0.4375
Name: topic, dtype: float64

### Our baseline accuracy is as above for each class

There is a slight imbalance of the observations between the two classes, thus we need to stratify the content when splitting the data during k-fold cross validation.

## NLP Pipeline

In [95]:
# One hot encode the target class
y = reddit_df['topic'].map(lambda x : 1 if x == 'AskDocs' else 0 )

In [97]:
# Create X
X = reddit_df['content']

### Train test split

In [117]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [120]:
print(f'X_train:{X_train.shape} y_train:{y_train.shape} X_test:{X_test.shape} y_test:{y_test.shape}')

X_train:(1320,) y_train:(1320,) X_test:(440,) y_test:(440,)


### Count Vectorizer pipeline

In [109]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer()

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

from sklearn.model_selection import GridSearchCV

In [133]:
cvec_logreg_pipeline = Pipeline(steps=[('cvec', cvec),('logreg',logreg)])
# Show the parameters
# cvec_logreg_pipeline.get_params()

In [134]:
cvec_logreg_params = {'cvec__stop_words':['english'], 'cvec__max_df':[0.7, 1.0],'logreg__solver':['liblinear'],'logreg__C':[1,10] }
cvec_logreg_gscv = GridSearchCV(cvec_logreg_pipeline, cvec_logreg_params, cv=5)
cvec_logreg_gscv.fit(X_train,y_train);

In [135]:
def get_scores(model):
    print(f'Training Score:{model.score(X_train,y_train)}')
    print(f'Testing Score:{model.score(X_test,y_test)}')


In [136]:
get_scores(gs)

Training Score:1.0
Testing Score:0.9772727272727273
