In [41]:
import requests
import time
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Web scraping 

In [42]:
url = 'https://www.reddit.com/r/economy.json'

In [43]:
headers = {'User-agent': 'Hussam'}

In [44]:
res = requests.get(url, headers = headers)

In [45]:
res.status_code

200

In [46]:
the_json = res.json()

In [47]:
sorted(the_json.keys())

['data', 'kind']

In [48]:
sorted(the_json['data'].keys())

['after', 'before', 'children', 'dist', 'modhash']

In [49]:
the_json['data']['after']

't3_djmfdk'

In [50]:
[post['data']['name'] for post in the_json['data']['children']];

In [51]:
len(the_json['data']['children'])

26

In [52]:
pd.DataFrame(the_json['data']['children']);

In [53]:
param = {'after': 't3_dhofsa'}

In [54]:
requests.get(url,params=param,headers=headers)

<Response [200]>

In [55]:
posts_eco = []
after = None
for i in range(40):
    print(i)
    if after == None:
        params = {}
    else:
        params = {'after':after}
    url = 'https://www.reddit.com/r/economy.json'
    res = requests.get(url, params=params,headers=headers)
    if res.status_code == 200:
        the_json = res.json()
        posts_eco.extend(the_json['data']['children'])
        after = the_json['data']['after']
    else:
        print(res.status_code)
        break
    time.sleep(1)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [56]:
len(posts_eco)

994

In [57]:
posts_sport = []
after = None
for i in range(80):
    print(i)
    if after == None:
        params = {}
    else:
        params = {'after':after}
    url = 'https://www.reddit.com/r/sports.json'
    res = requests.get(url, params=params,headers=headers)
    if res.status_code == 200:
        the_json = res.json()
        posts_sport.extend(the_json['data']['children'])
        after = the_json['data']['after']
    else:
        print(res.status_code)
        break
    time.sleep(1)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79


### Do not run above! It will rescrap new data!

In [149]:
len(posts_sport)

1925

### Building the dataframe 

In [150]:
posts_eco_id = [posts_eco[i]['data']['id'] for i in range(len(posts_eco))]

In [151]:
posts_eco_title = [posts_eco[i]['data']['title'] for i in range(len(posts_eco))]

In [152]:
posts_eco_selftext = [posts_eco[i]['data']['selftext'] for i in range(len(posts_eco))]

In [153]:
df_eco = {}
df_eco['id'] = posts_eco_id
df_eco['title'] = posts_eco_title
df_eco['selftext'] = posts_eco_selftext
df_eco['label'] = 0

df_eco = pd.DataFrame(df_eco)
df_eco

Unnamed: 0,id,title,selftext,label
0,b1kvoh,Update,r/economy is for news and discussion of the ec...,0
1,djhjxl,Sallie Mae execs tan at Maui retreat while stu...,,0
2,djr9n3,Ray Dalio says the world is in a 'great sag' a...,,0
3,djlbs1,China's economic growth drops to lowest level ...,,0
4,djco44,The 'Glass Floor' Is Keeping America's Richest...,,0
...,...,...,...,...
989,djc1jw,The IMF warns that 40% of all corporate debt i...,,0
990,djkwjo,Cattle Mystery: Some Readers Have Beef,,0
991,djkmrp,Goldman wants traders to be more like dealmake...,,0
992,djhhy3,US GDP growth in Q3 2019 was 1.8% — Atlanta Fed,,0


In [154]:
posts_sport_id = [posts_sport[i]['data']['id'] 
                  for i in range(len(posts_sport))]

In [155]:
posts_sport_title = [posts_sport[i]['data']['title'] 
                     for i in range(len(posts_sport))]

In [156]:
posts_sport_selftext = [posts_sport[i]['data']['selftext']
                        for i in range(len(posts_sport))]

In [157]:
df_sport = {}
df_sport['id'] = posts_sport_id
df_sport['title'] = posts_sport_title
df_sport['selftext'] = posts_sport_selftext
df_sport['label'] = 1
df_sport = pd.DataFrame(df_sport)
df_sport

Unnamed: 0,id,title,selftext,label
0,djo3v2,Marathon Speed ​​Experience,,1
1,djfmoa,LeBron James pressured Adam Silver to punish R...,,1
2,dj6utr,The Fosbury Flop is a style used in the athlet...,,1
3,djokcs,"Storm Blizzcon 2019, They Can't Censor Us All",,1
4,djrb9j,Ryan Reaves Waits Patiently and Casually block...,,1
...,...,...,...,...
1920,dbe0dt,Sri Lankan team entering Karachi cricket stadium,,1
1921,dbh4l9,18 year old Mahuchikh smashing the high jump W...,,1
1922,dbdbj8,California Governor Signs Plan to Let N.C.A.A....,,1
1923,db0ax4,Panthers Christian McCaffrey makes a spectacul...,,1


In [158]:
df = pd.concat([df_eco,df_sport],ignore_index=False)

In [159]:
df

Unnamed: 0,id,title,selftext,label
0,b1kvoh,Update,r/economy is for news and discussion of the ec...,0
1,djhjxl,Sallie Mae execs tan at Maui retreat while stu...,,0
2,djr9n3,Ray Dalio says the world is in a 'great sag' a...,,0
3,djlbs1,China's economic growth drops to lowest level ...,,0
4,djco44,The 'Glass Floor' Is Keeping America's Richest...,,0
...,...,...,...,...
1920,dbe0dt,Sri Lankan team entering Karachi cricket stadium,,1
1921,dbh4l9,18 year old Mahuchikh smashing the high jump W...,,1
1922,dbdbj8,California Governor Signs Plan to Let N.C.A.A....,,1
1923,db0ax4,Panthers Christian McCaffrey makes a spectacul...,,1


### We save our data to a csv file

In [160]:
#df = pd.read_csv('./reddit_posts.csv',header = None, names=['id','title','selftext','label'])

In [161]:
df.to_csv('./reddit_posts.csv')

In [162]:
df.head()

Unnamed: 0,id,title,selftext,label
0,b1kvoh,Update,r/economy is for news and discussion of the ec...,0
1,djhjxl,Sallie Mae execs tan at Maui retreat while stu...,,0
2,djr9n3,Ray Dalio says the world is in a 'great sag' a...,,0
3,djlbs1,China's economic growth drops to lowest level ...,,0
4,djco44,The 'Glass Floor' Is Keeping America's Richest...,,0


### More data cleaning before modeling

In [163]:
df_eco.shape,df_sport.shape,df.shape

((994, 4), (1925, 4), (2919, 4))

In [164]:
df_eco['title'].nunique(),df_sport['title'].nunique()

(951, 335)

In [165]:
df['title'].nunique()

1286

In [166]:
df_eco['selftext'].nunique()

58

In [167]:
df.columns

Index(['id', 'title', 'selftext', 'label'], dtype='object')

In [168]:
df_e=pd.DataFrame(df_eco['id']+df_eco['title'] + df_eco['selftext'],columns =['a'])
df_e['a'].nunique()

968

In [169]:
df_p=pd.DataFrame(df_sport['id']+df_sport['title'] + df_sport['selftext'],columns =['b'])
df_p['b'].nunique()

335

In [170]:
df_all=pd.DataFrame(df['title'] + df['selftext'],columns =['c'])
df_all['c'].nunique()

1286

In [171]:
949+335

1284

In [172]:
df = df.drop_duplicates()

In [173]:
df = df.reset_index()

In [174]:
df

Unnamed: 0,index,id,title,selftext,label
0,0,b1kvoh,Update,r/economy is for news and discussion of the ec...,0
1,1,djhjxl,Sallie Mae execs tan at Maui retreat while stu...,,0
2,2,djr9n3,Ray Dalio says the world is in a 'great sag' a...,,0
3,3,djlbs1,China's economic growth drops to lowest level ...,,0
4,4,djco44,The 'Glass Floor' Is Keeping America's Richest...,,0
...,...,...,...,...,...
1298,330,d6zeya,High school running back gets flipped over and...,,1
1299,331,d7jh73,Youth sports officials and referees quitting a...,,1
1300,332,d7gpv2,Essex's captain Simon Harmer lifts the Vitalit...,,1
1301,333,d7ahzz,Australia v Fiji (39-21) | Rugby World Cup 2019,,1


In [175]:
df['selftext'].isnull().sum()


0

In [176]:
df['selftext'].nunique()

58

In [177]:
df.drop('index', inplace=True, axis=1)

In [178]:
df.columns

Index(['id', 'title', 'selftext', 'label'], dtype='object')

In [179]:
!cat reddit_posts.csv | top ~10

invalid option or syntax: ~10
top usage: top
		[-a | -d | -e | -c <mode>]
		[-F | -f]
		[-h]
		[-i <interval>]
		[-l <samples>]
		[-ncols <columns>]
		[-o <key>] [-O <secondaryKey>]
			keys: pid (default), command, cpu, cpu_me, cpu_others, csw,
				time, threads, ports, mregion, mem, rprvt, purg, vsize, vprvt,
				kprvt, kshrd, pgrp, ppid, state, uid, wq, faults, cow, user,
				msgsent, msgrecv, sysbsd, sysmach, pageins, boosts, instrs, cycles
		[-R | -r]
		[-S]
		[-s <delay>]
		[-n <nprocs>]
		[-stats <key(s)>]
		[-pid <processid>]
		[-user <username>]
		[-U <username>]
		[-u]

cat: stdout: Broken pipe


### Assigning the X and y variables

In [180]:
X = df['title'] + df['selftext']
y = df['label']

In [181]:
y.value_counts()

0    968
1    335
Name: label, dtype: int64

### Splitting into train and test sets

In [182]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                         y, random_state=42, stratify=y)

### Scikit-Learn CounterVectorizer

In [183]:
# Instantiate a CountVectorizer
cvec = CountVectorizer()

In [184]:
# Fit the vectorizer on our corpus
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [185]:
# Transform the corpus.
X_train = cvec.transform(X_train)

In [186]:
# Convert X_train into a DataFrame.

X_train_df = pd.DataFrame(X_train.toarray(),
                          columns=cvec.get_feature_names())
X_train_df

Unnamed: 0,00,000,000m,012,04,05,09,10,100,1000,...,zero,zhiyong,zion,zombie,zone,zte,zuckerberg,zuerlein,zurich,émission
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
973,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
975,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [187]:
# Transform test
X_test = cvec.transform(X_test)
X_test_df = pd.DataFrame(X_test.toarray(),
                         columns=cvec.get_feature_names())

### Stop Words

In [188]:
from sklearn.feature_extraction import stop_words

### Vocabulary size

In [189]:
cvec = CountVectorizer(max_features=1000)

### N-Gram Range

In [190]:
cvec = CountVectorizer(ngram_range=(1,2))

### Modeling

In [191]:
# Redefine training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=42)

### Baseline accuracy

In [192]:
y_test.value_counts(normalize=True)

0    0.741688
1    0.258312
Name: label, dtype: float64

In [193]:
# We will set it up with two stages:
# 1) An instance of CountVectorizer (transformer)
# 2) A LogisticRegression instance (estimator)

pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])


### GridSearchCV

In [194]:
# We evaluate how our model will perform on unseen data
cross_val_score(pipe, X_train, y_train, cv=3).mean() 

0.8837719298245613

In [195]:
# We fit our model 
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [196]:
# Training score
pipe.score(X_train, y_train)

0.9989035087719298

In [197]:
# Test score
pipe.score(X_test, y_test)

0.887468030690537

In [198]:
# Search over the following values of hyperparameters:
# Max # of features fit: 2500, 3000, 3500
# Min # of documents needed to include token: 2, 3
# Max # of documents needed to include token: 90%, 95%
# Check (individual tokens) and also check (individual tokens and bigrams).

pipe_params = {
    'cvec__max_features': [1000, 2000, 3000],
    'cvec__min_df': [1, 2, 3],
    'cvec__max_df': [.05, .15, 0.25],
    'cvec__ngram_range': [(1,1), (1,2)]
}

In [199]:
# Instantiate GridSearchCV.

gs = GridSearchCV(pipe, # what object are we optimizing?
                  param_grid=pipe_params, # what parameters values are we searching?
                  cv=3) # 3-fold cross-validation.

In [200]:
# Fit GridSearch to training data.
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cvec__max_features': [1000, 2000, 3000], 'cvec__min_df': [1, 2, 3], 'cvec__max_df': [0.05, 0.15, 0.25], 'cvec__ngram_range': [(1, 1), (1, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [201]:
# Best score
gs.best_score_

0.8903508771929824

In [209]:
# save best model
gs_model = gs.best_estimator_

In [212]:
gs_model

Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.25, max_features=2000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        stri...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [213]:
# lr = gs_model['lr']
# coef = lr.coef_
# cvec=gs_model['cvec']
# words=cvec.get_feature_names()
# df_coef = pd.DataFrame({'words':words,'coef':coef})
# df_coef.head()

In [214]:
# score model on training set
gs_model.score(X_train, y_train)

0.9890350877192983

In [215]:
# score model on test set
gs_model.score(X_test, y_test)

0.8951406649616368

### Modeling using TfidfVectorizer

In [216]:
# Fit the transformer.
tvec = TfidfVectorizer()

In [217]:
df = pd.DataFrame(tvec.fit_transform(X_train).toarray(),
                  columns=tvec.get_feature_names())

In [218]:
X_train = tvec.fit_transform(X_train)

X_test = tvec.transform(X_test)

In [219]:
# Instantiate logistic regression.
lr = LogisticRegression()

# Fit logistic regression.
lr.fit(X_train, y_train)

# Evaluate logistic regression.
print(f'Training Score: {lr.score(X_train, y_train)}')
print(f'Testing Score: {lr.score(X_test, y_test)}')

Training Score: 0.8760964912280702
Testing Score: 0.8312020460358056


### Decision Tree Classifier 

In [226]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

In [227]:
# Instantiate `DecisionTreeClassifier` object.
tree = DecisionTreeClassifier(random_state=42)

In [228]:
# Fit and score on the training data.
tree.fit(X_train, y_train)
tree.score(X_train, y_train)

1.0

In [229]:
# Score on the testing data.
tree.score(X_test, y_test)

0.8388746803069054

In [230]:
tree.feature_importances_

array([0., 0., 0., ..., 0., 0., 0.])