# Import Libraries

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import re
import json
import time

from psaw import PushshiftAPI
import praw


from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import math

%matplotlib inline
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


import warnings
warnings.filterwarnings('ignore')

---

### Webscrapping from subreddit `r/TheOnion` (need not run the code as it has already been exported in `data` folder)

In [228]:
url = 'https://api.pushshift.io/reddit/search/submission'
onion_df = pd.DataFrame()
params_onion = {
    'subreddit':'TheOnion',
    'fields':['subreddit','title','removed_by_category','created_utc',
              'selftext','score','num_comments'],
    'size': 100,
    'before':1635120000
}

while len(onion_df)<10000:
    onion_res = requests.get(url, params_onion)
    if onion_res.status_code==200:
        onion_add_df = pd.DataFrame(onion_res.json()['data'])
        onion_df = onion_df.append(onion_add_df)
        onion_df.drop(onion_df.index[onion_df['removed_by_category'].notnull()], inplace=True)
        params_onion['before'] = onion_add_df['created_utc'].iloc[-1]
        time.sleep(0.01)
        print(len(onion_df))
    else:
        time.sleep(1)

88
166
249
324
390
447
491
493
484
514
537
549
561
604
633
642
665
661
680
557
545
590
594
609
693
735
710
791
891
991
1091
1191
1291
1391
1491
1591
1691
1791
1891
1991
2091
2191
2291
2391
2491
2591
2691
2791
2891
2991
3091
3191
3291
3391
3491
3591
3691
3791
3891
3991
4091
4191
4291
4391
4491
4591
4691
4791
4891
4976
5076
5176
5276
5376
5476
5576
5676
5776
5876
5976
6076
6176
6276
6376
6476
6576
6676
6776
6876
6976
7076
7176
7276
7376
7476
7576
7676
7776
7876
7976
8076
8176
8276
8376
8476
8576
8676
8776
8876
8976
9076
9176
9276
9376
9476
9576
9676
9776
9876
9976
10076


In [232]:
onion_df.to_csv('../data/df_TheOnion_10000.csv')

___

### Webscrapping from subreddit `r/news` (need not run the code as it has already been exported in `data` folder)

In [172]:
import time
start_time = time.perf_counter()

url = 'https://api.pushshift.io/reddit/search/submission'

news_df=pd.DataFrame()

param_news = {
    'subreddit':'news',
    'fields':['subreddit','title','score','selftext','removed_by_category','created_utc', 'num_comments'],
    'size':100,
    'before':1635405550,
}

news_res= requests.get(url,param_news)
data = news_res.json()
posts = data['data']
news_add_df = pd.DataFrame(posts)

news_df = news_df.append(news_add_df[news_add_df['removed_by_category'].isnull()])


last_time = news_df['created_utc'].iloc[-1]

while len(news_df)<10000:
    param_news = {
        'subreddit':'news',
        'fields':['subreddit','title','score','selftext','removed_by_category','created_utc', 'num_comments'],
        'size':100,
        'before': last_time
    }
    
    news_res= requests.get(url,param_news)
    data = news_res.json()
    posts = data['data']
    news_add_df = pd.DataFrame(posts)
    news_df = news_df.append(news_add_df[news_add_df['removed_by_category'].isnull()])

    # changing value of the 'before' key in param_news
    last_time = news_df['created_utc'].iloc[-1]
    time.sleep(0.001)
    
    print(len(news_df))
    
stop_time = time.perf_counter()

print(stop_time - start_time)

news_df


28
47
63
80
99
118
124
133
146
166
192
214
226
236
242
251
264
286
312
332
349
364
374
379
383
391
410
420
433
453
468
477
496
515
526
542
550
555
572
586
599
616
635
647
657
665
685
702
713
729
746
763
787
799
806
815
827
850
871
895
917
939
961
971
976
985
1005
1025
1046
1062
1077
1086
1093
1104
1134
1162
1179
1190
1202
1221
1230
1233
1237
1261
1280
1299
1312
1322
1341
1353
1375
1389
1398
1404
1410
1427
1446
1462
1484
1510
1535
1544
1553
1565
1583
1604
1622
1641
1653
1670
1688
1697
1702
1713
1722
1740
1766
1784
1799
1816
1834
1841
1849
1862
1886
1906
1926
1952
1980
1988
2000
2020
2042
2063
2077
2089
2099
2103
2111
2131
2158
2178
2191
2196
2207
2225
2239
2252
2271
2279
2288
2305
2329
2341
2366
2393
2409
2419
2426
2445
2459
2468
2485
2505
2526
2552
2561
2573
2588
2603
2613
2620
2639
2652
2661
2670
2678
2692
2697
2701
2714
2725
2731
2739
2743
2752
2763
2777
2797
2816
2837
2849
2862
2868
2879
2893
2917
2930
2952
2957
2967
2981
2998
3012
3027
3028
3036
3042
3061
3073
3087
3107
3120
3131
3

Unnamed: 0,created_utc,num_comments,removed_by_category,score,selftext,subreddit,title
17,1635403217,0,,1,,news,"U.S. issues its first passport with an ""X"" gen..."
23,1635402170,0,,1,,news,US issues first passport with 'X' gender marke...
34,1635400799,0,,1,,news,Long Beach school safety officer who shot teen...
35,1635400706,0,,1,,news,Submission filed with International Criminal C...
42,1635400246,0,,1,,news,Woman sues Kellogg's for $5 million over Frost...
44,1635399892,0,,1,,news,Google rolls out tool to help minors delete ph...
46,1635399288,0,,1,,news,Facebook has known it has a human trafficking ...
69,1635395964,0,,1,,news,U.S. issues first passport with 'X' gender marker
74,1635394907,1,,1,,news,State of Florida Reaches Lowest COVID-19 Case ...
76,1635394717,0,,1,,news,Flight from New York to Santa Ana diverted aft...


In [None]:
news_df.to_csv("../data/df_news_10000.csv")