In [1]:
!pip install praw psaw pmaw textblob emoji langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting praw
  Downloading praw-7.6.0-py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 5.2 MB/s 
[?25hCollecting psaw
  Downloading psaw-0.1.0-py3-none-any.whl (15 kB)
Collecting pmaw
  Downloading pmaw-2.1.3-py3-none-any.whl (25 kB)
Collecting emoji
  Downloading emoji-2.1.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 49.8 MB/s 
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 44.3 MB/s 
[?25hCollecting websocket-client>=0.54.0
  Downloading websocket_client-1.4.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.3 MB/s 
[?25hCollecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Building wheels for collected packa

In [2]:
import praw
import pandas as pd
from psaw import PushshiftAPI
import math
import json
import requests
import itertools
import numpy as np
import time
import datetime as dt
import regex
from textblob import TextBlob
import langdetect
from langdetect import detect, DetectorFactory
import emoji
import warnings
warnings.filterwarnings('ignore')
DetectorFactory.seed = 0

# Crawl data

In [None]:
# Pushshift client; handed to scrape() to run the submission search.
api = PushshiftAPI()
# Subreddits whose submissions are crawled.
subreddits = ['FinanceNews', 'Economics', 'SecurityAnalysis', 'finance', 'business', 'econmonitor']
# Crawl window as epoch seconds. The bounds are pinned to UTC explicitly:
# .timestamp() on a naive datetime is interpreted in the machine's local
# timezone, which would shift the window depending on where the notebook runs.
start_time = int(dt.datetime(2019, 1, 1, tzinfo=dt.timezone.utc).timestamp())
end_time = int(dt.datetime(2022, 10, 14, tzinfo=dt.timezone.utc).timestamp())


In [3]:
def is_english(text: str) -> bool:
    '''
    Reports whether langdetect classifies the given string as English ("en").
    Returns False for any other detected language and when detection fails.
    '''
    try:
        return detect(text) == "en"
    except langdetect.lang_detect_exception.LangDetectException:
        # Detection could not produce a result for this text; treat it as not English.
        return False


def contains_emoji(text: str) -> bool:
    '''
    Returns True if the string contains an emoji.

    The text is split into extended grapheme clusters (user-perceived
    characters). A cluster counts as an emoji when either the cluster as a
    whole or any single code point inside it is recognised by emoji.is_emoji.
    '''
    for cluster in regex.findall(r'\X', text):
        # Check the whole cluster first: multi-code-point emoji (e.g. flags or
        # keycaps) may be built from code points that are not emoji on their own.
        if emoji.is_emoji(cluster) or any(emoji.is_emoji(char) for char in cluster):
            return True

    return False

In [None]:
def scrape(api, subreddits, start_time, end_time):
    '''
    Searches submissions through api.search_submissions and returns the ones
    that pass the filters below as a list of dicts.

    api:         object exposing search_submissions(after=, before=, subreddit=, num_comments=)
    subreddits:  value passed as the subreddit filter
    start_time:  value passed as the "after" bound
    end_time:    value passed as the "before" bound

    Each returned dict has the keys id, title, score, external_url, author and
    submitted_time. Any exception raised while handling a single result is
    printed together with that result, and the result is skipped.
    '''
    hits = list(api.search_submissions(after=start_time, before=end_time, subreddit=subreddits, num_comments=">1"))
    kept = []
    for post in hits:
        try:
            headline = post.title
            # Skip weekly threads and every post with a non-empty selftext
            # (this also covers a selftext of "[removed]").
            if "Weekly Questions Thread" in headline or post.selftext:
                continue
            # Filter out titles with emojis as that is probably a spam
            if contains_emoji(headline) or not is_english(headline):
                continue
            # Skip results that carry a removed_by_category attribute.
            if hasattr(post, 'removed_by_category'):
                continue
            kept.append({
                'id': post.id,
                'title': headline,
                'score': post.score,
                'external_url': post.url,
                'author': post.author,
                'submitted_time': post.created_utc,
            })
        except Exception as err:
            print(err)
            print(post)
    return kept

In [None]:
submissions = scrape(api, subreddits, start_time, end_time)

In [None]:
finances_submissions = pd.DataFrame(submissions)
finances_submissions.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time
0,y1bzjo,A New Silicon Valley Emerges at the Arctic Circle,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,1665502151
1,xy7uim,US to boost gas exports to Germany under deal ...,1,https://www.ft.com/content/064437a4-2c60-4962-...,Soupjoe5,1665170730
2,xx3ryr,‘Uninvestable’ UK Market Lost £300 Billion in ...,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,1665058536
3,xw9y55,German exports beat expectations despite cooli...,1,https://www.reuters.com/markets/europe/german-...,Soupjoe5,1664974592
4,xviszy,French start-ups to become ‘European champions...,1,https://www.euractiv.com/section/digital/news/...,Soupjoe5,1664899124


In [None]:
finances_submissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28599 entries, 0 to 28598
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              28599 non-null  object
 1   title           28599 non-null  object
 2   score           28599 non-null  int64 
 3   external_url    28599 non-null  object
 4   author          28599 non-null  object
 5   submitted_time  28599 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.3+ MB


In [None]:
# Write the crawled submissions to CSV; the file name embeds the subreddit list.
name = '_'.join(subreddits)
# NOTE(review): there is no separator between the 'finances_submissions' prefix
# and the joined subreddit names in the file name — confirm this is intended.
finances_submissions.to_csv(f'finances_submissions{name}.csv', index=False)  

# **Data preprocessing**

In [None]:
# remove duplicated url and title
# keep=False drops every row whose url (then title) occurs more than once,
# including the first occurrence, so repeated links/titles vanish entirely.
# NOTE(review): if one copy of each repeated post should survive, keep='first'
# would be needed instead — confirm the intent.
no_dup_df = finances_submissions.drop_duplicates(subset=['external_url'], keep=False)
no_dup_df = no_dup_df.drop_duplicates(subset=['title'], keep=False)
no_dup_df.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time
0,y1bzjo,A New Silicon Valley Emerges at the Arctic Circle,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,1665502151
1,xy7uim,US to boost gas exports to Germany under deal ...,1,https://www.ft.com/content/064437a4-2c60-4962-...,Soupjoe5,1665170730
2,xx3ryr,‘Uninvestable’ UK Market Lost £300 Billion in ...,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,1665058536
3,xw9y55,German exports beat expectations despite cooli...,1,https://www.reuters.com/markets/europe/german-...,Soupjoe5,1664974592
4,xviszy,French start-ups to become ‘European champions...,1,https://www.euractiv.com/section/digital/news/...,Soupjoe5,1664899124


In [None]:
no_dup_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26396 entries, 0 to 28598
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              26396 non-null  object
 1   title           26396 non-null  object
 2   score           26396 non-null  int64 
 3   external_url    26396 non-null  object
 4   author          26396 non-null  object
 5   submitted_time  26396 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.4+ MB


In [91]:
from datetime import datetime
# convert a Unix timestamp to its UTC calendar date
def convert_time(created_utc):
    '''
    Converts a Unix timestamp (seconds since the epoch) to a calendar date.

    created_utc: epoch seconds
    Returns the datetime.date of that instant in UTC.
    '''
    # Timezone-aware conversion; datetime.utcfromtimestamp is deprecated as of
    # Python 3.12 and yields the same UTC wall-clock date.
    return datetime.fromtimestamp(created_utc, tz=dt.timezone.utc).date()

# Overwrite the epoch-second column with calendar dates. Because the column is
# replaced in place, re-running this cell would feed date objects back into
# convert_time instead of timestamps.
no_dup_df['submitted_time'] = no_dup_df['submitted_time'].apply(convert_time)

In [93]:
import requests
# remove invalid url type: image, video, reddit
def isInvalidUrl(path):
    '''
    Returns True when the URL should be discarded, False otherwise.

    A URL is discarded when it ends with an image extension (jpg, png, jpeg)
    or contains a reddit or YouTube host name. Matching is case-insensitive.

    path: URL string to inspect
    '''
    url = path.lower()
    # The previous check compared a one-character slice with 'jpeg', which can
    # never match; endswith with a tuple covers all three extensions.
    if url.endswith(('jpg', 'png', 'jpeg')):
        return True
    blocked_hosts = ('reddit.com', 'redd.it', 'youtu.be', 'youtube.com')
    return any(host in url for host in blocked_hosts)

In [94]:
# Flag each row by running isInvalidUrl on its external link.
no_dup_df['isInvalidLink'] = no_dup_df['external_url'].apply(isInvalidUrl)
no_dup_df.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time,isImage,isInvalidLink
0,y1bzjo,A New Silicon Valley Emerges at the Arctic Circle,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-11,False,False
1,xy7uim,US to boost gas exports to Germany under deal ...,1,https://www.ft.com/content/064437a4-2c60-4962-...,Soupjoe5,2022-10-07,False,False
2,xx3ryr,‘Uninvestable’ UK Market Lost £300 Billion in ...,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-06,False,False
3,xw9y55,German exports beat expectations despite cooli...,1,https://www.reuters.com/markets/europe/german-...,Soupjoe5,2022-10-05,False,False
4,xviszy,French start-ups to become ‘European champions...,1,https://www.euractiv.com/section/digital/news/...,Soupjoe5,2022-10-04,False,False


In [96]:
# Keep only the rows whose link was not flagged as invalid.
no_dup_invalidurl_df = no_dup_df[~no_dup_df['isInvalidLink']]
no_dup_invalidurl_df.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time,isInvalidLink
0,y1bzjo,A New Silicon Valley Emerges at the Arctic Circle,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-11,False
1,xy7uim,US to boost gas exports to Germany under deal ...,1,https://www.ft.com/content/064437a4-2c60-4962-...,Soupjoe5,2022-10-07,False
2,xx3ryr,‘Uninvestable’ UK Market Lost £300 Billion in ...,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-06,False
3,xw9y55,German exports beat expectations despite cooli...,1,https://www.reuters.com/markets/europe/german-...,Soupjoe5,2022-10-05,False
4,xviszy,French start-ups to become ‘European champions...,1,https://www.euractiv.com/section/digital/news/...,Soupjoe5,2022-10-04,False


In [97]:
no_dup_invalidurl_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23919 entries, 0 to 26395
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              23919 non-null  object
 1   title           23919 non-null  object
 2   score           23919 non-null  int64 
 3   external_url    23919 non-null  object
 4   author          23919 non-null  object
 5   submitted_time  23919 non-null  object
 6   isInvalidLink   23919 non-null  bool  
dtypes: bool(1), int64(1), object(5)
memory usage: 1.3+ MB


#### **Get posts with score > 1**

Posts with a higher score may contain real or useful news.

In [98]:
# Keep only the posts whose score is greater than 1.
no_dup_score_df = no_dup_invalidurl_df[no_dup_invalidurl_df['score']>1]
no_dup_score_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6509 entries, 41 to 25927
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              6509 non-null   object
 1   title           6509 non-null   object
 2   score           6509 non-null   int64 
 3   external_url    6509 non-null   object
 4   author          6509 non-null   object
 5   submitted_time  6509 non-null   object
 6   isInvalidLink   6509 non-null   bool  
dtypes: bool(1), int64(1), object(5)
memory usage: 362.3+ KB


In [99]:
len(set(no_dup_score_df['author'].to_list()))

2330

In [100]:
no_dup_score_df.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time,isInvalidLink
41,wpthbo,Hedge fund Elliott dumps SoftBank stake after ...,2,https://www.ft.com/content/f9d0c388-9cda-4377-...,financialtimes,2022-08-16,False
45,ubsh1k,Twitter accepts Elon Musk’s buyout deal,5,https://www.cnbc.com/2022/04/25/twitter-accept...,jag316,2022-04-25,False
46,u47auc,China’s Key Economic Data to Show Price Paid f...,3,https://www.bloomberg.com/news/articles/2022-0...,Soupjoe5,2022-04-15,False
47,trsrdz,Young women earn more than young men in severa...,63,https://www.pewresearch.org/fact-tank/2022/03/...,9mac,2022-03-29,False
48,trs3ln,There are now a record 5 million more job open...,13,https://www.cnbc.com/2022/03/29/there-are-now-...,HRJafael,2022-03-29,False


In [101]:
# sort posts by submission date, oldest first
no_dup_score_df = no_dup_score_df.sort_values(by='submitted_time')

In [102]:
no_dup_score_df.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time,isInvalidLink
25927,ai3qae,Canadians cry oil crisis as pipeline problem g...,2,https://www.rt.com/business/449249-canadians-c...,r0cketeer88,2019-01-20,False
25763,akfuzc,20% of Americans in relationships are committi...,26,https://finance.yahoo.com/news/20-percent-amer...,Sirskywaves,2019-01-27,False
25765,akeobs,Apple's dismissal of 200 self-driving car empl...,3,https://www.cnbc.com/2019/01/26/apple-car-layo...,michapman2,2019-01-27,False
25764,akewyf,The World Economy Just Can’t Escape Its Low-Gr...,4,https://www.nytimes.com/2019/01/27/upshot/worl...,DoremusJessup,2019-01-27,False
25722,al2kac,Good twitter thread by Michael Mauboussin,2,https://twitter.com/mjmauboussin/status/109029...,intrix,2019-01-29,False


In [105]:
# The helper flag column is no longer needed once invalid links are filtered out.
no_dup_score_df = no_dup_score_df.drop(columns='isInvalidLink')

In [106]:
no_dup_score_df.to_csv("cleaned_score1.csv", index=False)

#### **Get posts with score ≤ 1**

Posts with a low score may contain invalid news links. We keep only the 4,000 most recent of these posts (sorted by time) and then combine them with the higher-score posts selected above.

In [108]:
# Select the remaining posts: every row whose score is below 2.
no_dup_score_less1_df = no_dup_invalidurl_df[no_dup_invalidurl_df['score']<2]
no_dup_score_less1_df.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time,isInvalidLink
0,y1bzjo,A New Silicon Valley Emerges at the Arctic Circle,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-11,False
1,xy7uim,US to boost gas exports to Germany under deal ...,1,https://www.ft.com/content/064437a4-2c60-4962-...,Soupjoe5,2022-10-07,False
2,xx3ryr,‘Uninvestable’ UK Market Lost £300 Billion in ...,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-06,False
3,xw9y55,German exports beat expectations despite cooli...,1,https://www.reuters.com/markets/europe/german-...,Soupjoe5,2022-10-05,False
4,xviszy,French start-ups to become ‘European champions...,1,https://www.euractiv.com/section/digital/news/...,Soupjoe5,2022-10-04,False


In [109]:
no_dup_score_less1_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17410 entries, 0 to 26395
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              17410 non-null  object
 1   title           17410 non-null  object
 2   score           17410 non-null  int64 
 3   external_url    17410 non-null  object
 4   author          17410 non-null  object
 5   submitted_time  17410 non-null  object
 6   isInvalidLink   17410 non-null  bool  
dtypes: bool(1), int64(1), object(5)
memory usage: 969.1+ KB


In [110]:
no_dup_score_less1_df[:4000]

Unnamed: 0,id,title,score,external_url,author,submitted_time,isInvalidLink
0,y1bzjo,A New Silicon Valley Emerges at the Arctic Circle,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-11,False
1,xy7uim,US to boost gas exports to Germany under deal ...,1,https://www.ft.com/content/064437a4-2c60-4962-...,Soupjoe5,2022-10-07,False
2,xx3ryr,‘Uninvestable’ UK Market Lost £300 Billion in ...,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-06,False
3,xw9y55,German exports beat expectations despite cooli...,1,https://www.reuters.com/markets/europe/german-...,Soupjoe5,2022-10-05,False
4,xviszy,French start-ups to become ‘European champions...,1,https://www.euractiv.com/section/digital/news/...,Soupjoe5,2022-10-04,False
...,...,...,...,...,...,...,...
4551,ok3n4y,Developing nations suffer from extreme freight...,1,https://splash247.com/?p=147080,chinozc,2021-07-14,False
4552,ok3j78,Opinion | Innovation Moves to Middle America,1,https://www.wsj.com/articles/innovation-moves-...,JayG-OK,2021-07-14,False
4553,ok398d,Opinion | Powell Gets His Inflation,1,https://www.wsj.com/articles/powell-gets-his-i...,JayG-OK,2021-07-14,False
4554,ok316h,Why Cuba is having an economic crisis,1,https://noahpinion.substack.com/p/why-cuba-is-...,investorinvestor,2021-07-14,False


Combine the two DataFrames

In [111]:
# Stack the first 4000 low-score rows on top of the higher-score rows.
# NOTE(review): the slice takes rows in their existing order; no sort is applied
# to no_dup_score_less1_df in the visible cells, so whether these are the
# "most recent" posts depends on the order the crawl returned them — confirm.
combined_df = pd.concat([no_dup_score_less1_df[:4000], no_dup_score_df])

In [112]:
combined_df.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time,isInvalidLink
0,y1bzjo,A New Silicon Valley Emerges at the Arctic Circle,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-11,False
1,xy7uim,US to boost gas exports to Germany under deal ...,1,https://www.ft.com/content/064437a4-2c60-4962-...,Soupjoe5,2022-10-07,False
2,xx3ryr,‘Uninvestable’ UK Market Lost £300 Billion in ...,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-06,False
3,xw9y55,German exports beat expectations despite cooli...,1,https://www.reuters.com/markets/europe/german-...,Soupjoe5,2022-10-05,False
4,xviszy,French start-ups to become ‘European champions...,1,https://www.euractiv.com/section/digital/news/...,Soupjoe5,2022-10-04,False


In [113]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10509 entries, 0 to 41
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              10509 non-null  object
 1   title           10509 non-null  object
 2   score           10509 non-null  int64 
 3   external_url    10509 non-null  object
 4   author          10509 non-null  object
 5   submitted_time  10509 non-null  object
 6   isInvalidLink   4000 non-null   object
dtypes: int64(1), object(6)
memory usage: 656.8+ KB


In [116]:
combined_df = combined_df[['id', 'title', 'score', 'external_url', 'author', 'submitted_time']]
combined_df.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time
0,y1bzjo,A New Silicon Valley Emerges at the Arctic Circle,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-11
1,xy7uim,US to boost gas exports to Germany under deal ...,1,https://www.ft.com/content/064437a4-2c60-4962-...,Soupjoe5,2022-10-07
2,xx3ryr,‘Uninvestable’ UK Market Lost £300 Billion in ...,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,2022-10-06
3,xw9y55,German exports beat expectations despite cooli...,1,https://www.reuters.com/markets/europe/german-...,Soupjoe5,2022-10-05
4,xviszy,French start-ups to become ‘European champions...,1,https://www.euractiv.com/section/digital/news/...,Soupjoe5,2022-10-04


In [117]:
combined_df.to_csv('final.csv', index=False)