In [None]:
!pip install praw psaw pmaw textblob emoji langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting praw
  Downloading praw-7.6.0-py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 4.2 MB/s 
[?25hCollecting asyncpraw
  Downloading asyncpraw-7.5.0-py3-none-any.whl (183 kB)
[K     |████████████████████████████████| 183 kB 51.1 MB/s 
[?25hCollecting psaw
  Downloading psaw-0.1.0-py3-none-any.whl (15 kB)
Collecting pmaw
  Downloading pmaw-2.1.3-py3-none-any.whl (25 kB)
Collecting emoji
  Downloading emoji-2.1.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 64.8 MB/s 
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 42.8 MB/s 
[?25hCollecting websocket-client>=0.54.0
  Downloading websocket_client-1.4.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 1.8 MB/s 
[?25hCollecting update-checker>=0.18
  Downloading update_checker-0.18.0

In [None]:
import praw
import pandas as pd
from psaw import PushshiftAPI
import math
import json
import requests
import itertools
import numpy as np
import time
import datetime as dt
import regex
from textblob import TextBlob
import langdetect
from langdetect import detect, DetectorFactory
import emoji
import warnings
# Suppress every Python warning for the rest of the session (presumably to keep
# cell output readable — note this also hides deprecation warnings).
warnings.filterwarnings('ignore')
# Fix langdetect's seed so detect() gives the same answer for the same text on
# every run (per langdetect's README its detection is otherwise non-deterministic).
DetectorFactory.seed = 0

# Crawl data from finance-related subreddits

In [None]:
def is_english(text: str) -> bool:
    '''
    Report whether langdetect classifies ``text`` as English.

    Returns True only when detection succeeds and yields "en". Any other
    detected language, or a LangDetectException raised by the detector,
    gives False.
    '''
    try:
        language = detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        # Detection failed for this text; treat it as not English.
        return False
    return language == "en"


def contains_emoji(text: str) -> bool:
    '''
    Return True if ``text`` contains at least one emoji.

    The text is split into extended grapheme clusters (regex ``\\X``). A cluster
    counts as an emoji when either the whole cluster or any single character in
    it is recognised by ``emoji.is_emoji``. Checking the whole cluster catches
    emoji that only exist as multi-code-point sequences (e.g. country flags and
    keycaps), whose individual code points are not emoji on their own.

    Parameters
    ----------
    text : str
        String to inspect; an empty string yields False.

    Returns
    -------
    bool
    '''
    for cluster in regex.findall(r'\X', text):
        if emoji.is_emoji(cluster):
            return True
        # Fallback for clusters such as ZWJ sequences or emoji followed by
        # modifiers, where a component character is itself an emoji.
        if any(emoji.is_emoji(char) for char in cluster):
            return True

    return False

In [None]:
def scrape(api, subreddits, start_time, end_time):
    '''
    Collect link submissions from the given subreddits within a time window.

    Queries ``api.search_submissions`` for submissions with more than one
    comment and keeps a submission only if all of the following hold:

    * its title does not contain "Weekly Questions Thread";
    * it has no self text. A record without a ``selftext`` attribute is
      treated as having none; "[removed]" is non-empty and is therefore skipped;
    * its title contains no emoji and is detected as English;
    * it has no ``removed_by_category`` attribute.

    Parameters
    ----------
    api : object exposing ``search_submissions(after=, before=, subreddit=, num_comments=)``
    subreddits : forwarded unchanged as the ``subreddit`` filter
    start_time, end_time : forwarded as ``after`` and ``before``

    Returns
    -------
    list of dict
        One dict per kept submission with the keys ``id``, ``title``,
        ``score``, ``external_url``, ``author`` and ``submitted_time``.

    A record that raises while being processed (e.g. a required field is
    missing) is printed together with the error and skipped, so one bad
    record does not abort the crawl.
    '''
    submissions = []
    for res in api.search_submissions(after=start_time, before=end_time,
                                      subreddit=subreddits, num_comments=">1"):
        try:
            # Some records come back without a `selftext` field at all; reading
            # it directly raised AttributeError and silently dropped the record.
            selftext = getattr(res, 'selftext', '')
            if "Weekly Questions Thread" in res.title or selftext:
                continue
            # Filter out titles with emojis as that is probably a spam
            if contains_emoji(res.title) or not is_english(res.title):
                continue
            if hasattr(res, 'removed_by_category'):
                continue
            submissions.append({
                'id': res.id,
                'title': res.title,
                'score': res.score,
                'external_url': res.url,
                'author': res.author,
                'submitted_time': res.created_utc,
            })
        except Exception as e:
            # Best-effort crawl: report the offending record and carry on.
            print(e)
            print(res)
    return submissions

In [None]:
# Pushshift client used by scrape() to search historical Reddit submissions.
api = PushshiftAPI()
# Subreddits to crawl; passed to scrape() as the `subreddit` filter and later
# joined with "_" to build the output CSV file name.
subreddits = ['FinanceNews', 'Economics', 'SecurityAnalysis', 'finance', 'business', 'econmonitor']

# Crawl window as Unix epoch seconds (passed to scrape() as after/before).
# The datetimes are pinned to UTC: .timestamp() on a naive datetime is
# interpreted in the machine's local timezone, which would shift the window
# depending on where the notebook runs.
start_time = int(dt.datetime(2019, 1, 1, tzinfo=dt.timezone.utc).timestamp())
end_time = int(dt.datetime(2022, 10, 14, tzinfo=dt.timezone.utc).timestamp())


In [None]:
# Run the crawl over the configured subreddits and time window. Records that
# raise inside scrape() are printed in this cell's output and skipped.
submissions = scrape(api, subreddits, start_time, end_time)

'submission' object has no attribute 'selftext'
submission(all_awardings=[], allow_live_comments=False, author='[deleted]', author_flair_background_color='', author_flair_css_class=None, author_flair_text=None, author_flair_text_color='dark', awarders=[], banned_by='moderators', can_mod_post=False, contest_mode=False, created_utc=1624077284, domain='self.business', full_link='https://www.reddit.com/r/business/comments/o3881i/welcome_to_broperty_indonesian_property_trading/', gildings={}, id='o3881i', is_created_from_ads_ui=False, is_crosspostable=False, is_meta=False, is_original_content=False, is_reddit_media_domain=False, is_robot_indexable=False, is_self=True, is_video=False, link_flair_background_color='', link_flair_richtext=[], link_flair_text_color='dark', link_flair_type='text', locked=False, media_only=False, no_follow=True, num_comments=2, num_crossposts=0, over_18=False, parent_whitelist_status='all_ads', permalink='/r/business/comments/o3881i/welcome_to_broperty_indonesian_

In [None]:

# One row per kept submission; columns come from the dict keys built in scrape().
finances_submissions = pd.DataFrame(submissions)
finances_submissions.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time
0,y1bzjo,A New Silicon Valley Emerges at the Arctic Circle,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,1665502151
1,xy7uim,US to boost gas exports to Germany under deal ...,1,https://www.ft.com/content/064437a4-2c60-4962-...,Soupjoe5,1665170730
2,xx3ryr,‘Uninvestable’ UK Market Lost £300 Billion in ...,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,1665058536
3,xw9y55,German exports beat expectations despite cooli...,1,https://www.reuters.com/markets/europe/german-...,Soupjoe5,1664974592
4,xviszy,French start-ups to become ‘European champions...,1,https://www.euractiv.com/section/digital/news/...,Soupjoe5,1664899124


In [None]:
# Check row count, dtypes and non-null counts of the crawled frame.
finances_submissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28599 entries, 0 to 28598
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              28599 non-null  object
 1   title           28599 non-null  object
 2   score           28599 non-null  int64 
 3   external_url    28599 non-null  object
 4   author          28599 non-null  object
 5   submitted_time  28599 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.3+ MB


In [None]:
# Persist the crawl to CSV; the file name embeds the subreddit list joined by "_".
# NOTE(review): there is no separator between "finances_submissions" and the
# first subreddit name — confirm whether that is intended before changing it,
# since other code may read this exact path.
name = '_'.join(subreddits)
finances_submissions.to_csv(f'finances_submissions{name}.csv', index=False)  

# Data preprocessing

In [None]:
# Look at one full record to see the fields available for preprocessing.
finances_submissions.iloc[0]

id                                                           qksikv
title             EV startup Rivian could be worth nearly as muc...
score                                                             2
external_url      https://finance.yahoo.com/news/ev-startup-rivi...
author                                                    Albythere
submitted_time                                           1635814605
Name: 0, dtype: object

In [None]:
# Remove submissions whose external URL appears more than once.
# keep=False drops EVERY row sharing a repeated URL (no copy is retained),
# so only URLs posted exactly once survive.
# NOTE(review): if one copy of each repeated URL should be kept, use keep='first' — confirm intent.
no_dup_df = finances_submissions.drop_duplicates(subset=['external_url'], keep=False)
no_dup_df.head()

Unnamed: 0,id,title,score,external_url,author,submitted_time
0,y1bzjo,A New Silicon Valley Emerges at the Arctic Circle,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,1665502151
1,xy7uim,US to boost gas exports to Germany under deal ...,1,https://www.ft.com/content/064437a4-2c60-4962-...,Soupjoe5,1665170730
2,xx3ryr,‘Uninvestable’ UK Market Lost £300 Billion in ...,1,https://www.bloomberg.com/news/articles/2022-1...,Soupjoe5,1665058536
3,xw9y55,German exports beat expectations despite cooli...,1,https://www.reuters.com/markets/europe/german-...,Soupjoe5,1664974592
4,xviszy,French start-ups to become ‘European champions...,1,https://www.euractiv.com/section/digital/news/...,Soupjoe5,1664899124


In [None]:
# Row count after de-duplication; compare with the raw frame's .info() above.
no_dup_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26926 entries, 0 to 28598
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              26926 non-null  object
 1   title           26926 non-null  object
 2   score           26926 non-null  int64 
 3   external_url    26926 non-null  object
 4   author          26926 non-null  object
 5   submitted_time  26926 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.4+ MB


In [None]:
# How many URL-unique submissions have a score greater than 1?
no_dup_df.loc[no_dup_df['score'].gt(1)].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7068 entries, 41 to 28115
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              7068 non-null   object
 1   title           7068 non-null   object
 2   score           7068 non-null   int64 
 3   external_url    7068 non-null   object
 4   author          7068 non-null   object
 5   submitted_time  7068 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 386.5+ KB
