In [1]:
#install the library to scrape pushshift
!pip install pmaw pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pmaw
  Downloading pmaw-3.0.0-py3-none-any.whl (29 kB)
Collecting praw
  Downloading praw-7.7.0-py3-none-any.whl (189 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.4/189.4 KB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update-checker, prawcore, praw, pmaw
Successfully installed pmaw-3.0.0 praw-7.7.0 prawcore-2.3.0 update-checker-0.18.0


In [2]:
import pandas as pd
from pmaw import PushshiftAPI
from datetime import datetime, timedelta
import numpy as np
from itertools import product
import unicodedata as ud

# Define subreddit to scrape
sub = 'Chronicpain'

# Define scrape window & time delta
start_date = datetime(2022, 11, 3)
end_date = datetime(2023, 4, 3)
delta = 1 #weeks

# Initialize pushshift api instance
api = PushshiftAPI()

# Scrape
dfs = []  # to store the results of each api call

#Scrapping sub in 1-week-long time windows
current_date = start_date
while current_date <= end_date:
  since = int(current_date.timestamp())  # start date
  until = int((current_date + timedelta(weeks=delta)).timestamp())      # end date
  print(current_date.strftime('%d %b %Y'))
  # Only scrapping 1000 comments per call because api breaks for >1000
  comments = api.search_comments(         
                      subreddit=sub,
                      since = since,
                      until = until,
                      limit=1000,
                      fields=[['author','author_fullname','subreddit','subreddit_id','id','body','author_flair_text', 'created_utc']],
                      safe_exit=True
                      )
  # Some days might have no user activity, so we need try/catch
  try:
    # Place results in dataframe
    df = pd.DataFrame(comments)
    # Keep only relevant info
    df = df[['subreddit','subreddit_id','author','author_fullname','body', 'created_utc','id']]
    # Save to CSV file
    fname = '/content/drive/MyDrive/Classes/EECS_448/Project/pain_scrape/{}_{}_{}_{}.csv'.format(sub, str(since), str(until), len(df.index))
    df.to_csv(fname)
    dfs.append(df)
  except:
    print('Exception raised. r/{}, search date: {}'.format(sub, since))
  # Increase the data tracker
  current_date += timedelta(weeks=delta)

all_df = pd.concat(dfs, ignore_index=True)

03 Nov 2022
10 Nov 2022
17 Nov 2022
24 Nov 2022
01 Dec 2022
08 Dec 2022
15 Dec 2022
22 Dec 2022
29 Dec 2022
05 Jan 2023
12 Jan 2023
19 Jan 2023
26 Jan 2023
02 Feb 2023
09 Feb 2023
16 Feb 2023
23 Feb 2023
02 Mar 2023
09 Mar 2023
16 Mar 2023
23 Mar 2023
30 Mar 2023


In [3]:
# save dataframe of entire scrape to csv
fname = '/content/drive/MyDrive/Classes/EECS_448/Project/pain_fullScrape.csv'
all_df.to_csv(fname)

# make authors dataframe
authors_df = all_df[['author']]
authors_df = authors_df.drop_duplicates()

# save authors dataframe to csv
fname = '/content/drive/MyDrive/Classes/EECS_448/Project/pain_allLabledAuthors.csv'
authors_df.to_csv(fname)
authors_df.head()

print(len(authors_df))

5202


### Playground / Graveyard

In [None]:
import pandas as pd
from pmaw import PushshiftAPI
import datetime as dt
import numpy as np
from itertools import product
import unicodedata as ud

def format_df(df_original):
  # Keep only relevant columns
  df = df_original.copy()
  df = df[['subreddit','subreddit_id','author','author_fullname','author_flair_text','body', 'created_utc','id']]
  # Clean the flair text
  df['author_flair_text'] = df['author_flair_text'].apply(clean_flair)
  # Keep only rows with flair
  df = df[df['author_flair_text'].notna()]
  return df

def clean_flair(flair):
  if type(flair) == str:
    mbti_types = [''.join(tup) for tup in list(product(['e', 'i'],['s', 'n'], ['f', 't'],['j', 'p']))]
    flair = ''.join(e for e in flair if e.isalpha()) #strip non-alphabetical characters
    flair = ud.normalize('NFKD', flair) #normalize unicode representation
    flair = flair.lower() #make lowercase
    try:
      # Extract the mbti type from the flair string by:
      # (1) Find occurences of mbti types in flair and store the index of the first occurence
      # example: 'anesfpmarriedtointp' returns 2, the index of the character 'e'
      idx = min(i for i in [flair.find(x) for x in mbti_types] if i >= 0)
      #(2) make the flair be the first mbti occurence, example above returns 'esfp'
      flair = flair[idx:idx+4]
    except:
      # No match was found, so not useful
      flair = np.nan
  else:
    flair = np.nan
  return flair

# Make a list w all subreddit name (ie mbti types)
subreddits = [''.join(tup) for tup in list(product(['e', 'i'],['s', 'n'], ['f', 't'],['j', 'p']))]

# define lists with years and months used to define scrapping time window
years = [2022, 2022, 2023, 2023, 2023]
months = [11, 12, 1, 2, 3]

# Initialize pushshift api instance
api = PushshiftAPI()

# Scrape
dfs = []  # to store the results of each api call

for sub in subreddits:
  #Scrapping each sub in 1-month-long time windows
  for i in range(len(years)-1):
    start_year, end_year, start_month, end_month = years[i], years[i+1], months[i], months[i+1]
    since = int(dt.datetime(start_year, start_month, 3,0,0).timestamp())  # start date
    until = int(dt.datetime(end_year, end_month, 2,0,0).timestamp())      # end date
    # Only scrapping 1000 comments per call because api breaks for >1000
    comments = api.search_comments(         
                        subreddit=sub,
                        since = since,
                        until = until,
                        limit=1000,
                        fields=[['author','author_fullname','subreddit','subreddit_id','id','body','author_flair_text', 'created_utc']],
                        safe_exit=True
                        )
    # Place results in dataframe
    df = pd.DataFrame(comments)
    # Format the results
    df = format_df(df)
    # Save to CSV file
    fname = '/content/drive/MyDrive/Classes/EECS_448/Project/scrapping_files/{}_{}_{}_{}.csv'.format(sub, str(since), str(until), len(df.index))
    df.to_csv(fname)
    dfs.append(df)

all_df = pd.concat(dfs, ignore_index=True)

In [None]:
# all_df.columns
# test = (all_df.author_flair_css_class.to_numpy())

# np.unique(all_df.author_fullname.to_numpy()).shape
# all_df.subreddit.unique()
# all_df.apply(lambda x: x.astype(str).str.lower()).subreddit.unique()
# all_df.head()

(5398,)

In [None]:
authors_df = all_df[['author','author_flair_text']]
authors_df = authors_df.drop_duplicates()
authors_df.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     9,    12,
               16,
            ...
            28216, 28218, 28220, 28224, 28228, 28229, 28231, 28232, 28236,
            28239],
           dtype='int64', length=5504)

In [None]:
# api = PushshiftAPI()
# dfs = []
# for sub in subreddits:
#   for i in range(len(years)-1):
#     start_year, end_year, start_month, end_month = years[i], years[i+1], months[i], months[i+1]
#     since = int(dt.datetime(start_year, start_month, 3,0,0).timestamp())
#     until = int(dt.datetime(end_year, end_month, 2,0,0).timestamp())
#     comments = api.search_comments(         
#                         subreddit=sub,
#                         since = since,
#                         until = until,
#                         limit=1000,
#                         safe_exit=True
#                         )
#     df = pd.DataFrame(comments)
#     fname = '/content/drive/MyDrive/Classes/EECS_448/Project/scrapping_files/{}_{}_{}_{}.csv'.format(sub, str(since), str(until), len(comments))
#     df.to_csv(fname)
#     dfs.append(df)
