## Read before running

- Make a copy of this Colab and save it to your personal folder for the project
- Save 'mbti_allLabledAuthors.csv' to your project folder. You can find it in my project folder, which is shared with you.
- Make a folder called ‘mbti_users_scrape’ in your project folder
- **DON’T** copy my author_log.csv file, the code will make your own in your project folder
- Change the 'my_dir' variable to match the path to your personal project folder.
- Change the `start_idx` and `end_idx` variables to match your assigned range (written in a comment below).
- You should now be able to run the notebook
- If your Colab session times out, refer to the author_log file and the last cell in this notebook to find the index of the last author you scraped. That number rounded up to the nearest 100 will be your new starting index.

In [None]:
# Mount your Google Drive at /content/drive so the notebook can read/write files there
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#install the library to scrape pushshift
!pip install pmaw pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pmaw
  Downloading pmaw-3.0.0-py3-none-any.whl (29 kB)
Collecting praw
  Downloading praw-7.7.0-py3-none-any.whl (189 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.4/189.4 KB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting websocket-client>=0.54.0
  Downloading websocket_client-1.5.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.9/55.9 KB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: websocket-client, update-checker, prawcore, praw, pmaw
Successfully installed pmaw-3.0.0 praw-7.7.0 prawcore-2.3.0 update-checker-0.18.0 websocket-client-1.5.1


In [None]:
from pmaw import PushshiftAPI
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import unicodedata as ud
from itertools import product
import os

def format_df(df_original, authors_df, author):
  """Reduce one author's scraped comments to labelled, cleaned, non-MBTI rows.

  Args:
    df_original: DataFrame of scraped comments. Must contain the columns
      'author', 'body' and 'subreddit'. It is not modified.
    authors_df: DataFrame with 'author' and 'author_flair_text' columns.
    author: Username whose comments should be kept.

  Returns:
    A new DataFrame with the columns author, author_flair_text, body and
    subreddit (lower-cased), excluding rows posted in 'mbti' or in any of the
    16 four-letter type subreddits, with 'body' passed through clean_comment.

  Raises:
    KeyError: if a required column is missing from either frame.
    IndexError: if `author` has no row in `authors_df`.
  """
  # Keep only rows whose author is exactly the requested username (drops partial matches)
  own_rows = df_original[df_original['author'] == author]
  # Attach the author's flair, taken from the first matching row of the authors table
  flair = authors_df.loc[authors_df['author'] == author, 'author_flair_text'].iloc[0]
  labelled = own_rows.assign(author_flair_text=flair)
  # Keep only relevant columns, in a fixed order
  labelled = labelled[['author', 'author_flair_text', 'body', 'subreddit']]
  # Subreddits to exclude: every four-letter type code (e/i, s/n, f/t, j/p) plus 'mbti'
  type_subs = {''.join(letters) for letters in product('ei', 'sn', 'ft', 'jp')}
  excluded = type_subs | {'mbti'}
  # Compare subreddit names case-insensitively
  lowered = labelled.assign(subreddit=labelled['subreddit'].apply(lambda name: name.lower()))
  kept = lowered[~lowered['subreddit'].isin(excluded)]
  # Clean the comment text
  return kept.assign(body=kept['body'].apply(clean_comment))

def clean_comment(comment):
  """Reduce a comment to a fixed set of ASCII characters.

  The text is first NFKD-normalized so that characters with an ASCII-compatible
  decomposition (e.g. accented letters) are reduced to their base characters,
  and then every character outside the whitelist below is dropped.
  Normalizing *before* filtering matters: filtering first removes the
  non-ASCII characters outright, which leaves the normalization with nothing
  to do.

  Args:
    comment: The raw comment body. Anything that is not a string (e.g. NaN
      for a missing body) is treated as an empty comment.

  Returns:
    The cleaned string, or np.nan when `comment` is not a string.
  """
  # to filter for empty comments
  if not isinstance(comment, str):
    return np.nan
  # Keep alphanumeric characters, spaces, and common punctuation marks and symbols
  allowed_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
  allowed = frozenset(allowed_chars)
  # Standardize characters using the unicodedata library: NFKD splits e.g. an
  # accented letter into its base letter plus a combining mark; the mark is
  # then removed by the whitelist filter while the base letter is kept.
  decomposed = ud.normalize('NFKD', comment)
  return ''.join(c for c in decomposed if c in allowed)

######## Path to your personal project folder########
my_dir = '/content/drive/MyDrive/Classes/EECS_448/Project/' # fill in
####################################################

# read authors dataframe
path = os.path.join(my_dir, 'mbti_allLabledAuthors.csv')
authors_df = pd.read_csv(path, usecols=['author', 'author_flair_text'])

# Initialize pushshift api instance
api = PushshiftAPI()

# Define scrape window
start_date = datetime(2022, 11, 3)
end_date = datetime(2023, 3, 21)

# Initialize empty list to store comment dataframes
comment_dfs = []
# Running total of saved comments; note that this number resets every time you restart the colab session
comment_count = 0

# Define starting index for the scraping session
# LR:(0-3700), JS:(3701-6200), AS:(6201-8700), JY:(8701-11022)
start_idx = 3701 # based on the author log
end_idx = 6201 # based on numbers above, +1 because of how range() works

# Output locations inside the project folder
log_path = os.path.join(my_dir, 'author_log.csv')
# Create the scrape folder if it is missing so the first checkpoint cannot fail on it
os.makedirs(os.path.join(my_dir, 'mbti_users_scrape'), exist_ok=True)

# Loop over the assigned range of author indices
for i in range(start_idx, end_idx):
  # find author from index
  author = authors_df.loc[i].author
  # Use PMAW to search for comments by this author
  comments = api.search_comments(author=author,
                                  since = int(start_date.timestamp()),
                                  until = int(end_date.timestamp()),
                                  limit=1000,
                                  safe_exit=True
                                  )
  try:
    # Convert comment data to a Pandas dataframe
    comment_df = pd.DataFrame(comments)
    # Some users have no activity in the window; there is nothing to format for them
    if not comment_df.empty:
      # Format df and add it to the current batch
      comment_dfs.append(format_df(comment_df, authors_df, author))
  except Exception as err:
    # Skip this author but say why, instead of silently hiding every error.
    # (Exception, not a bare except, so that interrupting the cell still works.)
    print('Skipping author index {}: {}: {}'.format(i, type(err).__name__, err))
  # Save a file every 100 authors (~20min), and also after the last author so
  # that a range which does not end on a multiple of 100 still gets saved
  is_last_author = (i == end_idx - 1)
  if i % 100 == 0 or is_last_author:
    print(i)
    # pd.concat raises on an empty list, so only checkpoint when there is data
    if comment_dfs:
      # Merge dfs of the authors scraped since the last checkpoint into a single df
      comments_df = pd.concat(comment_dfs, ignore_index=True)
      # Update the comment count
      comment_count += len(comments_df.index)
      # Save scraped comments of this batch to csv file
      fname = os.path.join(my_dir,'mbti_users_scrape/authors_{}_{}totalComments.csv'.format(i, comment_count))
      comments_df.to_csv(fname)
      # Make log of the authors in this batch
      log = comments_df.drop_duplicates(subset='author')
      # Append usernames to the csv file tracking scraped authors. Start a new
      # file (with a header row) at index 0 or when no log exists yet.
      start_new_log = (i == 0) or not os.path.exists(log_path)
      log.to_csv(log_path, mode=('w' if start_new_log else 'a'), columns=['author'], header=start_new_log)
      # empty list storing author dfs
      comment_dfs = []

In [None]:
# Shows the authors_df row (and therefore the index) for one username.
# Put the last username from your author_log.csv here to work out your next starting index.
authors_df.loc[authors_df['author'] == 'Ok-Reporter-196']


Unnamed: 0,author,author_flair_text
1199,Ok-Reporter-196,enfj
