In [1]:
#Imported all of the modules I would need to utilize.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import time

from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import plot_tree, export_text

In [2]:
# Pushshift API URL for looking up the total number of posts in Subreddit A (CovidVaccinated).  This cell only echoes the URL as a string; it does not send the request itself.
"https://api.pushshift.io/reddit/search/submission?subreddit=CovidVaccinated&metadata=true&size=0"

'https://api.pushshift.io/reddit/search/submission?subreddit=CovidVaccinated&metadata=true&size=0'

In [3]:
# Pushshift API URL for looking up the total number of self-text posts in Subreddit A (CovidVaccinated).  This cell only echoes the URL as a string; it does not send the request itself.
"https://api.pushshift.io/reddit/search/submission/?subreddit=CovidVaccinated&metadata=true&size=0&is_self=true"

'https://api.pushshift.io/reddit/search/submission/?subreddit=CovidVaccinated&metadata=true&size=0&is_self=true'

In [4]:
# Pushshift API URL for looking up the total number of posts in Subreddit B (DebateVaccines).  This cell only echoes the URL as a string; it does not send the request itself.
"https://api.pushshift.io/reddit/search/submission?subreddit=DebateVaccines&metadata=true&size=0"

'https://api.pushshift.io/reddit/search/submission?subreddit=DebateVaccines&metadata=true&size=0'

In [5]:
# Pushshift API URL for looking up the total number of self-text posts in Subreddit B (DebateVaccines).  This cell only echoes the URL as a string; it does not send the request itself.
"https://api.pushshift.io/reddit/search/submission/?subreddit=DebateVaccines&metadata=true&size=0&is_self=true"

'https://api.pushshift.io/reddit/search/submission/?subreddit=DebateVaccines&metadata=true&size=0&is_self=true'

In [6]:
# Created a variable named url and initialized it to the address of the Pushshift API endpoint for searching Reddit submissions.
url = 'https://api.pushshift.io/reddit/search/submission'

In [19]:
# Obtained code from fellow student Tamara regarding time.sleep() method
# Created a function named <get_posts> that is called with a subreddit and a utc.  Starting from the given utc and working backwards in time, it requests up to 7000 posts (100 per request) from the given subreddit and returns the results as a single DataFrame.
def get_posts(subreddit, utc, post_count=7000, page_size=100, pause=12):
    """Collect self-text submissions from one subreddit, paging backwards in time.

    Sends GET requests to the module-level ``url`` endpoint, asking each time
    for up to ``page_size`` self-text posts created before a cutoff.  The first
    cutoff is ``utc``; every later cutoff is the ``created_utc`` of the last
    post on the previous page.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    utc : int
        Unix timestamp; only posts created before it are requested.
    post_count : int, default 7000
        Target number of posts.  It fixes how many pages are requested
        (``post_count / page_size`` rounded up); the result holds fewer rows
        when pages come back short.
    page_size : int, default 100
        Value sent as the ``size`` query parameter.
    pause : float, default 12
        Seconds to wait between consecutive requests.  A falsy value disables
        the wait.

    Returns
    -------
    pandas.DataFrame
        One row per post, pages stacked in request order.  Each page keeps its
        own 0-based row labels, so labels repeat across pages.  An empty
        DataFrame is returned when the very first page contains no posts.

    Raises
    ------
    requests.HTTPError
        If a response carries an error status code.
    """
    # Ceiling division: a partial final page still needs its own request.
    n_pages = int(-(-post_count // page_size))

    pages = []
    before = utc
    for page_number in range(n_pages):
        # Wait only *between* requests: not before the first one and not
        # after the last one.
        if page_number and pause:
            time.sleep(pause)

        params = {'subreddit': subreddit,
                  'size': page_size,
                  'is_self': True,
                  'before': before}
        res = requests.get(url, params=params)
        # Surface rate-limit / server errors directly instead of failing
        # later while decoding or indexing the body.
        res.raise_for_status()
        posts = res.json()['data']

        # An empty page means there is nothing older left to fetch.  Stop here
        # and keep what was collected rather than failing on posts[-1].
        if not posts:
            break

        pages.append(pd.DataFrame(posts))
        # Assumes the endpoint returns newest-first, so the last post is the
        # oldest on the page and becomes the next cutoff.
        before = posts[-1]['created_utc']

    if not pages:
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying the growing frame on
    # every iteration.
    return pd.concat(pages)

In [20]:
# Called the get_posts function with Subreddit A and UTC from Thursday, April 28, 2022.
subreddit_a_part_one = get_posts('CovidVaccinated', 1651177777)

In [21]:
# Verified that the result of calling the function was of type DataFrame.
type(subreddit_a_part_one)

pandas.core.frame.DataFrame

In [22]:
# Looked at the shape of the DataFrame created by attempting to scrape 7000 posts from Subreddit A.  There were 73 columns, one of which was 'title'.
subreddit_a_part_one.shape

(6990, 73)

In [23]:
# Counted the rows whose title duplicates that of an earlier row in the DataFrame.
subreddit_a_part_one['title'].duplicated().sum()

255

In [24]:
# Dropped the 255 duplicate titles from the DataFrame.
subreddit_a_part_one = subreddit_a_part_one.drop_duplicates(subset=['title'], keep = 'last')

In [25]:
# Looked at the new shape of the DataFrame after having dropped those duplicates.
subreddit_a_part_one.shape

(6735, 73)

In [26]:
# Confirmed that the only value of the 'subreddit' column in this DataFrame was "CovidVaccinated".
subreddit_a_part_one['subreddit'].unique()

array(['CovidVaccinated'], dtype=object)

In [29]:
# Saved this duplicate-free DataFrame to a .csv file.
subreddit_a_part_one.to_csv('../data/subreddit_a.csv', index=False)

In [30]:
# Called the get_posts function with Subreddit B and UTC from Thursday, April 28, 2022 (the same one utilized for pulling posts from Subreddit A).
subreddit_b_part_one = get_posts('DebateVaccines', 1651177777)

In [31]:
# Verified that the result of calling the function was of type DataFrame.
type(subreddit_b_part_one)

pandas.core.frame.DataFrame

In [32]:
# Looked at the shape of the DataFrame created by attempting to scrape 7000 posts from Subreddit B.  There were 88 columns, one of which was 'title'.
subreddit_b_part_one.shape

(6968, 88)

In [33]:
# Counted the rows whose title duplicates that of an earlier row in the DataFrame.
subreddit_b_part_one['title'].duplicated().sum()

136

In [34]:
# Dropped the 136 duplicate titles from the DataFrame.
subreddit_b_part_one = subreddit_b_part_one.drop_duplicates(subset=['title'], keep = 'last')

In [35]:
# Before concatenating the two DataFrames, I needed to find the columns that the two did not share so that I could drop them.  The expression below lists the columns present in Subreddit B's DataFrame but not in Subreddit A's.
set(subreddit_b_part_one.columns) - set(subreddit_a_part_one.columns)

{'author_created_utc',
 'author_flair_template_id',
 'author_id',
 'brand_safe',
 'distinguished',
 'gilded',
 'media_embed',
 'og_description',
 'og_title',
 'poll_data',
 'removed_by',
 'rte_mode',
 'secure_media_embed',
 'steward_reports',
 'suggested_sort',
 'updated_utc'}

In [36]:
# Dropped the columns that were present in Subreddit B's DataFrame but absent from Subreddit A's.
# The list is derived from the two DataFrames instead of being typed out by hand, so it cannot
# drift from the comparison above, and re-running this cell is a harmless no-op rather than a KeyError.
# Note: only B-only columns are removed here; a column present only in Subreddit A's DataFrame is left untouched.
columns_only_in_b = subreddit_b_part_one.columns.difference(subreddit_a_part_one.columns)
subreddit_b_part_one = subreddit_b_part_one.drop(columns=columns_only_in_b)

In [37]:
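# Confirmed that after dropping those columns, Subreddit B's DataFrame had no columns that were missing from Subreddit A's.  Note that this check runs in one direction only: it would not reveal a column present only in Subreddit A's DataFrame, and the column counts (73 for A versus 88 - 16 = 72 for B) indicate that one such column remains.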
set(subreddit_b_part_one.columns) - set(subreddit_a_part_one.columns)

set()

In [38]:
# Confirmed that the only value of the 'subreddit' column in this DataFrame was "DebateVaccines".
subreddit_b_part_one['subreddit'].unique()

array(['DebateVaccines'], dtype=object)

In [39]:
# Saved this duplicate-free DataFrame to a .csv file.
subreddit_b_part_one.to_csv('../data/subreddit_b.csv', index=False)