# ![](https://ga-dash.s3.amazonaws.com/production/assets/logo-9f88ae6c9c3871690e33280fcf557f33.png) Project 3: Web APIs & NLP

---

## Data Collection and Cleaning

---

## Problem Statement

Reddit is a website made up of many different subreddits. Each subreddit is its own forum devoted to a specific topic, and there are thousands of them to browse. After gathering data from two different subreddits, I wanted to see whether I could build a binary classification model that classifies a piece of text as belonging to one subreddit or the other. The two subreddits I chose were r/Marvel and r/harrypotter.

In [1]:
import requests
import pandas as pd
import numpy as np
import time

import sys

# Unless warning options were passed to the interpreter (sys.warnoptions is
# non-empty in that case), ignore every warning category for this session.
# NOTE(review): this also hides library warnings (e.g. from pandas) that may
# point at real problems in the cells below.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

### Testing API Before Building Function

In [485]:
# Query parameters for one 100-post request against each subreddit.
params_1 = dict(subreddit='Marvel', size=100)

params_2 = dict(subreddit='harrypotter', size=100)

In [486]:
# Pushshift submission-search endpoint. It has to be defined at notebook level
# for this cell to run on a fresh kernel; the only other definition is a local
# variable inside subreddit_data.
url = 'https://api.pushshift.io/reddit/search/submission'

res = requests.get(url, params_1)

# Stop here on an HTTP error status (4xx/5xx) rather than failing later with
# a less obvious JSON or KeyError.
res.raise_for_status()

data = res.json()

posts = data['data']

df = pd.DataFrame(posts)

# Preview only the fields relevant to the text-classification task.
df[['subreddit', 'selftext', 'title']]

Unnamed: 0,subreddit,selftext,title
0,Marvel,,Who’s gamora? By me
1,Marvel,,GTA 5 Online: How To Make Mr Knight From MoonK...
2,Marvel,,Am I wrong to make this scene comparison? They...
3,Marvel,,Fid you know Guardians of the galaxy amazing f...
4,Marvel,[removed],"Lets be honest, the moonknight show is pretty ..."
...,...,...,...
95,Marvel,[removed],Premium Pilot?
96,Marvel,,Stan Lee art tribute by remle012
97,Marvel,,The new ASM creative team.
98,Marvel,,Moon Knight Episode 5 Review


## Function for Retrieving Data from Reddit API

In [487]:
def subreddit_data(subreddit, num_posts, before=1650944865, pause=1):
    """Download submissions from one subreddit through the Pushshift API.

    The function pages backwards in time, 100 submissions per request: every
    request asks for posts created before the ``created_utc`` of the last post
    in the previous page.

    Parameters
    ----------
    subreddit : str
        Subreddit name passed to the API (e.g. ``'Marvel'``).
    num_posts : int
        Target number of posts. ``num_posts // 100`` requests are attempted,
        so any remainder below 100 is ignored.
    before : int, optional
        Timestamp (same units as ``created_utc``) that the first page must
        precede. Defaults to the fixed cutoff 1650944865.
    pause : float, optional
        Seconds to sleep after every request, successful or not.

    Returns
    -------
    pandas.DataFrame
        All retrieved submissions in retrieval order. Each page keeps its own
        0-based row labels, so labels repeat across pages; call
        ``reset_index`` if unique labels are needed. An empty frame is
        returned when nothing was retrieved.
    """
    url = 'https://api.pushshift.io/reddit/search/submission'
    pages = []  # one DataFrame per successful request; concatenated once at the end

    for _ in range(num_posts // 100):
        res = requests.get(url,
                           params={
                               'subreddit': subreddit,
                               'size': 100,
                               'before': before
                           })
        if res.status_code != 200:
            # A failed request still uses up one attempt; the next attempt
            # retries the same cutoff after backing off.
            print('Error')
            time.sleep(pause)
            continue

        posts = res.json()['data']
        if not posts:
            # Nothing older is available; indexing posts[-1] would fail.
            break

        pages.append(pd.DataFrame(posts))
        before = posts[-1]['created_utc']
        time.sleep(pause)

    if not pages:
        return pd.DataFrame()
    return pd.concat(pages)

### Gathering r/Marvel Data

In [489]:
marvel = subreddit_data('Marvel', 15_000)
marvel

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,media,media_embed,secure_media,secure_media_embed,author_flair_background_color,edited,distinguished,call_to_action,category,banned_by
0,[],False,naranjaPenguin21,,[],,text,t2_3kv7vbzi,False,False,...,,,,,,,,,,
1,[],False,draconetzah,,[],,text,t2_f29uxj3j,False,False,...,,,,,,,,,,
2,[],False,Cosmic-Waldo,,[],,text,t2_4k4uyfid,False,False,...,,,,,,,,,,
3,[],False,monkeytimess,,[],,text,t2_80scqray,False,False,...,,,,,,,,,,
4,[],False,Danny007Boyy,,[],,text,t2_azcns707,False,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,[],False,DangerStranger138,,[],,text,t2_4td2aypy,False,False,...,,,,,,,,,,
96,[],False,NewSageTriggrr6,,[],,text,t2_9537yn9a,False,False,...,,,,,,,,,,
97,[],False,Upstairs_Spirit2923,,[],,text,t2_dpvs6fqm,False,False,...,,,,,,,,,,
98,[],False,ZechariyahIII,,[],,text,t2_4klo0rl7,False,False,...,,,,,,,,,,


In [3]:
# Give the frame a unique 0..n-1 index. Without drop=True the old row labels
# (which repeat for every 100-post request) are kept as an 'index' column.
marvel.reset_index(inplace=True)

NameError: name 'marvel' is not defined

In [491]:
marvel

Unnamed: 0,index,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,...,media,media_embed,secure_media,secure_media_embed,author_flair_background_color,edited,distinguished,call_to_action,category,banned_by
0,0,[],False,naranjaPenguin21,,[],,text,t2_3kv7vbzi,False,...,,,,,,,,,,
1,1,[],False,draconetzah,,[],,text,t2_f29uxj3j,False,...,,,,,,,,,,
2,2,[],False,Cosmic-Waldo,,[],,text,t2_4k4uyfid,False,...,,,,,,,,,,
3,3,[],False,monkeytimess,,[],,text,t2_80scqray,False,...,,,,,,,,,,
4,4,[],False,Danny007Boyy,,[],,text,t2_azcns707,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14993,95,[],False,DangerStranger138,,[],,text,t2_4td2aypy,False,...,,,,,,,,,,
14994,96,[],False,NewSageTriggrr6,,[],,text,t2_9537yn9a,False,...,,,,,,,,,,
14995,97,[],False,Upstairs_Spirit2923,,[],,text,t2_dpvs6fqm,False,...,,,,,,,,,,
14996,98,[],False,ZechariyahIII,,[],,text,t2_4klo0rl7,False,...,,,,,,,,,,


In [5]:
marvel.describe()

Unnamed: 0,index,created_utc,num_comments,num_crossposts,pwls,retrieved_on,score,subreddit_subscribers,thumbnail_height,thumbnail_width,total_awards_received,upvote_ratio,wls,edited,call_to_action,category
count,14998.0,14998.0,14998.0,14998.0,14998.0,14998.0,14998.0,14998.0,8906.0,8906.0,14998.0,14998.0,14998.0,3.0,0.0,0.0
mean,49.493399,1645279000.0,1.282304,0.000467,6.0,1645279000.0,1.023003,1492761.0,121.103638,139.952728,6.7e-05,0.998714,5.980997,1649351000.0,,
std,28.863296,3283884.0,16.946501,0.0216,0.0,3283923.0,0.499721,74347.88,26.321093,1.820746,0.008166,0.021552,0.238013,1311136.0,,
min,0.0,1640059000.0,0.0,0.0,6.0,1640059000.0,0.0,1370626.0,8.0,69.0,0.0,0.25,3.0,1648593000.0,,
25%,24.25,1642367000.0,0.0,0.0,6.0,1642367000.0,1.0,1429712.0,105.0,140.0,0.0,1.0,6.0,1648594000.0,,
50%,49.0,1644966000.0,0.0,0.0,6.0,1644966000.0,1.0,1487680.0,140.0,140.0,0.0,1.0,6.0,1648595000.0,,
75%,74.0,1648325000.0,0.0,0.0,6.0,1648325000.0,1.0,1549716.0,140.0,140.0,0.0,1.0,6.0,1649730000.0,,
max,99.0,1650945000.0,963.0,1.0,6.0,1650945000.0,27.0,1630951.0,140.0,140.0,1.0,1.0,6.0,1650865000.0,,


#### Exporting Data to CSV Before Cleaning

In [492]:
marvel.to_csv('../datasets/marvel.csv', index=False)

### Gathering r/harrypotter Data

In [493]:
h_potter = subreddit_data('harrypotter', 15_000)
h_potter

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,secure_media,secure_media_embed,gallery_data,is_gallery,media_metadata,author_cakeday,distinguished,banned_by,call_to_action,category
0,[],False,Naive_Battle2313,,[],,text,t2_gixqtihx,False,False,...,,,,,,,,,,
1,[],False,OhManTFE,,[],,text,t2_449n0,False,False,...,,,,,,,,,,
2,[],False,bowserx19,,[],,text,t2_3lwgu4j6,False,False,...,,,,,,,,,,
3,[],False,Top-Jelly7934,,[],,text,t2_m5nzsn8r,False,False,...,,,,,,,,,,
4,[],False,rosefilm,,[],,text,t2_1ehz6pqv,False,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,[],False,IvoryStrike,GR3,"[{'a': ':Gryff4:', 'e': 'emoji', 'u': 'https:/...",:Gryff4: Gryffindor,richtext,t2_9jeuj,False,False,...,,,,,,,,,,
96,[],False,aetosjayglaza,,[],,text,t2_ej146q1y,False,False,...,,,,,,,,,,
97,[],False,Pristine_Medicine953,,[],,text,t2_9tjc22iq,False,False,...,,,,,,,,,,
98,[],False,MemberOfUniverse,,[],,text,t2_9l8jdgby,False,False,...,,,,,,,,,,


In [494]:
# Give the frame a unique 0..n-1 index. Without drop=True the old row labels
# (which repeat for every 100-post request) are kept as an 'index' column.
h_potter.reset_index(inplace=True)
h_potter

Unnamed: 0,index,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,...,secure_media,secure_media_embed,gallery_data,is_gallery,media_metadata,author_cakeday,distinguished,banned_by,call_to_action,category
0,0,[],False,Naive_Battle2313,,[],,text,t2_gixqtihx,False,...,,,,,,,,,,
1,1,[],False,OhManTFE,,[],,text,t2_449n0,False,...,,,,,,,,,,
2,2,[],False,bowserx19,,[],,text,t2_3lwgu4j6,False,...,,,,,,,,,,
3,3,[],False,Top-Jelly7934,,[],,text,t2_m5nzsn8r,False,...,,,,,,,,,,
4,4,[],False,rosefilm,,[],,text,t2_1ehz6pqv,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,95,[],False,IvoryStrike,GR3,"[{'a': ':Gryff4:', 'e': 'emoji', 'u': 'https:/...",:Gryff4: Gryffindor,richtext,t2_9jeuj,False,...,,,,,,,,,,
14995,96,[],False,aetosjayglaza,,[],,text,t2_ej146q1y,False,...,,,,,,,,,,
14996,97,[],False,Pristine_Medicine953,,[],,text,t2_9tjc22iq,False,...,,,,,,,,,,
14997,98,[],False,MemberOfUniverse,,[],,text,t2_9l8jdgby,False,...,,,,,,,,,,


#### Summary Statistics, Then Exporting Data to CSV Before Cleaning

In [8]:
h_potter.describe()

Unnamed: 0,index,created_utc,num_comments,num_crossposts,pwls,retrieved_on,score,subreddit_subscribers,total_awards_received,upvote_ratio,wls,edited,thumbnail_height,thumbnail_width,call_to_action,category
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,6.0,3411.0,3411.0,0.0,0.0
mean,49.4967,1646123000.0,2.116741,0.0,6.0,1646123000.0,1.034602,1168490.0,6.7e-05,0.997999,5.979599,1648987000.0,120.67253,139.896804,,
std,28.865165,2811829.0,32.531073,0.0,0.0,2811858.0,1.943992,21555.94,0.008165,0.029877,0.24656,943717.0,26.609136,2.693953,,
min,0.0,1641720000.0,0.0,0.0,6.0,1641720000.0,0.0,1132968.0,0.0,0.25,3.0,1648574000.0,12.0,68.0,,
25%,24.5,1643572000.0,0.0,0.0,6.0,1643572000.0,1.0,1152755.0,0.0,1.0,6.0,1648584000.0,105.0,140.0,,
50%,49.0,1645935000.0,0.0,0.0,6.0,1645935000.0,1.0,1166329.0,0.0,1.0,6.0,1648593000.0,140.0,140.0,,
75%,74.0,1648669000.0,0.0,0.0,6.0,1648669000.0,1.0,1183534.0,0.0,1.0,6.0,1648652000.0,140.0,140.0,,
max,99.0,1650945000.0,2787.0,0.0,6.0,1650945000.0,226.0,1211649.0,1.0,1.0,6.0,1650912000.0,140.0,140.0,,


In [495]:
h_potter.to_csv('../datasets/h_potter.csv', index=False)

---

## Data Cleaning

#### Cleaning r/Marvel First

In [604]:
marvel = pd.read_csv('../datasets/marvel.csv')

In [605]:
marvel.head()

Unnamed: 0,index,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,...,media,media_embed,secure_media,secure_media_embed,author_flair_background_color,edited,distinguished,call_to_action,category,banned_by
0,0,[],False,naranjaPenguin21,,[],,text,t2_3kv7vbzi,False,...,,,,,,,,,,
1,1,[],False,draconetzah,,[],,text,t2_f29uxj3j,False,...,,,,,,,,,,
2,2,[],False,Cosmic-Waldo,,[],,text,t2_4k4uyfid,False,...,,,,,,,,,,
3,3,[],False,monkeytimess,,[],,text,t2_80scqray,False,...,,,,,,,,,,
4,4,[],False,Danny007Boyy,,[],,text,t2_azcns707,False,...,,,,,,,,,,


In [606]:
marvel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14998 entries, 0 to 14997
Data columns (total 87 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   index                          14998 non-null  int64  
 1   all_awardings                  14998 non-null  object 
 2   allow_live_comments            14998 non-null  bool   
 3   author                         14998 non-null  object 
 4   author_flair_css_class         868 non-null    object 
 5   author_flair_richtext          14958 non-null  object 
 6   author_flair_text              810 non-null    object 
 7   author_flair_type              14958 non-null  object 
 8   author_fullname                14958 non-null  object 
 9   author_is_blocked              14998 non-null  bool   
 10  author_patreon_flair           14958 non-null  object 
 11  author_premium                 14958 non-null  object 
 12  awarders                       14998 non-null 

In [607]:
marvel.isna().sum()

index                         0
all_awardings                 0
allow_live_comments           0
author                        0
author_flair_css_class    14130
                          ...  
edited                    14995
distinguished             14975
call_to_action            14998
category                  14998
banned_by                 14996
Length: 87, dtype: int64

In [608]:
marvel.columns

Index(['index', 'all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',
       'contest_mode', 'created_utc', 'crosspost_parent',
       'crosspost_parent_list', 'domain', 'full_link', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id',
       'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked',
       'media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'post_hint',
       'preview', 'pwls', 'retrieved_on', 'score', 'selftext', 'sen

#### Below I'm trying to get a better look at the columns to decide which ones to drop

In [609]:
marvel.loc[:,['index', 'all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text']]

Unnamed: 0,index,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text
0,0,[],False,naranjaPenguin21,,[],
1,1,[],False,draconetzah,,[],
2,2,[],False,Cosmic-Waldo,,[],
3,3,[],False,monkeytimess,,[],
4,4,[],False,Danny007Boyy,,[],
...,...,...,...,...,...,...,...
14993,95,[],False,DangerStranger138,,[],
14994,96,[],False,NewSageTriggrr6,,[],
14995,97,[],False,Upstairs_Spirit2923,,[],
14996,98,[],False,ZechariyahIII,,[],


In [610]:
# Of the columns inspected above, only 'author' is kept for now.
marvel = marvel.drop(columns=[
    'index',
    'all_awardings',
    'allow_live_comments',
    'author_flair_css_class',
    'author_flair_richtext',
    'author_flair_text',
])

In [611]:
marvel.loc[:,['author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',]]

Unnamed: 0,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,author_premium,awarders,can_mod_post
0,text,t2_3kv7vbzi,False,False,False,[],False
1,text,t2_f29uxj3j,False,False,False,[],False
2,text,t2_4k4uyfid,False,False,False,[],False
3,text,t2_80scqray,False,False,False,[],False
4,text,t2_azcns707,False,False,False,[],False
...,...,...,...,...,...,...,...
14993,text,t2_4td2aypy,False,False,True,[],False
14994,text,t2_9537yn9a,False,False,False,[],False
14995,text,t2_dpvs6fqm,False,False,True,[],False
14996,text,t2_4klo0rl7,False,False,False,[],False


In [612]:
# 'author_flair_type' and 'author_fullname' stay for now; the rest of the
# inspected group is removed.
marvel = marvel.drop(columns=[
    'author_is_blocked',
    'author_patreon_flair',
    'author_premium',
    'awarders',
    'can_mod_post',
])

In [613]:
marvel.loc[:,['contest_mode', 'created_utc', 'crosspost_parent',
       'crosspost_parent_list', 'domain', 'full_link', 'gildings', 'id',]]

Unnamed: 0,contest_mode,created_utc,crosspost_parent,crosspost_parent_list,domain,full_link,gildings,id
0,False,1650944643,t3_uc2uy0,"[{'all_awardings': [], 'allow_live_comments': ...",i.redd.it,https://www.reddit.com/r/Marvel/comments/uc3kp...,{},uc3kpq
1,False,1650944330,,,i.redd.it,https://www.reddit.com/r/Marvel/comments/uc3hb...,{},uc3hbk
2,False,1650943914,,,i.redd.it,https://www.reddit.com/r/Marvel/comments/uc3cr...,{},uc3crr
3,False,1650943893,,,self.Marvel,https://www.reddit.com/r/Marvel/comments/uc3ci...,{},uc3ciq
4,False,1650943367,,,i.redd.it,https://www.reddit.com/r/Marvel/comments/uc36u...,{},uc36uv
...,...,...,...,...,...,...,...,...
14993,False,1640059646,,,comicbook.com,https://www.reddit.com/r/Marvel/comments/rl5qy...,{},rl5qyo
14994,False,1640059528,,,i.redd.it,https://www.reddit.com/r/Marvel/comments/rl5pq...,{},rl5pq2
14995,False,1640059409,t3_rl5hlk,"[{'all_awardings': [], 'allow_live_comments': ...",self.AskReddit,https://www.reddit.com/r/Marvel/comments/rl5ob...,{},rl5ob7
14996,False,1640059279,,,self.Marvel,https://www.reddit.com/r/Marvel/comments/rl5ms...,{},rl5ms7


In [614]:
# Every column in this inspected group is removed.
marvel = marvel.drop(columns=[
    'contest_mode',
    'created_utc',
    'crosspost_parent',
    'crosspost_parent_list',
    'domain',
    'full_link',
    'gildings',
    'id',
])

In [615]:
marvel.loc[:,['is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',]]

Unnamed: 0,is_created_from_ads_ui,is_crosspostable,is_meta,is_original_content,is_reddit_media_domain,is_robot_indexable
0,False,True,False,False,True,True
1,False,True,False,False,True,True
2,False,True,False,False,True,True
3,False,False,False,False,False,False
4,False,True,False,True,True,True
...,...,...,...,...,...,...
14993,False,False,False,False,False,False
14994,False,False,False,False,True,False
14995,False,False,False,False,False,False
14996,False,False,False,False,False,False


In [616]:
# Every column in this inspected group is removed.
marvel = marvel.drop(columns=[
    'is_created_from_ads_ui',
    'is_crosspostable',
    'is_meta',
    'is_original_content',
    'is_reddit_media_domain',
    'is_robot_indexable',
])

In [617]:
marvel.loc[:,['is_self', 'is_video', 'link_flair_background_color',
       'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id',
       'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked',]]

Unnamed: 0,is_self,is_video,link_flair_background_color,link_flair_css_class,link_flair_richtext,link_flair_template_id,link_flair_text,link_flair_text_color,link_flair_type,locked
0,False,False,#0079d3,movies,"[{'e': 'text', 't': 'Film/Television'}]",e3228ac0-0afa-11e4-8822-12313b0e9a90,Film/Television,light,richtext,False
1,False,False,#0079d3,movies,"[{'e': 'text', 't': 'Film/Television'}]",e3228ac0-0afa-11e4-8822-12313b0e9a90,Film/Television,light,richtext,False
2,False,False,#0079d3,movies,"[{'e': 'text', 't': 'Film/Television'}]",e3228ac0-0afa-11e4-8822-12313b0e9a90,Film/Television,light,richtext,False
3,True,False,#646d73,other,"[{'e': 'text', 't': 'Other'}]",3216a6b6-0afb-11e4-83eb-12313b0e9a90,Other,light,richtext,False
4,False,False,#7193ff,fan,"[{'e': 'text', 't': 'Fan Made'}]",ee0d673e-0afa-11e4-b608-12313b072914,Fan Made,light,richtext,False
...,...,...,...,...,...,...,...,...,...,...
14993,False,False,#cc3600,comics,"[{'e': 'text', 't': 'Comics '}]",4cd2464c-0af9-11e4-adf3-12313b0e95bd,Comics,light,richtext,False
14994,False,False,#ffd635,art,"[{'e': 'text', 't': 'Artwork'}]",eb6943b8-0afa-11e4-b8b5-12313b0d3999,Artwork,light,richtext,False
14995,False,False,#cc3600,comics,"[{'e': 'text', 't': 'Comics '}]",4cd2464c-0af9-11e4-adf3-12313b0e95bd,Comics,light,richtext,False
14996,True,False,#cc3600,comics,"[{'e': 'text', 't': 'Comics '}]",4cd2464c-0af9-11e4-adf3-12313b0e95bd,Comics,light,richtext,False


In [618]:
# 'is_video' and 'link_flair_text' are kept; the rest of the group is removed.
marvel = marvel.drop(columns=[
    'is_self',
    'link_flair_background_color',
    'link_flair_css_class',
    'link_flair_richtext',
    'link_flair_template_id',
    'link_flair_text_color',
    'link_flair_type',
    'locked',
])

In [619]:
marvel.loc[:,['media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'post_hint']]

Unnamed: 0,media_only,no_follow,num_comments,num_crossposts,over_18,parent_whitelist_status,permalink,pinned,post_hint
0,False,True,0,0,False,all_ads,/r/Marvel/comments/uc3kpq/tittle_announcement_...,False,image
1,False,True,0,0,False,all_ads,/r/Marvel/comments/uc3hbk/logo_of_the_kraven_t...,False,image
2,False,True,0,0,False,all_ads,/r/Marvel/comments/uc3crr/outofcontext_screens...,False,image
3,False,True,0,0,False,all_ads,/r/Marvel/comments/uc3ciq/ironman_nano_gauntle...,False,
4,False,True,0,0,False,all_ads,/r/Marvel/comments/uc36uv/3_years_of_endgame_w...,False,image
...,...,...,...,...,...,...,...,...,...
14993,False,True,0,0,False,all_ads,/r/Marvel/comments/rl5qyo/marvel_officially_ch...,False,link
14994,False,True,0,0,False,all_ads,/r/Marvel/comments/rl5pq2/whats_up_danger/,False,image
14995,False,True,0,0,False,all_ads,/r/Marvel/comments/rl5ob7/marvel_fans_of_reddi...,False,
14996,False,True,0,0,False,all_ads,/r/Marvel/comments/rl5ms7/was_the_latest_run_o...,False,


In [620]:
# 'over_18' and 'permalink' are kept; the rest of the group is removed.
marvel = marvel.drop(columns=[
    'media_only',
    'no_follow',
    'num_comments',
    'num_crossposts',
    'parent_whitelist_status',
    'pinned',
    'post_hint',
])

In [621]:
marvel.loc[:,['preview', 'pwls', 'retrieved_on', 'score', 'selftext', 'send_replies',
       'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subreddit_type', 'thumbnail',]]

Unnamed: 0,preview,pwls,retrieved_on,score,selftext,send_replies,spoiler,stickied,subreddit,subreddit_id,subreddit_subscribers,subreddit_type,thumbnail
0,"{'enabled': True, 'images': [{'id': 'Mmh_vkswD...",6,1650944654,1,,True,False,False,Marvel,t5_2r40o,1630951,public,https://b.thumbs.redditmedia.com/AG6UokqGK28fd...
1,"{'enabled': True, 'images': [{'id': 'xv8K9LRdk...",6,1650944340,1,,False,False,False,Marvel,t5_2r40o,1630938,public,https://b.thumbs.redditmedia.com/V_QvhKwKCSbUt...
2,"{'enabled': True, 'images': [{'id': 'LxiALzOXY...",6,1650943925,1,,True,False,False,Marvel,t5_2r40o,1630923,public,https://b.thumbs.redditmedia.com/QwlFoGri-J_IF...
3,,6,1650943903,1,[removed],True,False,False,Marvel,t5_2r40o,1630922,public,self
4,"{'enabled': True, 'images': [{'id': 'k9QWmEHNj...",6,1650943378,1,,True,False,False,Marvel,t5_2r40o,1630909,public,https://b.thumbs.redditmedia.com/uIvOfHM4cQw3K...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14993,"{'enabled': False, 'images': [{'id': 'Q3AviYw6...",6,1640059657,1,,True,False,False,Marvel,t5_2r40o,1370644,public,https://b.thumbs.redditmedia.com/RtcQLRzxbaul2...
14994,"{'enabled': True, 'images': [{'id': '0E9Vm19Ue...",6,1640059539,1,,True,False,False,Marvel,t5_2r40o,1370638,public,https://b.thumbs.redditmedia.com/Az1iipzyRP1TS...
14995,,6,1640059419,1,,True,False,False,Marvel,t5_2r40o,1370635,public,default
14996,,6,1640059290,1,[removed],True,False,False,Marvel,t5_2r40o,1370629,public,self


In [622]:
# 'selftext', 'subreddit' and 'thumbnail' are kept; the rest of the group is
# removed.
marvel = marvel.drop(columns=[
    'preview',
    'pwls',
    'retrieved_on',
    'score',
    'send_replies',
    'spoiler',
    'stickied',
    'subreddit_id',
    'subreddit_subscribers',
    'subreddit_type',
])

In [623]:
marvel.loc[:,['thumbnail_height', 'thumbnail_width', 'title', 'total_awards_received',
       'treatment_tags', 'upvote_ratio', 'url', 'url_overridden_by_dest']]

Unnamed: 0,thumbnail_height,thumbnail_width,title,total_awards_received,treatment_tags,upvote_ratio,url,url_overridden_by_dest
0,105.0,140.0,"Tittle announcement for ""El Muerto"" As seen on...",0,[],1.0,https://i.redd.it/2lvuu4dghsv81.jpg,https://i.redd.it/2lvuu4dghsv81.jpg
1,81.0,140.0,Logo of the Kraven The Hunter film,0,[],1.0,https://i.redd.it/z07krcbinsv81.png,https://i.redd.it/z07krcbinsv81.png
2,99.0,140.0,Out-of-context screenshot from X-Men Evolution...,0,[],1.0,https://i.redd.it/rge79ih3msv81.png,https://i.redd.it/rge79ih3msv81.png
3,,,Iron-Man Nano gauntlet is acting weird,0,[],1.0,https://www.reddit.com/r/Marvel/comments/uc3ci...,
4,140.0,140.0,3 years of EndGame. Where does this film rank ...,0,[],1.0,https://i.redd.it/87m0gdmpksv81.jpg,https://i.redd.it/87m0gdmpksv81.jpg
...,...,...,...,...,...,...,...,...
14993,73.0,140.0,Marvel Officially Changes Punisher Logo in New...,0,[],1.0,https://comicbook.com/marvel/news/marvel-new-p...,https://comicbook.com/marvel/news/marvel-new-p...
14994,140.0,140.0,What’s up danger?,0,[],1.0,https://i.redd.it/1xtex9nhlt681.jpg,https://i.redd.it/1xtex9nhlt681.jpg
14995,,,"Marvel fans of reddit, what made your favorite...",0,[],1.0,/r/AskReddit/comments/rl5hlk/marvel_fans_of_re...,/r/AskReddit/comments/rl5hlk/marvel_fans_of_re...
14996,,,Was the latest run on Captain America by Ta Ne...,0,[],1.0,https://www.reddit.com/r/Marvel/comments/rl5ms...,


In [624]:
# Only 'title' is kept from this inspected group.
marvel = marvel.drop(columns=[
    'thumbnail_height',
    'thumbnail_width',
    'total_awards_received',
    'treatment_tags',
    'upvote_ratio',
    'url',
    'url_overridden_by_dest',
])

In [625]:
marvel.loc[:,['whitelist_status', 'wls', 'removed_by_category', 'gallery_data',
       'is_gallery', 'media_metadata', 'author_cakeday',
       'author_flair_template_id', 'author_flair_text_color', 'media']]

Unnamed: 0,whitelist_status,wls,removed_by_category,gallery_data,is_gallery,media_metadata,author_cakeday,author_flair_template_id,author_flair_text_color,media
0,all_ads,6,,,,,,,,
1,all_ads,6,,,,,,,,
2,all_ads,6,,,,,,,,
3,all_ads,6,automod_filtered,,,,,,,
4,all_ads,6,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
14993,all_ads,6,reddit,,,,,,,
14994,all_ads,6,reddit,,,,,,,
14995,all_ads,6,automod_filtered,,,,,,,
14996,all_ads,6,reddit,,,,,,,


In [626]:
# Every column in this inspected group is removed.
marvel = marvel.drop(columns=[
    'whitelist_status',
    'wls',
    'removed_by_category',
    'gallery_data',
    'is_gallery',
    'media_metadata',
    'author_cakeday',
    'author_flair_template_id',
    'author_flair_text_color',
    'media',
])

In [627]:
marvel.loc[:,['media_embed', 'secure_media', 'secure_media_embed',
       'author_flair_background_color', 'edited', 'distinguished',
       'call_to_action', 'category', 'banned_by']]

Unnamed: 0,media_embed,secure_media,secure_media_embed,author_flair_background_color,edited,distinguished,call_to_action,category,banned_by
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
14993,,,,,,,,,
14994,,,,,,,,,
14995,,,,,,,,,
14996,,,,,,,,,


In [628]:
# Every column in this inspected group is removed.
marvel = marvel.drop(columns=[
    'media_embed',
    'secure_media',
    'secure_media_embed',
    'author_flair_background_color',
    'edited',
    'distinguished',
    'call_to_action',
    'category',
    'banned_by',
])

In [629]:
marvel.head()

Unnamed: 0,author,author_flair_type,author_fullname,is_video,link_flair_text,over_18,permalink,selftext,subreddit,thumbnail,title
0,naranjaPenguin21,text,t2_3kv7vbzi,False,Film/Television,False,/r/Marvel/comments/uc3kpq/tittle_announcement_...,,Marvel,https://b.thumbs.redditmedia.com/AG6UokqGK28fd...,"Tittle announcement for ""El Muerto"" As seen on..."
1,draconetzah,text,t2_f29uxj3j,False,Film/Television,False,/r/Marvel/comments/uc3hbk/logo_of_the_kraven_t...,,Marvel,https://b.thumbs.redditmedia.com/V_QvhKwKCSbUt...,Logo of the Kraven The Hunter film
2,Cosmic-Waldo,text,t2_4k4uyfid,False,Film/Television,False,/r/Marvel/comments/uc3crr/outofcontext_screens...,,Marvel,https://b.thumbs.redditmedia.com/QwlFoGri-J_IF...,Out-of-context screenshot from X-Men Evolution...
3,monkeytimess,text,t2_80scqray,False,Other,False,/r/Marvel/comments/uc3ciq/ironman_nano_gauntle...,[removed],Marvel,self,Iron-Man Nano gauntlet is acting weird
4,Danny007Boyy,text,t2_azcns707,False,Fan Made,False,/r/Marvel/comments/uc36uv/3_years_of_endgame_w...,,Marvel,https://b.thumbs.redditmedia.com/uIvOfHM4cQw3K...,3 years of EndGame. Where does this film rank ...


In [630]:
# Remove the author identifiers and the permalink as well.
marvel = marvel.drop(columns=[
    'author',
    'author_flair_type',
    'author_fullname',
    'permalink',
])

#### This is the final DF that I decided to work with (with regard to columns).

In [631]:
marvel.head()

Unnamed: 0,is_video,link_flair_text,over_18,selftext,subreddit,thumbnail,title
0,False,Film/Television,False,,Marvel,https://b.thumbs.redditmedia.com/AG6UokqGK28fd...,"Tittle announcement for ""El Muerto"" As seen on..."
1,False,Film/Television,False,,Marvel,https://b.thumbs.redditmedia.com/V_QvhKwKCSbUt...,Logo of the Kraven The Hunter film
2,False,Film/Television,False,,Marvel,https://b.thumbs.redditmedia.com/QwlFoGri-J_IF...,Out-of-context screenshot from X-Men Evolution...
3,False,Other,False,[removed],Marvel,self,Iron-Man Nano gauntlet is acting weird
4,False,Fan Made,False,,Marvel,https://b.thumbs.redditmedia.com/uIvOfHM4cQw3K...,3 years of EndGame. Where does this film rank ...


In [632]:
marvel.isna().sum()

is_video              0
link_flair_text       6
over_18               0
selftext           9704
subreddit             0
thumbnail             0
title                 0
dtype: int64

In [633]:
marvel.shape

(14998, 7)

In [635]:
# Keep every row whose body text is not the literal '[removed]' placeholder
# (rows with a missing body are kept as well).
marvel = marvel.loc[marvel['selftext'].ne('[removed]')]

In [636]:
marvel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13238 entries, 0 to 14997
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   is_video         13238 non-null  bool  
 1   link_flair_text  13233 non-null  object
 2   over_18          13238 non-null  bool  
 3   selftext         3534 non-null   object
 4   subreddit        13238 non-null  object
 5   thumbnail        13238 non-null  object
 6   title            13238 non-null  object
dtypes: bool(2), object(5)
memory usage: 646.4+ KB


In [637]:
# `marvel` is the result of a boolean filter at this point; rebinding to the
# de-duplicated frame avoids mutating that filtered selection in place (the
# pattern pandas flags with SettingWithCopyWarning). Row labels are renumbered
# 0..n-1 by ignore_index=True.
marvel = marvel.drop_duplicates(ignore_index=True)

In [638]:
marvel.shape

(13197, 7)

In [640]:
# Keep every row whose body text is not the literal '[deleted]' placeholder.
marvel = marvel.loc[marvel['selftext'].ne('[deleted]')]

In [641]:
# Drop every row with a missing value in any remaining column. Rebinding
# (instead of inplace=True) avoids mutating the boolean-filtered selection
# that `marvel` refers to at this point.
marvel = marvel.dropna()

In [642]:
# Renumber the remaining rows 0..n-1 and discard the old labels.
marvel = marvel.reset_index(drop=True)

In [643]:
marvel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3491 entries, 0 to 3490
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   is_video         3491 non-null   bool  
 1   link_flair_text  3491 non-null   object
 2   over_18          3491 non-null   bool  
 3   selftext         3491 non-null   object
 4   subreddit        3491 non-null   object
 5   thumbnail        3491 non-null   object
 6   title            3491 non-null   object
dtypes: bool(2), object(5)
memory usage: 143.3+ KB


In [644]:
marvel[marvel['selftext'] == '']

Unnamed: 0,is_video,link_flair_text,over_18,selftext,subreddit,thumbnail,title


In [645]:
marvel[marvel['thumbnail'] == 'self']

Unnamed: 0,is_video,link_flair_text,over_18,selftext,subreddit,thumbnail,title
1,False,Film/Television,False,"Alright, so I avoided this movie until today, ...",Marvel,self,"Green lantern, not that bad."
2,False,Comics,False,"So put together a Team using only D-Listers, l...",Marvel,self,Create a Team using D-List Heroes!
3,False,Comics,False,"In the comics, Nate Grey has gone out of the w...",Marvel,self,Interesting idea for Nate Grey storyline
4,False,Film/Television,False,After watching shows that depicted the realist...,Marvel,self,If Marvel shows were more mature the consequen...
5,False,Film/Television,False,About Thor taking The hammer in the past when ...,Marvel,self,Q about End Game
...,...,...,...,...,...,...,...
3485,False,Other,False,Recently Marvel did some NFT stuff with No Way...,Marvel,self,Anyone else a bit disgusted with the whole Mar...
3486,False,Film/Television,False,&amp;#x200B;\n\n*Processing img rpwd9u14x5881...*,Marvel,self,Tens of thousands of likes on each video but o...
3487,False,Comics,False,"Furthermore, what is the rationale behind stop...",Marvel,self,Hey what are the betting odds of Alpha Flight ...
3488,False,Comics,False,"After the little mini-series is up, are we loo...",Marvel,self,Is Conan done for a while?


In [646]:
marvel['thumbnail'].value_counts()

self                                                                                3002
spoiler                                                                              395
nsfw                                                                                  10
https://a.thumbs.redditmedia.com/2KpRk1IB8lVFKe0Ew5JFpkp7nI1XX8LKD7xVEHCc9F0.jpg       1
https://b.thumbs.redditmedia.com/ppAWS3BTM4bWAg0QIvzJpVZRo6guzXKq_P0V9VU2q3w.jpg       1
                                                                                    ... 
https://b.thumbs.redditmedia.com/Pr2N3GggPU892ZFq5UDl6g5U-zVEXKhqp5C2hHSw88E.jpg       1
https://a.thumbs.redditmedia.com/4hEnsUwBbqPjSlHZECCI4uBjVGsairc1tcGwfLwaKW4.jpg       1
https://b.thumbs.redditmedia.com/2dzJEfSyktzsn42q_yqQEzU2owDjNre-PE0M_3sfPvc.jpg       1
https://b.thumbs.redditmedia.com/JT8C9yQmvt7Z_DPSUlYbw5AvWzct1jFrU5J-BMolYvA.jpg       1
https://b.thumbs.redditmedia.com/Z1P4ohWEx9vEKkS6ZBjnB1aJA5_BOiJD9Jh-xRL5v_I.jpg       1
Name: thumbnail, Leng

##### After realizing that most entries in the "thumbnail" column were "self", I decided to drop the column and re-export the data.

In [647]:
# Drop the thumbnail column by reassignment instead of inplace=True: the result
# is a fresh frame, so no chained-assignment warning can fire if `marvel` was
# derived from a slice of another DataFrame.
marvel = marvel.drop(columns='thumbnail')

In [648]:
# Preview the first five rows to check the remaining columns.
marvel.head(n=5)

Unnamed: 0,is_video,link_flair_text,over_18,selftext,subreddit,title
0,False,Other,False,So in the venom 2 post credit venom says he re...,Marvel,plot hole
1,False,Film/Television,False,"Alright, so I avoided this movie until today, ...",Marvel,"Green lantern, not that bad."
2,False,Comics,False,"So put together a Team using only D-Listers, l...",Marvel,Create a Team using D-List Heroes!
3,False,Comics,False,"In the comics, Nate Grey has gone out of the w...",Marvel,Interesting idea for Nate Grey storyline
4,False,Film/Television,False,After watching shows that depicted the realist...,Marvel,If Marvel shows were more mature the consequen...


In [649]:
# Count how often each is_video value occurs.
is_video_counts = marvel['is_video'].value_counts()
is_video_counts

False    3491
Name: is_video, dtype: int64

##### Dropping the 'is_video' column since all values are False.

In [650]:
# Drop the constant is_video column by reassignment instead of inplace=True, so
# the cell produces a new frame rather than mutating one that may be a slice.
marvel = marvel.drop(columns='is_video')

In [651]:
# Preview the first five rows of the cleaned r/Marvel frame.
marvel.head(n=5)

Unnamed: 0,link_flair_text,over_18,selftext,subreddit,title
0,Other,False,So in the venom 2 post credit venom says he re...,Marvel,plot hole
1,Film/Television,False,"Alright, so I avoided this movie until today, ...",Marvel,"Green lantern, not that bad."
2,Comics,False,"So put together a Team using only D-Listers, l...",Marvel,Create a Team using D-List Heroes!
3,Comics,False,"In the comics, Nate Grey has gone out of the w...",Marvel,Interesting idea for Nate Grey storyline
4,Film/Television,False,After watching shows that depicted the realist...,Marvel,If Marvel shows were more mature the consequen...


#### Exporting Cleaned DF

In [652]:
# Write the cleaned r/Marvel frame to CSV; the row index is not written out.
marvel.to_csv(path_or_buf='../datasets/marvel_cleaned.csv', index=False)

#### Cleaning r/harrypotter Next

In [2]:
# Load the r/harrypotter data from its CSV file.
h_potter = pd.read_csv(filepath_or_buffer='../datasets/h_potter.csv')

##### Removing the same columns that I did with Marvel, by keeping only the columns of interest.

In [654]:
# Columns retained for the r/harrypotter frame; every other column is discarded.
kept_columns = [
    'is_video',
    'link_flair_text',
    'over_18',
    'selftext',
    'subreddit',
    'thumbnail',
    'title',
]
h_potter = h_potter[kept_columns]
h_potter

Unnamed: 0,is_video,link_flair_text,over_18,selftext,subreddit,thumbnail,title
0,False,,False,[removed],harrypotter,self,Check it out
1,False,Discussion,False,Asking for a friend.,harrypotter,self,Could protego stop a nuclear blast?
2,False,Discussion,False,SPOILERS FOR GOBLET OF FIRE\n\n&amp;#x200B;\n\...,harrypotter,self,Goblet of Fire Rant/Discussion Post
3,False,Discussion,False,,harrypotter,self,Is circumcision common in The Wizarding World?
4,False,,False,,harrypotter,self,Anyone else get a kick out the wardrobe for ad...
...,...,...,...,...,...,...,...
14994,False,Discussion,False,"Of all the seven novels, Goblet of Fire is sub...",harrypotter,self,Goblet of Fire is the worst book
14995,False,,False,[removed],harrypotter,self,Which type is Evanna Lynch? (ft. @CognitiveTyp...
14996,False,,False,"*All Harry’s spellbooks, his wand, robes, caul...",harrypotter,self,How could Harry have opened Edwig's cage if he...
14997,False,Help,False,Why did they wear it in turns knowing that it ...,harrypotter,self,Question about horcrux Salazar Slytherin's Locket


In [655]:
# Discard rows whose selftext is the '[removed]' or '[deleted]' placeholder.
# Rows with a missing (NaN) selftext are not matched and therefore stay.
placeholder_bodies = ['[removed]', '[deleted]']
h_potter = h_potter[~h_potter['selftext'].isin(placeholder_bodies)]

In [656]:
# Summarize dtypes and per-column non-null counts to see where values are missing.
h_potter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14700 entries, 1 to 14997
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   is_video         14700 non-null  bool  
 1   link_flair_text  9248 non-null   object
 2   over_18          14700 non-null  bool  
 3   selftext         9651 non-null   object
 4   subreddit        14700 non-null  object
 5   thumbnail        14700 non-null  object
 6   title            14700 non-null  object
dtypes: bool(2), object(5)
memory usage: 717.8+ KB


In [657]:
# Remove exact duplicate rows and renumber the index from 0. Reassigning
# (rather than inplace=True) avoids mutating a frame that was produced by
# boolean filtering, which is what triggers pandas' chained-assignment warning.
h_potter = h_potter.drop_duplicates(ignore_index=True)

In [658]:
# (rows, columns) of the frame at this point in the cleaning.
h_potter.shape

(14638, 7)

In [659]:
# Drop every row that has a missing value in ANY column. With the default
# arguments this removes rows lacking link_flair_text as well as rows lacking
# selftext. Reassigning instead of inplace=True keeps the step free of
# chained-assignment warnings.
h_potter = h_potter.dropna()

In [660]:
# Re-check dtypes and non-null counts; every column should now be fully populated.
h_potter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6109 entries, 0 to 14637
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   is_video         6109 non-null   bool  
 1   link_flair_text  6109 non-null   object
 2   over_18          6109 non-null   bool  
 3   selftext         6109 non-null   object
 4   subreddit        6109 non-null   object
 5   thumbnail        6109 non-null   object
 6   title            6109 non-null   object
dtypes: bool(2), object(5)
memory usage: 298.3+ KB


In [662]:
# Look for rows whose selftext is an empty string (as opposed to missing).
empty_body_mask = h_potter['selftext'].eq('')
h_potter[empty_body_mask]

Unnamed: 0,is_video,link_flair_text,over_18,selftext,subreddit,thumbnail,title


In [663]:
# Preview the first five remaining rows.
h_potter.head(n=5)

Unnamed: 0,is_video,link_flair_text,over_18,selftext,subreddit,thumbnail,title
0,False,Discussion,False,Asking for a friend.,harrypotter,self,Could protego stop a nuclear blast?
1,False,Discussion,False,SPOILERS FOR GOBLET OF FIRE\n\n&amp;#x200B;\n\...,harrypotter,self,Goblet of Fire Rant/Discussion Post
4,False,Discussion,False,Watching HBP and seeing him in the orphanage g...,harrypotter,self,do you think voldemort would have been an obsc...
6,False,Question,False,I can’t remember where I read that half-bloods...,harrypotter,self,Can a pure-blood be a metamorphmagus?
10,False,Discussion,False,How different would the books be?,harrypotter,self,Do you guys think the franchise would be what ...


In [664]:
# (rows, columns) of the frame at this point in the cleaning.
h_potter.shape

(6109, 7)

In [665]:
# Show the rows whose thumbnail value is exactly 'self'.
self_thumbnail_mask = h_potter['thumbnail'].eq('self')
h_potter[self_thumbnail_mask]

Unnamed: 0,is_video,link_flair_text,over_18,selftext,subreddit,thumbnail,title
0,False,Discussion,False,Asking for a friend.,harrypotter,self,Could protego stop a nuclear blast?
1,False,Discussion,False,SPOILERS FOR GOBLET OF FIRE\n\n&amp;#x200B;\n\...,harrypotter,self,Goblet of Fire Rant/Discussion Post
4,False,Discussion,False,Watching HBP and seeing him in the orphanage g...,harrypotter,self,do you think voldemort would have been an obsc...
6,False,Question,False,I can’t remember where I read that half-bloods...,harrypotter,self,Can a pure-blood be a metamorphmagus?
10,False,Discussion,False,How different would the books be?,harrypotter,self,Do you guys think the franchise would be what ...
...,...,...,...,...,...,...,...
14630,False,Discussion,False,We know that Voldemort's appearance after the ...,harrypotter,self,What was Voldemort's appearance before he was ...
14633,False,Misc,False,In Harry Potter and the Prisoner of Azkaban yo...,harrypotter,self,Ministry of Silly Walks
14634,False,Misc,False,I really hope she likes it. She's turning 13 i...,harrypotter,self,I've talked my sister into reading HP for the ...
14635,False,Discussion,False,"Of all the seven novels, Goblet of Fire is sub...",harrypotter,self,Goblet of Fire is the worst book


##### As most entries in the "thumbnail" column are "self", I decided to drop the entire column.

In [666]:
# Drop the thumbnail column by reassignment instead of inplace=True, so the
# cell yields a new frame rather than mutating one derived from earlier
# filtering (the usual source of pandas' chained-assignment warning).
h_potter = h_potter.drop(columns='thumbnail')

In [667]:
# Preview the first five rows to check the remaining columns.
h_potter.head(n=5)

Unnamed: 0,is_video,link_flair_text,over_18,selftext,subreddit,title
0,False,Discussion,False,Asking for a friend.,harrypotter,Could protego stop a nuclear blast?
1,False,Discussion,False,SPOILERS FOR GOBLET OF FIRE\n\n&amp;#x200B;\n\...,harrypotter,Goblet of Fire Rant/Discussion Post
4,False,Discussion,False,Watching HBP and seeing him in the orphanage g...,harrypotter,do you think voldemort would have been an obsc...
6,False,Question,False,I can’t remember where I read that half-bloods...,harrypotter,Can a pure-blood be a metamorphmagus?
10,False,Discussion,False,How different would the books be?,harrypotter,Do you guys think the franchise would be what ...


In [668]:
# List the distinct values present in the is_video column.
video_flags = h_potter['is_video']
video_flags.unique()

array([False])

##### Dropping the 'is_video' column since all values are False.

In [669]:
# Drop the constant is_video column by reassignment instead of inplace=True, so
# the cell produces a new frame rather than mutating one that came from a slice.
h_potter = h_potter.drop(columns='is_video')

In [670]:
# Preview the first five rows of the cleaned r/harrypotter frame.
h_potter.head(n=5)

Unnamed: 0,link_flair_text,over_18,selftext,subreddit,title
0,Discussion,False,Asking for a friend.,harrypotter,Could protego stop a nuclear blast?
1,Discussion,False,SPOILERS FOR GOBLET OF FIRE\n\n&amp;#x200B;\n\...,harrypotter,Goblet of Fire Rant/Discussion Post
4,Discussion,False,Watching HBP and seeing him in the orphanage g...,harrypotter,do you think voldemort would have been an obsc...
6,Question,False,I can’t remember where I read that half-bloods...,harrypotter,Can a pure-blood be a metamorphmagus?
10,Discussion,False,How different would the books be?,harrypotter,Do you guys think the franchise would be what ...


#### Exporting Cleaned DF

In [671]:
# Write the cleaned r/harrypotter frame to CSV; the row index is not written out.
h_potter.to_csv(path_or_buf='../datasets/h_potter_cleaned.csv', index=False)

#### Concatenating both subreddit DataFrames into one DataFrame.

In [672]:
# Stack the two cleaned frames row-wise. Each input still carries the row
# labels left over from its own filtering, so without ignore_index=True the
# combined frame would contain duplicate index labels (e.g. two rows labelled 0)
# and label-based lookups would return several rows. Renumber from 0 instead.
subreddit_combined = pd.concat([marvel, h_potter], ignore_index=True)

#### Exporting Combined DF

In [674]:
# Write the combined two-subreddit frame to CSV; the row index is not written out.
subreddit_combined.to_csv(path_or_buf='../datasets/subreddit_combined.csv', index=False)