## Import Libraries

In [1]:
import regex as re
import pandas as pd
from bs4 import BeautifulSoup

### Set Up Options

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Import dataset

In [1476]:
df=pd.read_csv('../datasets/subreddit_submissions.csv')

In [1477]:
df.head()

Unnamed: 0,id,title,selftext,subreddit
0,hnsorg,Daredevil by Bill Sienkiewicz,,Marvel
1,hnsqr9,That's probably the most obvious example of Ul...,,Marvel
2,hnsvgp,Picture of Spidey I drew...,[removed],Marvel
3,hnswrl,Just a little drawing I did,,Marvel
4,hnt890,That suit did nothing for his ass.,,Marvel


In [1478]:
#Explore unique values for 'title' and 'selftext' columns

In [1479]:
df['title'].nunique()

2957

In [1480]:
df['selftext'].nunique()

647

In [1481]:
df['id'].nunique()

3000

In [1482]:
#Since 'selftext' has a lot of non unique values, it will be dropped
df.drop(columns='selftext',inplace=True)

In [1483]:
df.head()

Unnamed: 0,id,title,subreddit
0,hnsorg,Daredevil by Bill Sienkiewicz,Marvel
1,hnsqr9,That's probably the most obvious example of Ul...,Marvel
2,hnsvgp,Picture of Spidey I drew...,Marvel
3,hnswrl,Just a little drawing I did,Marvel
4,hnt890,That suit did nothing for his ass.,Marvel


In [1484]:
df.isnull().sum()

id           0
title        0
subreddit    0
dtype: int64

In [1485]:
#Check titles for automated comments by moderators

In [1486]:
df[df['title'].str.contains('Your post has been automatically removed')]

Unnamed: 0,id,title,subreddit


In [1487]:
df[df['title'].str.contains('PLEASE READ')]

Unnamed: 0,id,title,subreddit


In [1488]:
#Removing unwanted artifacts like html,http,www etc

In [1489]:
df[:100]

Unnamed: 0,id,title,subreddit
0,hnsorg,Daredevil by Bill Sienkiewicz,Marvel
1,hnsqr9,That's probably the most obvious example of Ul...,Marvel
2,hnsvgp,Picture of Spidey I drew...,Marvel
3,hnswrl,Just a little drawing I did,Marvel
4,hnt890,That suit did nothing for his ass.,Marvel
5,hntsu3,Research on the X-Men,Marvel
6,hnu2c9,How to collect the current X-Men series,Marvel
7,hnubpe,There's no official Funko of my favorite villa...,Marvel
8,hnudjl,Something random,Marvel
9,hnumpm,"May I present Cable, Cable and Cable",Marvel


In [1490]:
#Drop rows where 'title' is duplicate
df.drop_duplicates(subset='title', keep='first', inplace=True)

In [1491]:
#matches the unique 'title' count
df.shape

(2957, 3)

In [1492]:
#Search for https
df[df['title'].str.contains('https:')]

Unnamed: 0,id,title,subreddit
190,gzmggj,My new channel! https://youtu.be/plwvG4Z158Y,Marvel
743,e6igwv,IF you love marvel the only thing you couldn't...,Marvel
784,e6qhll,i’ll just leave this here. it’s captain marvel...,Marvel


In [1493]:
#clear unwanted elements
df['title'] = df['title'].str.replace('https://', '').astype('str')

In [1494]:
df[df['title'].str.contains('https:')]

Unnamed: 0,id,title,subreddit


In [1495]:
df[df['title'].str.contains('/r/')]

Unnamed: 0,id,title,subreddit
1050,cz5c6z,A Pathfinder RPG Character Conversion Guide Fo...,Marvel
2152,eki5v7,Here they are: your /r/DCcomics Best of 2019 W...,DCcomics
2214,e6b7yt,/r/dccomics enters TOP 5000 subreddits,DCcomics


In [1496]:
df['title'] = df['title'].str.replace('/r/', '').astype('str')

In [1497]:
df[df['title'].str.contains('/r/')]

Unnamed: 0,id,title,subreddit


In [1498]:
df['title'][2152]

'Here they are: your DCcomics Best of 2019 Winners!'

In [1499]:
df[df['title'].str.contains('@[a-zA-Z0-9\.\_]*')]['title']

7       There's no official Funko of my favorite villa...
35             It's past bedtime Groot! By Me: @sornetart
75      FAN ART of Spider-Man swinging through the cit...
175        MK85 "endgame", video challenge. / IG @caiocdk
185     My first piece of my comic leg. My favourite p...
250     I posted this Dr. Strange artwork I did a whil...
255     Logan by @bananashapedhelicopter, could use so...
323     Who's Hyped For This (art frm my insta @abgrafix)
360                      the berserker (ig @tonywhiteart)
464     Avengers inspired tattoo on me by @klaaane on ...
469     Thanos Wallpaper (Series 1) #3 By @RedHawksFF ...
505            "The First Avenger" by IG: @bobasaurusfett
803     A marvel WHAT IF.. with Iron Man and Terninato...
842     Spiderman into the spider-verse 2 GLITCH POSTE...
862                Pretty Dope Wanda Cosplay by @marz.z.z
883     Spiderman into the spider-verse 2 GLITCH😍😍 POS...
917     My first attempt at Deadpool. I know its not c...
1000    Made a

In [1500]:
df['title']=df['title'].str.replace('@[a-zA-Z0-9\.\_]*','',).astype('str')

In [1501]:
df[df['title'].str.contains('@[a-zA-Z0-9\.\_]*')]['title']

Series([], Name: title, dtype: object)

In [1502]:
df['title'][35]

"It's past bedtime Groot! By Me: "

In [1503]:
df[df['title'].str.contains('\d')]['title']

1       That's probably the most obvious example of Ul...
12      [Agents of S.H.I.E.L.D.] S07E07 - "The Totally...
13      Russell Dauterman is wonderful. [Giant-Size X-...
14      Cake day gifts for my real cake day 5 days bef...
19      Are "The Union", "Strikeforce #10" and "The Ma...
26      Zub confirms Black Panther and the Agents of W...
27      11 hours worth of work and 5 of them being a a...
32                      Infinity Portfolio Sample Page #1
37      So I’ve had this T-shirt since 1998 and it’s t...
44                                  Marvel 1670-Spiderman
46      r/Marvel Flashback Discussion #15 - Ed Brubake...
47      Iron Man faces off against the Mandarin's "Ult...
49      This Week in Comics #22 - JUL 8 2020 - X-FORCE...
56                      IRON MAN (2008) by Sahin Duezguen
58      Thor #6 variant cover by Gabriele Dell'Otto [A...
60                   Preview: iWolverine #1 by Larry Hama
67      Question for deciding between Ultimate Marvel ...
69      Agents

In [1504]:
df[df['title'].str.contains('#')]['title']

1       That's probably the most obvious example of Ul...
13      Russell Dauterman is wonderful. [Giant-Size X-...
19      Are "The Union", "Strikeforce #10" and "The Ma...
26      Zub confirms Black Panther and the Agents of W...
32                      Infinity Portfolio Sample Page #1
46      r/Marvel Flashback Discussion #15 - Ed Brubake...
47      Iron Man faces off against the Mandarin's "Ult...
49      This Week in Comics #22 - JUL 8 2020 - X-FORCE...
58      Thor #6 variant cover by Gabriele Dell'Otto [A...
60                   Preview: iWolverine #1 by Larry Hama
76                       Immortal Hulk Threshing Place #1
79            Darkseid destroys the DC universe (Thor #2)
87                                [Fan Art] #REVIVESCREAM
97             Any idea if morbius #6 is still happening?
98      What futuristic tech looked like in 2005 - Iro...
119     Only 1 Out Of The 5 Worst Corona-Affected Stat...
130     The lengths creators go to in order to justify...
143           

In [1505]:
df['title']=df['title'].str.replace('\d','',).astype('str')

In [1506]:
df['title']=df['title'].str.replace('#','',).astype('str')

In [1507]:
df['title'][79]

'Darkseid destroys the DC universe (Thor )'

In [1508]:
df[df['title'].str.contains(' r\/')]

Unnamed: 0,id,title,subreddit
2764,c8t55g,Pleased to announce that r/Starfire is now an ...,DCcomics


In [1509]:
df['title']=df['title'].str.replace('r\/',' ').astype('str')

In [1511]:
df['title'][2764]

'Pleased to announce that  Starfire is now an active community!'

In [1512]:
#Remove Punctuation

In [1513]:
df[:50]

Unnamed: 0,id,title,subreddit
0,hnsorg,Daredevil by Bill Sienkiewicz,Marvel
1,hnsqr9,That's probably the most obvious example of Ul...,Marvel
2,hnsvgp,Picture of Spidey I drew...,Marvel
3,hnswrl,Just a little drawing I did,Marvel
4,hnt890,That suit did nothing for his ass.,Marvel
5,hntsu3,Research on the X-Men,Marvel
6,hnu2c9,How to collect the current X-Men series,Marvel
7,hnubpe,There's no official Funko of my favorite villa...,Marvel
8,hnudjl,Something random,Marvel
9,hnumpm,"May I present Cable, Cable and Cable",Marvel


In [1514]:
df["title"] = df['title'].str.replace('[^\w\s]','')

In [1515]:
df[:50]

Unnamed: 0,id,title,subreddit
0,hnsorg,Daredevil by Bill Sienkiewicz,Marvel
1,hnsqr9,Thats probably the most obvious example of Ult...,Marvel
2,hnsvgp,Picture of Spidey I drew,Marvel
3,hnswrl,Just a little drawing I did,Marvel
4,hnt890,That suit did nothing for his ass,Marvel
5,hntsu3,Research on the XMen,Marvel
6,hnu2c9,How to collect the current XMen series,Marvel
7,hnubpe,Theres no official Funko of my favorite villai...,Marvel
8,hnudjl,Something random,Marvel
9,hnumpm,May I present Cable Cable and Cable,Marvel


In [1516]:
#Words with low significance
df[df['title'].str.contains('S[pP][oO][iI][lL][eE][rR][sS]|[aA][rR][tT][wW][oO][rR][kK]|E[xX][cC][eE][rR][pP][tT]')].head()

Unnamed: 0,id,title,subreddit
12,hnuoe9,Agents of SHIELD SE The Totally Excellent Adv...,Marvel
58,ho731x,Thor variant cover by Gabriele DellOtto Artwork,Marvel
63,ho844p,ARTWORK marvel vs capcom inspired carnage and ...,Marvel
213,gfl0ij,Ive spent hours trying to find all the charac...,Marvel
250,gfw4cj,I posted this Dr Strange artwork I did a while...,Marvel


In [1517]:
df["title"] = df['title'].str.replace('S[pP][oO][iI][lL][eE][rR][sS]|A[rR][tT][wW][oO][rR][kK]|E[xX][cC][eE][rR][pP][tT]','')

In [1518]:
df['title'][19]

'Are The Union Strikeforce  and The Marvels Still Going To Be Released'

In [1519]:
df[df['title'].str.contains("\s\s+")].head()

Unnamed: 0,id,title,subreddit
1,hnsqr9,Thats probably the most obvious example of Ult...,Marvel
7,hnubpe,Theres no official Funko of my favorite villai...,Marvel
12,hnuoe9,Agents of SHIELD SE The Totally Excellent Adv...,Marvel
14,hnvc4p,Cake day gifts for my real cake day days befo...,Marvel
19,hnw5sb,Are The Union Strikeforce and The Marvels Sti...,Marvel


In [1520]:
df['title'][19]

'Are The Union Strikeforce  and The Marvels Still Going To Be Released'

In [1521]:
df["title"] = df['title'].str.replace("\s\s+",' ')

In [1522]:
df['title'][19]

'Are The Union Strikeforce and The Marvels Still Going To Be Released'

In [1523]:
df['title']=df['title'].str.replace(r'\b\w\b','')

In [1524]:
#after cleaning look for empty strings in Titles
len(df[df['title']==''])

3

In [1525]:
df.loc[df['title']=='']

Unnamed: 0,id,title,subreddit
780,e6pfbn,,Marvel
797,e6tlbs,,Marvel
1131,clgzob,,Marvel


In [1526]:
#drop empty title rows
df.drop([780,797,1131],inplace=True)

In [1527]:
df.shape

(2954, 3)

In [1528]:
len(df[df['title']==''])

0

In [1529]:
#check for Bot posts

In [1530]:
df[df['title'].str.contains(" [bB][oO][tT] ")]

Unnamed: 0,id,title,subreddit
841,drj1hb,Купить продукты цветы за криптовалюту Prizm ч...,Marvel


In [1531]:
#Drop row with non english strings
df.drop([841],inplace=True)

In [1532]:
#Fix Index
df.reset_index(inplace=True)

In [1533]:
df.shape

(2953, 4)

In [1453]:
df

Unnamed: 0,index,id,title,subreddit
0,0,hnsorg,Daredevil by Bill Sienkiewicz,Marvel
1,1,hnsqr9,Thats probably the most obvious example of Ult...,Marvel
2,2,hnsvgp,Picture of Spidey drew,Marvel
3,3,hnswrl,Just little drawing did,Marvel
4,4,hnt890,That suit did nothing for his ass,Marvel
5,5,hntsu3,Research on the XMen,Marvel
6,6,hnu2c9,How to collect the current XMen series,Marvel
7,7,hnubpe,Theres no official Funko of my favorite villai...,Marvel
8,8,hnudjl,Something random,Marvel
9,9,hnumpm,May present Cable Cable and Cable,Marvel


In [1535]:
df.drop(columns=['index'],inplace=True)

In [1536]:
df

Unnamed: 0,id,title,subreddit
0,hnsorg,Daredevil by Bill Sienkiewicz,Marvel
1,hnsqr9,Thats probably the most obvious example of Ult...,Marvel
2,hnsvgp,Picture of Spidey drew,Marvel
3,hnswrl,Just little drawing did,Marvel
4,hnt890,That suit did nothing for his ass,Marvel
5,hntsu3,Research on the XMen,Marvel
6,hnu2c9,How to collect the current XMen series,Marvel
7,hnubpe,Theres no official Funko of my favorite villai...,Marvel
8,hnudjl,Something random,Marvel
9,hnumpm,May present Cable Cable and Cable,Marvel


In [1537]:
#Cleaned Marvel data csv
marvel_df=df.iloc[0:1482].copy()

In [1538]:
marvel_df.shape

(1482, 3)

In [1539]:
marvel_df

Unnamed: 0,id,title,subreddit
0,hnsorg,Daredevil by Bill Sienkiewicz,Marvel
1,hnsqr9,Thats probably the most obvious example of Ult...,Marvel
2,hnsvgp,Picture of Spidey drew,Marvel
3,hnswrl,Just little drawing did,Marvel
4,hnt890,That suit did nothing for his ass,Marvel
5,hntsu3,Research on the XMen,Marvel
6,hnu2c9,How to collect the current XMen series,Marvel
7,hnubpe,Theres no official Funko of my favorite villai...,Marvel
8,hnudjl,Something random,Marvel
9,hnumpm,May present Cable Cable and Cable,Marvel


In [1540]:
#Export cleaned marvel data into a new csv file
marvel_df.to_csv('marvel_submissions_cleaned.csv', index=False)

In [1541]:
#Cleaned DCComics data csv
dc_df=df.iloc[1483:2952].copy()

In [1542]:
dc_df.reset_index(inplace=True)

In [1543]:
dc_df

Unnamed: 0,index,id,title,subreddit
0,1483,hnsnr2,Couple of Chuck Dixon questions,DCcomics
1,1484,hnsrpm,Tony Daniel Some WIP from DeathMetal Legends...,DCcomics
2,1485,hnstjs,Theatrical JL vs Snyder Cut JL,DCcomics
3,1486,hnsvxj,DC Post Death Metal,DCcomics
4,1487,hnt2xa,DC cosmic characters,DCcomics
5,1488,hnt6cx,Television think Renee Montoya would have bee...,DCcomics
6,1489,hnt81i,Discussion Would going with Renee Montoya have...,DCcomics
7,1490,hntx8p,Fan Art Did little Aquaman Logo,DCcomics
8,1491,hnu9nq,Martha joker Wayne from datrinti,DCcomics
9,1492,hnugbd,The Curious Case of Apache Chief Featuring The...,DCcomics


In [1548]:
dc_df.drop(columns=['index'],inplace=True)

In [1549]:
#Export cleaned dc data into a new csv file
dc_df.to_csv('../datasets/dc_submissions_cleaned.csv', index=False)

In [1550]:
df.shape

(2953, 3)

In [1547]:
#Export cleaned data into a new csv file
df.to_csv('../datasets/cleaned_submissions.csv', index=False)