# Read in libraries and Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
sci_comments = pd.read_csv('./data/raw_data/posts_master_science_comments.csv')
tech_comments = pd.read_csv('./data/raw_data/posts_master_technology_comments.csv')

# Science Data Frame Cleaning

In [3]:
sci_comments.head(3)

Unnamed: 0.1,Unnamed: 0,body,subreddit,created_utc,author,permalink,id,num_crossposts,total_awards_received,score
0,0,[removed],science,1610798385,[deleted],,,,,
1,1,I'm the oogey boogey man,science,1610798381,Rightintwo7,,,,,
2,2,"I mean, it’s not only small conveniences. Imag...",science,1610798369,Elfetzo,,,,,


### Null values

In [4]:
sci_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             5000 non-null   int64  
 1   body                   5000 non-null   object 
 2   subreddit              5000 non-null   object 
 3   created_utc            5000 non-null   int64  
 4   author                 5000 non-null   object 
 5   permalink              0 non-null      float64
 6   id                     0 non-null      float64
 7   num_crossposts         0 non-null      float64
 8   total_awards_received  0 non-null      float64
 9   score                  0 non-null      float64
dtypes: float64(5), int64(2), object(3)
memory usage: 390.8+ KB


### Drop columns

In [5]:
# drop all null columns
sci_comments.drop(columns = 'Unnamed: 0', inplace=True)
sci_comments.drop(columns = 'permalink', inplace=True)
sci_comments.drop(columns = 'id', inplace=True)
sci_comments.drop(columns = 'num_crossposts', inplace=True)
sci_comments.drop(columns = 'total_awards_received', inplace=True)
sci_comments.drop(columns = 'score', inplace=True)

### Value Counts

In [6]:
sci_comments['subreddit'].value_counts()

science    5000
Name: subreddit, dtype: int64

In [7]:
sci_comments['body'].value_counts()

[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           1609
[deleted]                                                                                                                                                                                                                                                                                                                                                                                                      

- Possible duplicate posts

In [8]:
sci_comments['author'].value_counts()

[deleted]            1736
merithynos             26
EscapeVelocity83       15
shiruken               15
trevor32192            15
                     ... 
kingofafro              1
American_Comrad69       1
tokyojunior             1
schweez                 1
pinksandstrom           1
Name: author, Length: 2189, dtype: int64

# Technology Data Cleaning

In [9]:
tech_comments.head(3)

Unnamed: 0.1,Unnamed: 0,body,subreddit,created_utc,author,permalink,id,num_crossposts,total_awards_received,score
0,0,&gt; The try really doesn’t matter if the end ...,technology,1610798375,flybypost,,,,,
1,1,Hello! **Please read this message very careful...,technology,1610798373,AutoModerator,,,,,
2,2,Societal media without overarching censorship?...,technology,1610798365,youreillusional,,,,,


### Null Values

In [10]:
tech_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             5000 non-null   int64  
 1   body                   5000 non-null   object 
 2   subreddit              5000 non-null   object 
 3   created_utc            5000 non-null   int64  
 4   author                 5000 non-null   object 
 5   permalink              0 non-null      float64
 6   id                     0 non-null      float64
 7   num_crossposts         0 non-null      float64
 8   total_awards_received  0 non-null      float64
 9   score                  0 non-null      float64
dtypes: float64(5), int64(2), object(3)
memory usage: 390.8+ KB


### Drop Columns

In [11]:
#drop all null columns
tech_comments.drop(columns = 'Unnamed: 0', inplace=True)
tech_comments.drop(columns = 'permalink', inplace=True)
tech_comments.drop(columns = 'id', inplace=True)
tech_comments.drop(columns = 'num_crossposts', inplace=True)
tech_comments.drop(columns = 'total_awards_received', inplace=True)
tech_comments.drop(columns = 'score', inplace=True)

### Value Counts

In [12]:
tech_comments['subreddit'].value_counts()

technology    5000
Name: subreddit, dtype: int64

In [13]:
tech_comments['body'].value_counts()

[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

- May have duplicate posts

In [14]:
tech_comments['author'].value_counts()

[deleted]               446
MegaAcumen               62
bruhsbruhsbruhs          60
AutoModerator            52
simpathyforthedevil      26
                       ... 
jetforcegemini            1
Method__Man               1
DanReach                  1
blendedfarley             1
Reddit_as_Screenplay      1
Name: author, Length: 2789, dtype: int64

- many [deleted] posts

In [15]:
comments_df = pd.concat(objs=[sci_comments, tech_comments], axis=0)
comments_df.shape

(10000, 4)

In [16]:
comments_df

Unnamed: 0,body,subreddit,created_utc,author
0,[removed],science,1610798385,[deleted]
1,I'm the oogey boogey man,science,1610798381,Rightintwo7
2,"I mean, it’s not only small conveniences. Imag...",science,1610798369,Elfetzo
3,There is a growing consensus among scientists ...,science,1610798359,Wagamaga
4,I wouldn't put your hopes on it. I work with n...,science,1610798294,Down_The_Rabbithole
...,...,...,...,...
4995,I'm saying insurrection isn't a valid reason t...,technology,1610745997,ApocDream
4996,"hate to be a debby downer, but its like the ca...",technology,1610745995,rolex_chaser
4997,for a while our drummer used to reherse with a...,technology,1610745992,NightRavenFSZ
4998,[deleted],technology,1610745986,[deleted]


### Drop duplicates

In [17]:
comments_df = comments_df.drop_duplicates(subset=['body'])

In [18]:
comments_df.shape

(7730, 4)

In [19]:
sci_comments = sci_comments.drop_duplicates(subset=['body'])

In [20]:
sci_comments.shape

(3246, 4)

In [21]:
sci_comments.head(3)

Unnamed: 0,body,subreddit,created_utc,author
0,[removed],science,1610798385,[deleted]
1,I'm the oogey boogey man,science,1610798381,Rightintwo7
2,"I mean, it’s not only small conveniences. Imag...",science,1610798369,Elfetzo


In [22]:
tech_comments = tech_comments.drop_duplicates(subset=['body'])

In [23]:
tech_comments.shape

(4486, 4)

In [24]:
tech_comments.head(3)

Unnamed: 0,body,subreddit,created_utc,author
0,&gt; The try really doesn’t matter if the end ...,technology,1610798375,flybypost
1,Hello! **Please read this message very careful...,technology,1610798373,AutoModerator
2,Societal media without overarching censorship?...,technology,1610798365,youreillusional


# Export Clean & Combined Datasets

In [25]:
sci_comments.to_csv('./data/science_comments_clean.csv', index=False)

In [26]:
tech_comments.to_csv('./data/technology_comments_clean.csv', index=False)

In [27]:
comments_df.to_csv('./data/comments_combined_clean.csv', index=False)