# Read in libraries and Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
science_df = pd.read_csv('./data/raw_data/reddit_science.csv')
technology_df = pd.read_csv('./data/raw_data/reddit_technology.csv')

# Science Data Frame Cleaning

In [3]:
science_df.head(3)

Unnamed: 0.1,Unnamed: 0,subreddit,title,selftext,author,created_utc,upvote_ratio,full_link,num_comments,num_crossposts,total_awards_received,score,over_18
0,0,science,A Climate in Crisis Calls for Investment in Di...,,Wagamaga,1610798299,1.0,https://www.reddit.com/r/science/comments/kyhp...,6,0,0,1,
1,1,science,Cureus | Telemedicine: Current Impact on the F...,,CureusJournal,1610798085,1.0,https://www.reddit.com/r/science/comments/kyho...,0,0,0,1,
2,2,science,Magnetic reconnection as a mechanism for energ...,,m3prx,1610792298,1.0,https://www.reddit.com/r/science/comments/kygi...,2,0,0,1,


### Null values

In [4]:
science_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             5000 non-null   int64  
 1   subreddit              5000 non-null   object 
 2   title                  5000 non-null   object 
 3   selftext               128 non-null    object 
 4   author                 5000 non-null   object 
 5   created_utc            5000 non-null   int64  
 6   upvote_ratio           5000 non-null   float64
 7   full_link              5000 non-null   object 
 8   num_comments           5000 non-null   int64  
 9   num_crossposts         5000 non-null   int64  
 10  total_awards_received  5000 non-null   int64  
 11  score                  5000 non-null   int64  
 12  over_18                0 non-null      float64
dtypes: float64(2), int64(6), object(5)
memory usage: 507.9+ KB


- Mostly null values in 'selftext'
- All null values in over_18

### Drop columns

In [5]:
science_df.drop(columns = 'Unnamed: 0', inplace=True)
science_df.drop(columns = 'over_18', inplace=True)
science_df.drop(columns = 'selftext', inplace=True)

### Value Counts

In [6]:
science_df['subreddit'].value_counts()

science    5000
Name: subreddit, dtype: int64

In [7]:
science_df['title'].value_counts()

Scientists believe that the function of zebras' stripes are to deter insects, so a team of researchers painted black and white stripes on cows. They found that it reduced the number of biting flies landing on the cows by more than 50%.                                                                    19
Scientists replaced 40 percent of cement with rice husk cinder, limestone crushing waste, and silica sand, giving concrete a rubber-like quality, six to nine times more crack-resistant than regular concrete. It self-seals, replaces cement with plentiful waste products, and should be cheaper to use.    18
No evidence to support link between violent video games and behaviour - Researchers at the University of York have found no evidence to support the theory that video games make players more violent.                                                                                                         16
From 2007 to 2017, the number of suicides among people ages 10 to 24 increased 56 

- Possible duplicate posts

In [8]:
science_df['author'].value_counts()

mvea               276
rustoo             166
Wagamaga           142
[deleted]          123
MistWeaver80       104
                  ... 
ac13057              1
myelini              1
cybershocker455      1
the_battousai89      1
barwick11            1
Name: author, Length: 2080, dtype: int64

- Noticed a select few authors have submitted a lot of posts

In [9]:
science_df['upvote_ratio'].value_counts()

1.00    4694
0.50      25
0.67      21
0.75      21
0.99      20
        ... 
0.61       1
0.17       1
0.20       1
0.42       1
0.63       1
Name: upvote_ratio, Length: 61, dtype: int64

- most upvotes ratio are 1, possibly not updated as this is a recent dataset

In [10]:
science_df['num_comments'].value_counts()

0       1375
2        923
1        650
3        307
4        235
        ... 
162        1
2207       1
154        1
2187       1
2043       1
Name: num_comments, Length: 336, dtype: int64

- most posts have 0 comments

In [11]:
science_df['num_crossposts'].value_counts()

0     4962
1       28
2        3
23       1
11       1
7        1
3        1
16       1
8        1
4        1
Name: num_crossposts, dtype: int64

- most posts have 0 crossposts

In [12]:
science_df['total_awards_received'].value_counts()

0     4983
1        5
2        3
4        2
79       1
31       1
30       1
33       1
13       1
5        1
32       1
Name: total_awards_received, dtype: int64

- most posts have 0 total awards

In [13]:
science_df['score'].value_counts()

1       4607
2         65
0         49
4         43
3         43
        ... 
489        1
1031       1
273        1
205        1
2428       1
Name: score, Length: 78, dtype: int64

- most posts have a score of 1

# Technology Data Cleaning

In [14]:
technology_df.head(3)

Unnamed: 0.1,Unnamed: 0,subreddit,title,selftext,author,created_utc,upvote_ratio,full_link,num_comments,num_crossposts,total_awards_received,score,over_18
0,0,technology,samsung galaxy s12 ultra 5g,,TechGreed123,1610798204,1.0,https://www.reddit.com/r/technology/comments/k...,0,0,0,1,
1,1,technology,Top 10 Internet of Things (IoT) Books to Read ...,,globaltechoutlook,1610797957,1.0,https://www.reddit.com/r/technology/comments/k...,0,0,0,1,
2,2,technology,Message de test vraie hein,,groupsaada,1610797574,1.0,https://www.reddit.com/r/technology/comments/k...,0,0,0,1,


### Null Values

In [15]:
technology_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             5000 non-null   int64  
 1   subreddit              5000 non-null   object 
 2   title                  5000 non-null   object 
 3   selftext               0 non-null      float64
 4   author                 5000 non-null   object 
 5   created_utc            5000 non-null   int64  
 6   upvote_ratio           5000 non-null   float64
 7   full_link              5000 non-null   object 
 8   num_comments           5000 non-null   int64  
 9   num_crossposts         5000 non-null   int64  
 10  total_awards_received  5000 non-null   int64  
 11  score                  5000 non-null   int64  
 12  over_18                0 non-null      float64
dtypes: float64(3), int64(6), object(4)
memory usage: 507.9+ KB


- Selftext and over_18 have all null values

### Drop Columns

In [16]:
technology_df.drop(columns = 'Unnamed: 0', inplace=True)
technology_df.drop(columns = 'over_18', inplace=True)
technology_df.drop(columns = 'selftext', inplace=True)

### Value Counts

In [17]:
technology_df['subreddit'].value_counts()

technology    5000
Name: subreddit, dtype: int64

In [18]:
technology_df['title'].value_counts()

Message de test vraie hein                                                             1451
Best Top 10 Cheap price laptops [2021]                                                   10
How To Buy A Laptop At Less Price….?                                                     10
strawberries on the blockchain                                                            6
Cash key loan customer care number☎️9572886345.all problem solved                         6
                                                                                       ... 
11 Apps Like Lucky Patcher For Download Paid Apps Free 2021                               1
HTC Desire 21 Pro 5G Snapdragon 690 launched with 5G processor, know other features       1
GET TECHNOLOGY                                                                            1
The KFConsole is real                                                                     1
Trump pushes for the development of nuclear energy for space                    

- Seems to be a large amount of test posts

In [19]:
technology_df['author'].value_counts()

groupsaada             1451
arnambiar               199
onlinepeepsofficial     126
EnterpriseNews_Elf      125
alexharry0003            94
                       ... 
wrestlerfan541            1
its-Loki                  1
obsd92107                 1
current-asscoverer        1
techfreakzz               1
Name: author, Length: 1585, dtype: int64

- A few authors have a lot of submitted posts

In [20]:
technology_df['upvote_ratio'].value_counts()

1.00    4875
0.67      31
0.50      21
0.75      18
0.80      14
0.60       5
0.86       5
0.33       4
0.83       4
0.88       4
0.89       3
0.90       2
0.57       2
0.25       2
0.40       2
0.96       1
0.43       1
0.99       1
0.63       1
0.95       1
0.91       1
0.82       1
0.92       1
Name: upvote_ratio, dtype: int64

- most posts have upvote ratio of 1

In [21]:
technology_df['num_comments'].value_counts()

0      2962
2      1125
1       167
3       136
4        92
       ... 
118       1
122       1
126       1
154       1
985       1
Name: num_comments, Length: 152, dtype: int64

- most posts have 0-2 comments

In [22]:
technology_df['num_crossposts'].value_counts()

0    4987
1      12
2       1
Name: num_crossposts, dtype: int64

- most posts have 0 crossposts

In [23]:
technology_df['total_awards_received'].value_counts()

0    4997
1       3
Name: total_awards_received, dtype: int64

- most posts have 0 awards received

In [24]:
technology_df['score'].value_counts()

1     4749
2       98
3       44
0       30
4       28
5       18
7        9
6        7
8        5
10       3
12       2
17       2
28       1
13       1
11       1
14       1
47       1
Name: score, dtype: int64

- most posts have score of 1

In [25]:
subreddits_df = pd.concat(objs=[science_df, technology_df], axis=0)
subreddits_df.head()

Unnamed: 0,subreddit,title,author,created_utc,upvote_ratio,full_link,num_comments,num_crossposts,total_awards_received,score
0,science,A Climate in Crisis Calls for Investment in Di...,Wagamaga,1610798299,1.0,https://www.reddit.com/r/science/comments/kyhp...,6,0,0,1
1,science,Cureus | Telemedicine: Current Impact on the F...,CureusJournal,1610798085,1.0,https://www.reddit.com/r/science/comments/kyho...,0,0,0,1
2,science,Magnetic reconnection as a mechanism for energ...,m3prx,1610792298,1.0,https://www.reddit.com/r/science/comments/kygi...,2,0,0,1
3,science,Sprinkle of chili compound boosts perovskite s...,MistWeaver80,1610779333,1.0,https://www.reddit.com/r/science/comments/kydw...,2,0,0,1
4,science,Ancient Roman concrete mineral found strengthe...,MistWeaver80,1610778779,1.0,https://www.reddit.com/r/science/comments/kyds...,1,0,0,1


In [26]:
subreddits_df

Unnamed: 0,subreddit,title,author,created_utc,upvote_ratio,full_link,num_comments,num_crossposts,total_awards_received,score
0,science,A Climate in Crisis Calls for Investment in Di...,Wagamaga,1610798299,1.0,https://www.reddit.com/r/science/comments/kyhp...,6,0,0,1
1,science,Cureus | Telemedicine: Current Impact on the F...,CureusJournal,1610798085,1.0,https://www.reddit.com/r/science/comments/kyho...,0,0,0,1
2,science,Magnetic reconnection as a mechanism for energ...,m3prx,1610792298,1.0,https://www.reddit.com/r/science/comments/kygi...,2,0,0,1
3,science,Sprinkle of chili compound boosts perovskite s...,MistWeaver80,1610779333,1.0,https://www.reddit.com/r/science/comments/kydw...,2,0,0,1
4,science,Ancient Roman concrete mineral found strengthe...,MistWeaver80,1610778779,1.0,https://www.reddit.com/r/science/comments/kyds...,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
4995,technology,Message de test vraie hein,groupsaada,1608547204,1.0,https://www.reddit.com/r/technology/comments/k...,0,0,0,1
4996,technology,Huawei Mate 40 Pro + conquers the top of the D...,onlinepeepsofficial,1608547065,1.0,https://www.reddit.com/r/technology/comments/k...,0,0,0,1
4997,technology,Remote desktop according to Lenovo: the import...,onlinepeepsofficial,1608547064,1.0,https://www.reddit.com/r/technology/comments/k...,0,0,0,1
4998,technology,"realme X3 Super Zoom, BOSE NC headphones and X...",onlinepeepsofficial,1608547063,1.0,https://www.reddit.com/r/technology/comments/k...,0,0,0,1


### Drop duplicates

In [27]:
subreddits_df = subreddits_df.drop_duplicates(subset=['title'])

In [28]:
subreddits_df.shape

(7976, 10)

In [29]:
science_df = science_df.drop_duplicates(subset=['title'])

In [30]:
science_df.shape

(4563, 10)

In [31]:
technology_df = technology_df.drop_duplicates(subset=['title'])

In [32]:
technology_df.shape

(3426, 10)

# Export Clean & Combined Datasets

In [33]:
science_df.to_csv('./data/reddit_science_clean.csv', index=False)

In [34]:
technology_df.to_csv('./data/reddit_technology_clean.csv', index=False)

In [35]:
subreddits_df.to_csv('./data/posts_combined_clean.csv', index=False)