In [None]:
# Import the libraries needed for data cleaning.
import pandas as pd

In [2]:
# Load the dataset from CSV, then show its dimensions and first few rows.
source_path = '../datasets/df.csv'
df = pd.read_csv(source_path)
print(df.shape)
df.head()

(39633, 5)


Unnamed: 0,subreddit,author,domain,title,selftext
0,Anger,RIPplzHelpMeRN,self.Anger,I Have Anger 'Problems' Apparently,"In my opinion it is just my natural feelings, ..."
1,Anger,aboowwabooww,self.Anger,I need help,I have undiagnosed and untreated adhd since 15...
2,Anger,69andeverything,self.Anger,How can someone who doesn't get angry help my ...,"I (17F) never get angry, it's something I find..."
3,Anger,mailception,self.Anger,how I make it stop ? anyone please ?,All I can explain is a deep rooted anger I fee...
4,Anger,lemonsandrosemary,self.Anger,Shattered a Window Today,"Just like the title says. Live on my own, Fath..."


In [3]:
# Checking for missing cells: info() lists each column's non-null count and dtype,
# so any column with fewer non-null entries than total rows contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39633 entries, 0 to 39632
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   subreddit  39633 non-null  object
 1   author     39633 non-null  object
 2   domain     39631 non-null  object
 3   title      39633 non-null  object
 4   selftext   19891 non-null  object
dtypes: object(5)
memory usage: 1.5+ MB


Quite a number of rows have an empty selftext (only 19,891 of the 39,633 rows are non-null). I will combine title with selftext so that they can be analysed together.

In [6]:
# Some cells are null (notably in selftext).
# Replace every null with an empty string so that title and selftext can later be
# concatenated into a single column.
df = df.fillna('')

In [7]:
# Combining title and selftext into a single 'post' column.
# - fillna('') is applied before astype(str) so that a missing selftext can never be
#   stringified into the literal text 'nan'.
# - str.strip() removes the separator space left dangling when selftext is empty (and any
#   other leading/trailing whitespace), so it does not inflate a later length count.
df['post'] = (df['title'] + " " + df['selftext'].fillna('').astype(str)).str.strip()

df.head()

Unnamed: 0,subreddit,author,domain,title,selftext,post
0,Anger,RIPplzHelpMeRN,self.Anger,I Have Anger 'Problems' Apparently,"In my opinion it is just my natural feelings, ...",I Have Anger 'Problems' Apparently In my opini...
1,Anger,aboowwabooww,self.Anger,I need help,I have undiagnosed and untreated adhd since 15...,I need help I have undiagnosed and untreated a...
2,Anger,69andeverything,self.Anger,How can someone who doesn't get angry help my ...,"I (17F) never get angry, it's something I find...",How can someone who doesn't get angry help my ...
3,Anger,mailception,self.Anger,how I make it stop ? anyone please ?,All I can explain is a deep rooted anger I fee...,how I make it stop ? anyone please ? All I can...
4,Anger,lemonsandrosemary,self.Anger,Shattered a Window Today,"Just like the title says. Live on my own, Fath...",Shattered a Window Today Just like the title s...


In [8]:
# Count the remaining null cells per column; every count should be zero.
df.isnull().sum()

subreddit    0
author       0
domain       0
title        0
selftext     0
post         0
dtype: int64

In [9]:
# List the distinct values that appear in the 'subreddit' column.
pd.unique(df['subreddit'])

array(['Anger', '30473', '29535', 'exasperations', 'rage', 'vex',
       'disgusting', 'Cum flavoured wings', 'Licking A Cow Patty',
       'awfuleverything', 'Yuck', 'fear',
       'Клаустрофобия | как избавиться от клаустрофобии | боязнь замкнутого пространства - YouTube',
       'психоанализ | психология личности | личность человек психология - YouTube',
       'This game seems super spooky!',
       'бессонница | как бороться с бессонницей | как избавиться от бессонницы | причины бессонницы здоровье - YouTube',
       'как избавиться от стресса | борьба со стрессом | здоровье стрессоустойчивость - YouTube',
       'EXPLORING A SCARY CONSTRUCTION SITE (SCREAMING)!', 'horror',
       '2538189', '2537477', '2536670', '2534932', '2534332', '2532252',
       'panicdisorder', 'dread', 'bipolar', 'Joy', 'happiness', 'happy',
       '436768', '433104', '433100', 'Appreciation',
       'Has anyone else found that, during a depressive episode, you tend to appreciate your environmental surrou

In [10]:
# Will only take the rows with the correctly labelled 'subreddit' names.
# The 'subreddit' column also holds values that are not subreddit names (numeric IDs,
# video titles, post text), so an explicit allow-list is used.
emotions = ['Anger', 'exasperations', 'rage', 'vex',
            'disgusting', 'awfuleverything', 'Yuck',
            'fear', 'horror', 'panicdisorder', 'dread', 'bipolar',
            'Joy', 'happiness', 'happy', 'Appreciation', 'ThankYou',
            'zen', 'ZenHabits', 'calm', 'Meditation',
            'sad', 'Sadness', 'depression',
            'Surprise', 'Unexpected', 'Astonishing', 'Amazing', 'ShockingReality']

# Dropping the rows where the 'subreddit' rows are not those listed in emotions.
# .copy() makes the result an independent frame, so adding columns to it afterwards does
# not raise SettingWithCopyWarning.
df = df[df['subreddit'].isin(emotions)].copy()

In [14]:
# Store the number of characters in each post as a new 'post_length' column.
post_lengths = df['post'].str.len()
df['post_length'] = post_lengths

In [15]:
# Assign each subreddit to one of seven emotion classes (the original note attributes
# these to "Eckhart's 7 emotional states").
# NOTE(review): possibly Ekman's basic emotions were meant — confirm the attribution.
emotion_groups = {
    'Anger': ['Anger', 'exasperations', 'rage', 'vex'],
    'Disgust': ['disgusting', 'awfuleverything', 'Yuck'],
    'Fear': ['fear', 'horror', 'panicdisorder', 'dread', 'bipolar'],
    'Joy': ['Joy', 'happiness', 'happy', 'Appreciation', 'ThankYou'],
    'Neutral': ['zen', 'ZenHabits', 'calm', 'Meditation'],
    'Sadness': ['sad', 'Sadness', 'depression'],
    'Surprise': ['Surprise', 'Unexpected', 'Astonishing', 'Amazing', 'ShockingReality'],
}

# Invert the grouping into a flat subreddit -> emotion lookup for Series.map.
subreddit_to_emotion = {
    subreddit: emotion
    for emotion, subreddits in emotion_groups.items()
    for subreddit in subreddits
}
df['emotion'] = df['subreddit'].map(subreddit_to_emotion)

In [20]:
# Restrict the frame to the four columns that are still needed, in this order.
kept_columns = ['emotion', 'author', 'post', 'post_length']
df = df.loc[:, kept_columns]

In [21]:
# Write the frame to CSV, leaving out the index.
cleaned_path = '../datasets/cleaned_df.csv'
df.to_csv(cleaned_path, index=False)