# Cleaning Depression Related Data

## Importing Extracted Data 

In [1]:
import pandas as pd
import re

In [2]:
# Read the four scraped depression-related CSV exports, in a fixed order.
df_1, df_2, df_3, df_4 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "dep_sadness.csv",
        "dep_depressed.csv",
        "dep_loneliness.csv",
        "dep_depression.csv",
    )
)

# Stack the four frames row-wise; ignore_index gives the result a fresh
# 0..n-1 index instead of repeating each file's own row numbers.
df = pd.concat([df_1, df_2, df_3, df_4], axis='rows', ignore_index=True)

In [3]:
# Compare the raw row count with the number of distinct tweet ids to see how
# many rows are repeats.
unique_tweet_count = df['Tweet Id'].nunique()

print("Shape:", df.shape)
print("Unique tweets:", unique_tweet_count)

Shape: (16004, 8)
Unique tweets: 15717


In [4]:
# Keep only the first row for each "Tweet Id"; later repeats are removed.
df = df.drop_duplicates(subset="Tweet Id", keep="first")

print("Shape:", df.shape)

Shape: (15717, 8)


In [5]:
# Allow up to 100 characters per cell so more of each value is visible when
# frames are displayed.
pd.options.display.max_colwidth = 100

df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Username,Text,URL,Mention,Hashtags
0,0,2020-12-30 23:05:25+00:00,1344419347524165638,seesawlesbian,“bts is a trend” #bts #btsarmy #btsbts #hashtag #kpop #kpophashtag #btsisatrendhashtag #bighitim...,https://twitter.com/seesawlesbian/status/1344419347524165638,,"['bts', 'btsarmy', 'btsbts', 'hashtag', 'kpop', 'kpophashtag', 'btsisatrendhashtag', 'bighitimgo..."
1,1,2020-12-30 22:51:23+00:00,1344415815249113090,That_Guy_Crash,What it feels like to lose in Mario Kart #mariokart #mariokart8 #loser #lost #sad #sadness #twit...,https://twitter.com/That_Guy_Crash/status/1344415815249113090,,"['mariokart', 'mariokart8', 'loser', 'lost', 'sad', 'sadness', 'twitch', 'twitchtv', 'twitchclip..."
2,2,2020-12-30 22:48:18+00:00,1344415040653422593,MIDAGEDRUNNER,@Lunker58Steele #sadness,https://twitter.com/MIDAGEDRUNNER/status/1344415040653422593,"[User(username='Lunker58Steele', id=707050254, displayname='William or Mike', description=None, ...",['sadness']
3,3,2020-12-30 22:39:13+00:00,1344412753314902022,MBCharacter,https://t.co/X0T941sg7a\n#help #covid_19 #isolation #sadness #grief #mbc #hope #difference #yout...,https://twitter.com/MBCharacter/status/1344412753314902022,,"['help', 'covid_19', 'isolation', 'sadness', 'grief', 'mbc', 'hope', 'difference', 'youth', 'sel..."
4,4,2020-12-30 21:58:28+00:00,1344402501307564032,rebeccajchaney,"Like everyone, I have so many memories from this year. One stands out for me: the loss of a dear...",https://twitter.com/rebeccajchaney/status/1344402501307564032,"[User(username='rebeccajchaney', id=2842089636, displayname='Rebecca Chaney', description=None, ...","['memory', 'reflection', 'Reflection2020', 'cancer', 'sadness', 'love', 'hope', 'WritingCommunity']"


In [6]:
# The "URL" column holds each tweet's own permalink, not links embedded in the
# tweet body, so the column is removed here. Links inside a tweet have to be
# detected from the "Text" column instead.
df = df.drop(columns=['URL'])

print("Shape:", df.shape)

Shape: (15717, 7)


## Filtering Data

### 1- Remove entries containing urls, they may be promotional tweets.

In [7]:
# Substrings that mark a link inside the tweet text. `s` keeps its name
# because the non-depression URL filter further down reuses it.
s = ['https:', 'Https:']

# The joined markers form a regex alternation; rows matching either one are
# discarded.
has_link = df['Text'].str.contains('|'.join(s))
df = df[~has_link]
print("Shape:", df.shape)

Shape: (6107, 7)


### 2- Remove entries with @-mentions.

In [8]:
# Keep only the rows whose "Mention" value is missing.
# NOTE(review): a kept row can still contain an @handle in "Text" (see index
# 36 in the preview that follows) -- the filter relies solely on the scraped
# "Mention" column.
#
# The explicit .copy() makes the result an independent frame. Later cells
# assign new columns to `df`; doing that on a frame produced by chained
# boolean filtering raises pandas' SettingWithCopyWarning.
df = df.loc[df['Mention'].isna()].copy()
df.shape

(3854, 7)

In [9]:
# Inspect the first 15 rows that survived the URL and mention filters.
df.head(n=15)

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Username,Text,Mention,Hashtags
13,13,2020-12-30 19:48:33+00:00,1344369806825451525,ChristyRoxy,"À force de trop ouvrir son cœur au gens qui ne nous considèrent pas, dire toute ses pensées et s...",,"['BetrayalAct', 'betrayal', 'sadness']"
23,23,2020-12-30 17:30:10+00:00,1344334980252655616,nobleregulus,not a single comment on my recent fic yet.. 😖😣😔 #sadness,,['sadness']
24,24,2020-12-30 16:37:20+00:00,1344321684149391360,rex_seem,Se me quebró tan feo la pantalla del tel que no puedo ver sus tristes twits ..\n#sad #sadtwits #...,,"['sad', 'sadtwits', 'sadness', 'enigma']"
28,28,2020-12-30 15:28:41+00:00,1344304406536269825,mariasophiemegn,its so sad when u take down all the christmas decorations \n#sad #sadness #Christmas #christmasi...,,"['sad', 'sadness', 'Christmas', 'christmasisover']"
30,30,2020-12-30 14:05:12+00:00,1344283398177779714,ClosetsWidows,"If love could have saved you, I know you would still be here with me.\n#grief #sadness #widowed...",,"['grief', 'sadness', 'widowed', 'widows', 'death', 'loss', 'bereaved', 'love', 'losingalovedone'..."
33,33,2020-12-30 12:42:32+00:00,1344262594694172672,BookSnip,"too much happiness always overflowed into tears of sorrow.\n\n-Amy Tan, The Hundred Secret Sense...",,"['happiness', 'sadness', 'crying']"
36,36,2020-12-30 11:48:49+00:00,1344249078398345218,gotkashi,@erenyeagrr #PAIN #SADNESS #SORROW,,"['PAIN', 'SADNESS', 'SORROW']"
44,44,2020-12-30 06:55:25+00:00,1344175239358341120,Isla_Plastic18,Sometimes it takes a little bit of sadness to know what happiness is. #Islabot #Sadness #Happiness,,"['Islabot', 'Sadness', 'Happiness']"
46,46,2020-12-30 05:49:04+00:00,1344158544484941824,linzmwilliams,Sad that friendships are fractured because of the isolation of COVID. How do you help your child...,,"['friendship', 'sadness', 'COVID19', 'scared']"
49,49,2020-12-30 04:55:55+00:00,1344145166165131266,ariadnedreams,I have an IV drip of sorrow \nI keep thinking I’ve run out\nBut my eyes keep refilling the drip\...,,"['poem', 'sadness']"


## Make the tweet column hashtag-free.

In [10]:
# Delete every "#" that is followed by one or more word characters. Only the
# tag itself is removed; surrounding whitespace and punctuation are left as is.
hashtag_re = re.compile(r'#\w+')
df["Text"] = df["Text"].apply(lambda tweet: hashtag_re.sub('', tweet))

In [11]:
# Label every remaining row as a depression example (1) and discard the
# scraper metadata columns, leaving only "Text" and "Depression".
metadata_columns = ['Tweet Id', 'Mention', 'Hashtags', 'Username', 'Datetime', 'Unnamed: 0']
df = df.assign(Depression=1).drop(columns=metadata_columns)
print(df.head(5))

                                                                                                   Text  \
13  À force de trop ouvrir son cœur au gens qui ne nous considèrent pas, dire toute ses pensées et s...   
23                                                     not a single comment on my recent fic yet.. 😖😣😔    
24                  Se me quebró tan feo la pantalla del tel que no puedo ver sus tristes twits ..\n      
28                                      its so sad when u take down all the christmas decorations \n      
30                    If love could have saved you, I know you would still be here with me.\n             

    Depression  
13           1  
23           1  
24           1  
28           1  
30           1  


In [12]:
# Write the labelled depression tweets to disk. The row index is written too
# (pandas' default), so the file's first column holds the original row numbers.
df.to_csv("Depressed.csv", index=True)

# Cleaning Non-Depression Data

## Importing Extracted Data 

In [13]:
# Read the four scraped non-depression CSV exports, in a fixed order.
# df_1..df_4 are rebound here and now refer to the non-depression files.
df_1, df_2, df_3, df_4 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "non_dep_happy.csv",
        "non_dep_selflove.csv",
        "non_dep_positivevibes.csv",
        "non_dep_inspiration.csv",
    )
)

# Stack the four frames row-wise under a fresh 0..n-1 index.
df_non = pd.concat([df_1, df_2, df_3, df_4], axis='rows', ignore_index=True)

In [14]:
# Distinct "Tweet Id" values versus total rows shows how many rows are repeats.
unique_tweet_count = df_non['Tweet Id'].nunique()

print("Shape:", df_non.shape)
print("Unique tweets:", unique_tweet_count)

Shape: (16004, 8)
Unique tweets: 15143


In [15]:
# Keep only the first row for each "Tweet Id"; later repeats are removed.
df_non = df_non.drop_duplicates(subset="Tweet Id", keep="first")

print("Shape:", df_non.shape)

Shape: (15143, 8)


In [16]:
# Allow up to 100 characters per cell so more of each value is visible when
# frames are displayed.
pd.options.display.max_colwidth = 100

print("Shape:", df_non.shape)
df_non.head(n=5)

Shape: (15143, 8)


Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Username,Text,URL,Mention,Hashtags
0,0,2020-12-30 23:59:58+00:00,1344433078320521217,txtraveltegal,#love #TFLers #instagood #tweegram #photooftheday #me #instamood #cute #iphonesia #summer #tbt ...,https://twitter.com/txtraveltegal/status/1344433078320521217,,"['love', 'TFLers', 'instagood', 'tweegram', 'photooftheday', 'me', 'instamood', 'cute', 'iphones..."
1,1,2020-12-30 23:59:31+00:00,1344432963954438146,pr_deciel,あっという間に、今年最後の日＝大晦日となりました\n\nやり残したことはありませんか？\n\nデシェルの「福袋」買いましたか？\n\nたいへん、夢に出ちゃいますよｗ\n\n年末年始だけの数量限...,https://twitter.com/pr_deciel/status/1344432963954438146,,"['福袋', 'HAPPY', '限定', '2021年', '新春', '化粧品', '夢に出る', 'やり残し', '数量限定']"
2,2,2020-12-30 23:59:13+00:00,1344432887035256834,braintickler_,JOKE OF THE DAY : Whoever invented the knock-knock joke should get a no bell prize. #humor #meme...,https://twitter.com/braintickler_/status/1344432887035256834,,"['humor', 'memesdaily', 'rofl', 'jokeoftheday', 'funny', 'lol', 'lmao', 'happy', 'followme', 'lo..."
3,3,2020-12-30 23:59:07+00:00,1344432862381051904,PandaMartini,"Dear writing community, I want a #penpal. I have #stickers, a #cricut, and lots of #stationary. ...",https://twitter.com/PandaMartini/status/1344432862381051904,,"['penpal', 'stickers', 'cricut', 'stationary', 'cute', 'gifts', 'notgoingout', 'Lonely', 'spread..."
4,4,2020-12-30 23:57:15+00:00,1344432391683715072,Rhaulli_Panda,#merrychristmas🎄 #merryxmas #happy #newyear #happynewyear #happynewyear2021 #2021 #2021春婚 #dicie...,https://twitter.com/Rhaulli_Panda/status/1344432391683715072,,"['merrychristmas', 'merryxmas', 'happy', 'newyear', 'happynewyear', 'happynewyear2021', '2021春婚'..."


In [17]:
# "URL" is each tweet's own permalink rather than a link from the tweet body,
# so the column is removed before filtering.
df_non = df_non.drop(columns=['URL'])

print("Shape:", df_non.shape)

Shape: (15143, 7)


## Filtering Data

### 1- Remove entries containing urls, they may be promotional tweets.

In [18]:
# Substrings that mark a link inside the tweet text. They are defined here so
# this section does not depend on a variable created while cleaning the
# depression data, and still runs if that section is skipped or edited.
url_markers = ['https:', 'Https:']

# The joined markers form a regex alternation; rows matching either one are
# discarded.
has_link = df_non['Text'].str.contains('|'.join(url_markers))
df_non = df_non[~has_link]
print("Shape:", df_non.shape)

Shape: (3501, 7)


### 2- Remove entries with @-mentions.

In [19]:
# Keep only the rows whose "Mention" value is missing.
#
# The explicit .copy() makes the result an independent frame. Later cells
# assign new columns to `df_non`; doing that on a frame produced by chained
# boolean filtering raises pandas' SettingWithCopyWarning.
df_non = df_non.loc[df_non['Mention'].isna()].copy()
df_non.shape

(2789, 7)

In [20]:
# Inspect the first 6 rows that survived the URL and mention filters.
df_non.head(n=6)

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Username,Text,Mention,Hashtags
2,2,2020-12-30 23:59:13+00:00,1344432887035256834,braintickler_,JOKE OF THE DAY : Whoever invented the knock-knock joke should get a no bell prize. #humor #meme...,,"['humor', 'memesdaily', 'rofl', 'jokeoftheday', 'funny', 'lol', 'lmao', 'happy', 'followme', 'lo..."
3,3,2020-12-30 23:59:07+00:00,1344432862381051904,PandaMartini,"Dear writing community, I want a #penpal. I have #stickers, a #cricut, and lots of #stationary. ...",,"['penpal', 'stickers', 'cricut', 'stationary', 'cute', 'gifts', 'notgoingout', 'Lonely', 'spread..."
11,11,2020-12-30 23:50:02+00:00,1344430578674798594,ayatan0305,2021年楽しみすぎるな〜\n色んなことが起きて\n色んな感情にもなるし\n色んな人と出会うでしょう☺️\n\n自分を大切に.\n自分を信じて.\n自分に喜ぶ事をやろうね❤️\nフリーダムにシ...,,"['yoga', 'yogalife', 'happy']"
24,24,2020-12-30 23:42:58+00:00,1344428798624878592,ahmedsamiirr1,#Happy _new _year ♥♥,,['Happy']
27,27,2020-12-30 23:41:45+00:00,1344428491647889417,MasugzyoN,Some of us had to make it on our own #Happy New year Eve to all of us,,['Happy']
34,34,2020-12-30 23:32:33+00:00,1344426176408932354,baalians,#happy,,['happy']


## Make the tweet column hashtag-free.

In [21]:
# Delete every "#" that is followed by one or more word characters. Only the
# tag itself is removed; surrounding whitespace and punctuation are left as is.
hashtag_re = re.compile(r'#\w+')
df_non["Text"] = df_non["Text"].apply(lambda tweet: hashtag_re.sub('', tweet))

In [22]:
# Label every remaining row as a non-depression example (0) and discard the
# scraper metadata columns, leaving only "Text" and "Depression".
metadata_columns = ['Tweet Id', 'Mention', 'Hashtags', 'Username', 'Datetime', 'Unnamed: 0']
df_non = df_non.assign(Depression=0).drop(columns=metadata_columns)
df_non.head(n=5)

Unnamed: 0,Text,Depression
2,JOKE OF THE DAY : Whoever invented the knock-knock joke should get a no bell prize.,0
3,"Dear writing community, I want a . I have , a , and lots of . DM me? I also will send 🥰 Over 1...",0
11,2021年楽しみすぎるな〜\n色んなことが起きて\n色んな感情にもなるし\n色んな人と出会うでしょう☺️\n\n自分を大切に.\n自分を信じて.\n自分に喜ぶ事をやろうね❤️\nフリーダムにシ...,0
24,_new _year ♥♥,0
27,Some of us had to make it on our own New year Eve to all of us,0


In [23]:
# Write the labelled non-depression tweets to disk. The row index is written
# too (pandas' default), so the file's first column holds the original row numbers.
df_non.to_csv("NotDepressed.csv", index=True)