# Objective
- Merge both dataframes and change subreddit column to binary since it's our target variable 

In [1]:
import pandas as pd

In [2]:
#reading anxiety
anxiety = pd.read_csv('./datasets/unique_anxiety.csv')

In [3]:
#reading depression
depression = pd.read_csv('./datasets/unique_depression.csv')

### Let's merge selftext and title for Anxiety

In [4]:
anxiety.shape

(970, 5)

In [5]:
anxiety.head()

Unnamed: 0.1,Unnamed: 0,subreddit,selftext,title,created_utc
0,0,Anxiety,"Greetings /r/Anxiety,\n\n\nI apologize for the...",CBD Megathread (June 2019),1560125000.0
1,1,Anxiety,What have you accomplished this week? Share yo...,"Wednesday Wins - July 03, 2019",1562156000.0
2,2,Anxiety,I GOT AN OFFER!!!!!!\n\nI went back to pessimi...,[UPDATE] after almost canceling a job intervie...,1562108000.0
3,3,Anxiety,i have severe anxiety and it is really difficu...,i went to a bookstore and got a coffee by myse...,1562172000.0
4,4,Anxiety,This may seem like a small victory or just com...,I just managed to shift from a heart rate of 1...,1562172000.0


In [6]:
#lets gather selftext and title since we're only analyzing words 
anxiety['text'] = anxiety['selftext'] +  ' ' + anxiety['title']

In [7]:
anxiety['text'][0]

"Greetings /r/Anxiety,\n\n\nI apologize for the lack of a CBD megathread the past few months. We're going to start doing these again. A majority of you still wish to discuss CBD on the subreddit. Please keep in mind that we are a subreddit primarily for discussion and support for anxiety, and if you have certain questions that can't be answered here check out /r/CBD.\n\n\n\n\n\n\n\n\n**Ground Rules:**\n\n* Do not prescribe, or recommend CBD as a treatment. You should consult with your doctor about any treatment for a medical condition (i.e. anxiety and panic disorder). Discussion of experiences is fine, influencing others to use this substance with little research is not.\n* Do not promote brands, while discussion about your experience with certain brands will be allowed we do not want to see anyone pushing a brand regardless of intent.\n* Do not ask or tell where to buy. The legality of this substance is grey in many parts of the world. CBD Megathread (June 2019)"

### Let's merge selftext and title for Depression

In [8]:
depression.shape

(923, 5)

In [9]:
#lets gather selftext and title since we're only analyzing words 
depression['text'] = depression['selftext'] +  ' ' + depression['title']

In [10]:
depression['text'][0]

"We know that September 10 was World Suicide Prevention Day. And, we're all for activism. But not here, please and thank you. It takes focus away from our OPs in need of support and understanding.  Reminder: NO ACTIVISM is allowed here at any time."

In [11]:
depression.head()

Unnamed: 0.1,Unnamed: 0,subreddit,selftext,title,created_utc,text
0,0,depression,We know that September 10 was World Suicide Pr...,Reminder: NO ACTIVISM is allowed here at any t...,1536613000.0,We know that September 10 was World Suicide Pr...
1,1,depression,Welcome to /r/depression's check-in post - a p...,Regular Check-In Post,1549392000.0,Welcome to /r/depression's check-in post - a p...
2,2,depression,I’m going to the movies. I’m so nervous…,I’m willingly leaving my house for the first t...,1562121000.0,I’m going to the movies. I’m so nervous… I’m w...
3,3,depression,Now i can save so i can get myself out of this...,i finally got a job,1562151000.0,Now i can save so i can get myself out of this...
4,4,depression,I always do this. I’ll stay up until the wee h...,Does anyone else stay up super late to avoid b...,1562147000.0,I always do this. I’ll stay up until the wee h...


### For each dataframe, let's get `subreddit` and `text` in order to merge depression and anxiety

In [12]:
#depression
depression_features = ['subreddit', 'text']
depression_2 = depression[depression_features]
depression_2.head()

Unnamed: 0,subreddit,text
0,depression,We know that September 10 was World Suicide Pr...
1,depression,Welcome to /r/depression's check-in post - a p...
2,depression,I’m going to the movies. I’m so nervous… I’m w...
3,depression,Now i can save so i can get myself out of this...
4,depression,I always do this. I’ll stay up until the wee h...


In [13]:
depression_2.shape

(923, 2)

In [14]:
#anxiety
anxiety_features = ['subreddit', 'text']
anxiety_2 = anxiety[anxiety_features]
anxiety_2.head()

Unnamed: 0,subreddit,text
0,Anxiety,"Greetings /r/Anxiety,\n\n\nI apologize for the..."
1,Anxiety,What have you accomplished this week? Share yo...
2,Anxiety,I GOT AN OFFER!!!!!!\n\nI went back to pessimi...
3,Anxiety,i have severe anxiety and it is really difficu...
4,Anxiety,This may seem like a small victory or just com...


In [15]:
anxiety_2.shape

(970, 2)

## Time to merge both data frames

In [16]:
frames = [depression_2, anxiety_2]

df = pd.concat(frames)

In [17]:
df.shape

(1893, 2)

In [18]:
df.head()

Unnamed: 0,subreddit,text
0,depression,We know that September 10 was World Suicide Pr...
1,depression,Welcome to /r/depression's check-in post - a p...
2,depression,I’m going to the movies. I’m so nervous… I’m w...
3,depression,Now i can save so i can get myself out of this...
4,depression,I always do this. I’ll stay up until the wee h...


In [19]:
df.isnull().sum()

subreddit    0
text         2
dtype: int64

In [20]:
df.dropna(axis = 0, inplace = True)

In [21]:
df.shape

(1891, 2)

In [22]:
df.isnull().sum()

subreddit    0
text         0
dtype: int64

In [23]:
df['subreddit'].value_counts()

Anxiety       969
depression    922
Name: subreddit, dtype: int64

## Our df has non null values and contains both subreddits
- Let's make depression = 1 and anxiety = 0
- We'll export it as a csv called df.csv

In [24]:
df.head()

Unnamed: 0,subreddit,text
0,depression,We know that September 10 was World Suicide Pr...
1,depression,Welcome to /r/depression's check-in post - a p...
2,depression,I’m going to the movies. I’m so nervous… I’m w...
3,depression,Now i can save so i can get myself out of this...
4,depression,I always do this. I’ll stay up until the wee h...


In [25]:
#changing sbreddit column to binary variable 
df['subreddit'] = df['subreddit'].map({'depression': 1, 'Anxiety': 0})

In [26]:
df.head()

Unnamed: 0,subreddit,text
0,1,We know that September 10 was World Suicide Pr...
1,1,Welcome to /r/depression's check-in post - a p...
2,1,I’m going to the movies. I’m so nervous… I’m w...
3,1,Now i can save so i can get myself out of this...
4,1,I always do this. I’ll stay up until the wee h...


In [27]:
df['subreddit'].value_counts()

0    969
1    922
Name: subreddit, dtype: int64

In [28]:
df.to_csv('./datasets/df.csv')