In [1]:
# import libraries

import pandas as pd
import numpy as np

import os
from datetime import datetime 

### 1. Read in the datasets from the `downloaded` folder
### 2. Drop the unnecessary columns
> For this project's purpose, I decided to use the TF-IDF features (`TfidfVectorizer`). `CountVectorizer` only counts how often each word is used, whereas `TfidfVectorizer` also weighs the importance of each word. For this reason, I think the TF-IDF features will give me insight into how the usage of the same words has changed over time, which will be useful for the trend analysis, so I will keep the columns with this information.
### 3. Concatenate the datasets by subreddit
### 4. Save to CSV file

In [2]:
# the files are named similarly, so create a function to iterate through the files in the folder and create a dataset for each subreddit

# subreddit names; each one is the filename prefix of its pre/post feature files
subr = [
    'alcoholism',
    'suicidewatch',
    'mentalhealth',
    'lonely',
    'healthanxiety',
    'depression',
    'bpd',
    'bipolarreddit',
    'anxiety',
]

def generate_df(ls, src_dir = '../data/downloaded', dst_dir = '../data/cleaned'):
    """Build one cleaned CSV per subreddit from its pre/post feature files.

    For every name in `ls`, reads `<src_dir>/<name>_pre_features_tfidf_256.csv` and
    `<src_dir>/<name>_post_features_tfidf_256.csv`, stacks them (pre rows first), drops the
    columns at positions 4-93, and writes the result to `<dst_dir>/<name>.csv` without the
    row index.

    Parameters
    ----------
    ls : iterable of str
        Subreddit names used as filename prefixes.
    src_dir : str, default '../data/downloaded'
        Folder holding the downloaded feature files. The default is relative to the
        notebook's working directory, the same convention the later cells use for
        '../data/cleaned/'.
    dst_dir : str, default '../data/cleaned'
        Folder the cleaned files are written to; created if it does not exist.

    Returns
    -------
    None
        The function is used only for its side effect of writing the CSV files.
    """
    # make sure the output folder exists so to_csv does not fail on a fresh checkout
    os.makedirs(dst_dir, exist_ok = True)

    for element in ls:

        pre_file = os.path.join(src_dir, f'{element}_pre_features_tfidf_256.csv')
        post_file = os.path.join(src_dir, f'{element}_post_features_tfidf_256.csv')

        # combine pre and post
        combined = pd.concat([pd.read_csv(pre_file), pd.read_csv(post_file)])

        # drop the columns that I will not be using
        # -> mostly scores that are not reproducible on my own, including Linguistic Inquiry Word Counts (liwc)
        # keep tf-idf vectorized columns to reduce processing time due to the size of the datasets
        # positions 4-93 are removed: the first four columns and everything from position 94 on stay
        unused_columns = list(combined.columns[4:94])
        cleaned = combined.drop(columns = unused_columns)

        # store the datasets to csv files
        cleaned.to_csv(os.path.join(dst_dir, f'{element}.csv'), index = False)

    return

In [3]:
# call the function: writes one cleaned csv file per subreddit listed in `subr`

generate_df(subr)

In [4]:
# check the data: load the cleaned bpd file back in to confirm it was written as expected
# (relative path, same '../data/cleaned/' convention used when the files are combined)

df = pd.read_csv('../data/cleaned/bpd.csv')

In [5]:
# preview the first rows of the loaded dataset
df.head()

Unnamed: 0,subreddit,author,date,post,tfidf_abl,tfidf_abus,tfidf_actual,tfidf_addict,tfidf_adhd,tfidf_advic,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
0,bpd,madamegeoffrin,2019/03/14,"Being Alone Hi all,\n\nI live alone and am hav...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bpd,chikkinnugget,2019/03/14,I don’t know what to do anymore. I have been w...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.102024,0.0,0.0,0.0,0.0,0.0,0.090246
2,bpd,Nonsense_constance,2019/03/14,Relapsing. Can't stop obsessing over ex. I'm d...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.255244,0.178028,0.0,0.124529,0.0,0.0,0.0
3,bpd,sras5462,2019/03/14,I think I have Bpd and it scares me A little ...,0.0,0.0,0.0,0.0,0.0,0.130657,...,0.0,0.064444,0.0,0.04867,0.0,0.0,0.142472,0.073094,0.0,0.172206
4,bpd,TrAiLeRpArKgIrLl,2019/03/14,Does anyone else actually like themselves or a...,0.0,0.0,0.292891,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.213209,0.0,0.0,0.0


In [6]:
# count the missing values in each column
df.isnull().sum()

subreddit      0
author         0
date           0
post           0
tfidf_abl      0
              ..
tfidf_wors     0
tfidf_would    0
tfidf_wrong    0
tfidf_x200b    0
tfidf_year     0
Length: 260, dtype: int64

In [9]:
# check another dataset: load the cleaned lonely file
# (relative path, same '../data/cleaned/' convention used when the files are combined)

df = pd.read_csv('../data/cleaned/lonely.csv')

In [10]:
# preview the first rows of the loaded dataset
df.head()

Unnamed: 0,subreddit,author,date,post,tfidf_abl,tfidf_abus,tfidf_actual,tfidf_addict,tfidf_adhd,tfidf_advic,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
0,lonely,mord0r,2019/09/28,Having a bad day. I have been on a mania high ...,0.0,0.0,0.225583,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,lonely,d1ng_d0ngz,2019/09/28,looking for some chatting buddies might be a l...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,lonely,trynabhappie,2019/09/28,The flame is going out little by little! 😊 I o...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125035,0.0,0.073644
3,lonely,Mutantti2,2019/09/28,Things wrong with my life I want to tell about...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.276474,0.096418,0.0,0.134887,0.103804,0.0,0.061139
4,lonely,LegalCelery,2019/09/28,I want to fall in love I dont really believe i...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# build the path of every cleaned per-subreddit csv so they can all be combined

path = '../data/cleaned/'
files = [f'{path}{name}.csv' for name in subr]

files

['../data/cleaned/alcoholism.csv',
 '../data/cleaned/suicidewatch.csv',
 '../data/cleaned/mentalhealth.csv',
 '../data/cleaned/lonely.csv',
 '../data/cleaned/healthanxiety.csv',
 '../data/cleaned/depression.csv',
 '../data/cleaned/bpd.csv',
 '../data/cleaned/bipolarreddit.csv',
 '../data/cleaned/anxiety.csv']

In [17]:
# concat: read every per-subreddit csv and stack them row-wise,
# renumbering the rows so the combined index runs continuously

df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in files],
    axis = 0,
    ignore_index = True,
)

In [18]:
# check the combined dataset: print its (rows, columns) shape, then preview the first rows

print(df.shape)
df.head()

(220671, 260)


Unnamed: 0,subreddit,author,date,post,tfidf_abl,tfidf_abus,tfidf_actual,tfidf_addict,tfidf_adhd,tfidf_advic,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
0,alcoholism,RobynTacoo,2019/10/19,My husband offered me a glass of wine I turned...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.479482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,alcoholism,wolsinyourarea,2019/10/19,Feeling like a failure Broke my 2 week streak ...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,alcoholism,glitterONeverything,2019/10/19,help! withdrawals are crazy!! I feel lije shit...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,alcoholism,Collector420,2019/10/19,God damn I (M17) wrote messages to almost 20 p...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,alcoholism,engineerkoala,2019/10/19,How to have as much fun as before Today was my...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.337441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# store the combined dataset to a csv file (index = False leaves the row index out of the file)

df.to_csv('../data/cleaned/combined.csv', index = False)