In [1]:
# import libraries

import pandas as pd
import numpy as np

import os
from datetime import datetime 

### 1. Read in the datasets from the `downloaded` folder
### 2. Drop the unnecessary columns
> For this project's purpose, I decided to use TfidfVectorizer. CountVectorizer only counts how often a word is used, whereas TfidfVectorizer also weighs the importance of each word. For this reason, I think TfidfVectorizer will give me insight into how the usage of the same words has changed over time, which should be useful for the trend analysis - I will be keeping the columns with this information.
### 3. Concatenate the datasets by subreddit
### 4. Save to CSV file

In [2]:
# peek at one of the downloaded files to see which columns it provides
sample_path = '../data/downloaded/alcoholism_post_features_tfidf_256.csv'
pd.read_csv(sample_path).head()

Unnamed: 0,subreddit,author,date,post,automated_readability_index,coleman_liau_index,flesch_kincaid_grade_level,flesch_reading_ease,gulpease_index,gunning_fog_index,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
0,alcoholism,glorybellpirate,2020/01/01,Day 1 of sobriety Feeling anxious and letting ...,4.275833,7.690396,4.71,73.168333,86.5,8.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,alcoholism,EhndlessSl0th,2020/01/01,"Started the New Year with a bang. Hey, I'm new...",1.541784,3.148868,2.806878,95.317853,78.593496,5.552354,...,0.0,0.071768,0.0,0.0,0.0,0.0,0.0,0.081401,0.0,0.143832
2,alcoholism,the_kinky_penguin,2020/01/01,Why can't I get drunk anymore I've been a heav...,-0.233695,1.917433,1.475911,99.238793,89.0,4.003941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.178032,0.0,0.0,0.16139
3,alcoholism,SauceoffSauceOn,2020/01/01,I am an Alcoholic. How do I quit? I have been ...,1.819821,4.628757,3.678036,80.90625,90.666667,6.909524,...,0.0,0.0,0.0,0.0,0.0,0.0,0.245538,0.0,0.0,0.111293
4,alcoholism,ben42187,2020/01/01,Funniest Thing about Alcoholism With every oth...,4.282,4.962135,5.108333,85.155,70.333333,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102762


In [5]:
# the files are named similarly, so create a function to iterate through the files in the folder and create a dataset

subr = ['alcoholism', 'suicidewatch', 'mentalhealth', 'lonely', 'healthanxiety', 'depression', 'bipolarreddit', 'anxiety']

def generate_df(ls, src_dir='../data/downloaded', dst_dir='../data/cleaned'):
    """Combine each subreddit's pre/post files and save only the text columns.

    For every name in ``ls`` this reads
    ``{src_dir}/{name}_pre_features_tfidf_256.csv`` and
    ``{src_dir}/{name}_post_features_tfidf_256.csv``, stacks them (pre rows
    first), keeps the ``subreddit``, ``author``, ``date`` and ``post``
    columns in that order, and writes the result to ``{dst_dir}/{name}.csv``
    without the index.

    Parameters
    ----------
    ls : iterable of str
        Subreddit names used to build the input and output file names.
    src_dir : str, optional
        Folder holding the downloaded ``*_pre_*`` / ``*_post_*`` files.
    dst_dir : str, optional
        Folder the per-subreddit CSV files are written to; it is created
        if it does not exist yet.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If one of the expected input files is missing.
    ValueError
        If an input file lacks one of the four required columns.
    """
    # drop the columns that I will not be using
    # -> mostly scores that are not reproducible on my own, including Linguistic Inquiry Word Counts (liwc)
    # using different word embedding and vectorizer --> remove tf-idf features
    keep_cols = ['subreddit', 'author', 'date', 'post']

    # to_csv does not create missing folders, so make sure the target exists
    os.makedirs(dst_dir, exist_ok=True)

    for element in ls:

        file1 = os.path.join(src_dir, f'{element}_pre_features_tfidf_256.csv')
        file2 = os.path.join(src_dir, f'{element}_post_features_tfidf_256.csv')

        # combine pre and post; usecols avoids loading the feature columns
        # that would be thrown away immediately afterwards
        df = pd.concat(
            [pd.read_csv(file, usecols=keep_cols) for file in (file1, file2)],
            ignore_index=True,
        )

        # usecols keeps the file's own column order, so fix the order explicitly
        df = df[keep_cols]

        # store the datasets to csv files
        df.to_csv(os.path.join(dst_dir, f'{element}.csv'), index=False)

In [6]:
# call the function to write one cleaned csv file per subreddit

generate_df(subr)

In [8]:
# check the data

df = pd.read_csv('../data/cleaned/alcoholism.csv')

In [9]:
df.head()

Unnamed: 0,subreddit,author,date,post
0,alcoholism,RobynTacoo,2019/10/19,My husband offered me a glass of wine I turned...
1,alcoholism,wolsinyourarea,2019/10/19,Feeling like a failure Broke my 2 week streak ...
2,alcoholism,glitterONeverything,2019/10/19,help! withdrawals are crazy!! I feel lije shit...
3,alcoholism,Collector420,2019/10/19,God damn I (M17) wrote messages to almost 20 p...
4,alcoholism,engineerkoala,2019/10/19,How to have as much fun as before Today was my...


In [10]:
df.isnull().sum()

subreddit    0
author       0
date         0
post         0
dtype: int64

In [11]:
# check another dataset

df = pd.read_csv('../data/cleaned/lonely.csv')

In [12]:
df.head()

Unnamed: 0,subreddit,author,date,post
0,lonely,mord0r,2019/09/28,Having a bad day. I have been on a mania high ...
1,lonely,d1ng_d0ngz,2019/09/28,looking for some chatting buddies might be a l...
2,lonely,trynabhappie,2019/09/28,The flame is going out little by little! 😊 I o...
3,lonely,Mutantti2,2019/09/28,Things wrong with my life I want to tell about...
4,lonely,LegalCelery,2019/09/28,I want to fall in love I dont really believe i...


In [13]:
# build the path of every cleaned per-subreddit csv so they can be combined

path = '../data/cleaned/'
files = [f'{path}{name}.csv' for name in subr]

files

['../data/cleaned/alcoholism.csv',
 '../data/cleaned/suicidewatch.csv',
 '../data/cleaned/mentalhealth.csv',
 '../data/cleaned/lonely.csv',
 '../data/cleaned/healthanxiety.csv',
 '../data/cleaned/depression.csv',
 '../data/cleaned/bipolarreddit.csv',
 '../data/cleaned/anxiety.csv']

In [14]:
# stack the per-subreddit files into one dataframe, renumbering the rows

frames = [pd.read_csv(file) for file in files]
df = pd.concat(frames, axis = 0, ignore_index = True)

In [15]:
# check the dataset again

print(df.shape)
df.head()

(203691, 4)


Unnamed: 0,subreddit,author,date,post
0,alcoholism,RobynTacoo,2019/10/19,My husband offered me a glass of wine I turned...
1,alcoholism,wolsinyourarea,2019/10/19,Feeling like a failure Broke my 2 week streak ...
2,alcoholism,glitterONeverything,2019/10/19,help! withdrawals are crazy!! I feel lije shit...
3,alcoholism,Collector420,2019/10/19,God damn I (M17) wrote messages to almost 20 p...
4,alcoholism,engineerkoala,2019/10/19,How to have as much fun as before Today was my...


In [17]:
# check the date column

df['date'].dtype #--> should be set to datetime field

dtype('O')

In [18]:
# parse the date strings into datetime values

parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [19]:
# check the date column

df['date'].dtype

dtype('<M8[ns]')

In [21]:
# keep only 2020
# (filtering on the year itself also excludes any later years, which an
#  open-ended "after 2019/12/31" comparison would let through)

df = df[df['date'].dt.year == 2020]

In [22]:
# check the date

df['date'].min()

Timestamp('2020-01-01 00:00:00')

In [23]:
# write the combined dataset to a csv file, leaving out the index

combined_path = '../data/cleaned/combined.csv'
df.to_csv(combined_path, index = False)