# Cleaning

## Imports

In [1]:
import os
import pandas as pd
import time

## Loading the data

In [2]:
# Load the raw Reddit export into memory.
# low_memory=False: per the pandas docs this parses the file in one pass instead
# of internal chunks, which avoids mixed-dtype inference on wide files.
df = pd.read_csv(
    filepath_or_buffer='../data/reddit_raw.csv',
    low_memory=False,
)

In [3]:
# Preview the first rows of the raw frame to sanity-check the load.
df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,upvote_ratio,suggested_sort,link_flair_css_class,banned_by,gilded,steward_reports,event_end,event_is_live,event_start,discussion_type
0,[],False,rfwaverider,,[],,text,t2_x95jn,False,False,...,,,,,,,,,,
1,[],False,rb30zk,,[],,text,t2_55zy27j9,False,False,...,,,,,,,,,,
2,[],False,rb30zk,,[],,text,t2_55zy27j9,False,False,...,,,,,,,,,,
3,[],False,rb30zk,,[],,text,t2_55zy27j9,False,False,...,,,,,,,,,,
4,[],False,rb30zk,,[],,text,t2_55zy27j9,False,False,...,,,,,,,,,,


In [4]:
# (rows, columns) of the raw frame, before any column or row filtering.
df.shape

(209994, 87)

## Cleaning the raw dataframe

In [5]:
# Narrow the frame to the columns that may matter downstream; every other
# raw column is dropped by rebinding `df` to the selection.
features = [
    'subreddit',
    'subreddit_subscribers',
    'title',
    'selftext',
    'score',
    'num_comments',
    'author',
    'created_utc',
]
df = df.loc[:, features]

In [6]:
# Select only the subreddits related to geographic regions
subreddits = [
    'CoronavirusUS',
    'CoronavirusCA',
    'CoronavirusWA',
    'CoronaVirusTX',
    'CoronavirusNewYork',
    'CoronavirusGA',
    'FloridaCoronavirus',
    'CoronavirusMichigan',
    'CoronavirusColorado',
    'CoronavirusMa',
    'CoronavirusAlabama',
    'CoronavirusIllinois',
    'Coronaviruslouisiana',
    'CoronaVirusPA',
    'coronavirusSC',
    'coronavirusVA',
    'CoronaVirusWV',
    'CoronavirusNJ'
]
# Matching is exact and case-sensitive, so the names above must keep the
# capitalisation used in the 'subreddit' column.
# .copy() makes the filtered rows an independent frame: a later cell adds a
# column to `df`, and assigning into a slice of another frame can raise
# pandas' SettingWithCopyWarning.
df = df.loc[df['subreddit'].isin(subreddits), :].copy()

In [7]:
# Convert utc column to regular date format
def get_date_from_utc(utc_time):
    """Return the UTC calendar date of a Unix timestamp as a 'Y-M-D' string.

    The timestamp is interpreted in UTC (``time.gmtime``), not in the local
    timezone of the machine running the notebook, so the same input gives the
    same date everywhere.

    Parameters
    ----------
    utc_time : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    str
        Year, month and day joined by '-', without zero padding
        (e.g. '2020-3-15').
    """
    utc_struct = time.gmtime(utc_time)
    return f'{utc_struct.tm_year}-{utc_struct.tm_mon}-{utc_struct.tm_mday}'

# Build the 'date' column in two steps: turn each epoch value in 'created_utc'
# into a date string, then parse those strings into datetime64 values.
date_strings = df['created_utc'].map(get_date_from_utc)
df['date'] = pd.to_datetime(date_strings)

In [10]:
# Inspect the column dtypes of the cleaned frame; 'date' should be datetime64[ns].
df.dtypes

subreddit                        object
subreddit_subscribers             int64
title                            object
selftext                         object
score                             int64
num_comments                      int64
author                           object
created_utc                       int64
date                     datetime64[ns]
dtype: object

## Exporting the clean data

In [8]:
# Make sure the output directory exists before writing the cleaned CSV.
# exist_ok=True makes this a no-op when the directory is already present, but
# still raises if '../data' exists and is not a directory, instead of hiding
# that until the write fails.
os.makedirs('../data', exist_ok=True)

In [9]:
# Write the cleaned frame next to the raw data. index=False leaves the row
# index out of the file so it does not come back as an extra unnamed column.
df.to_csv('../data/reddit_clean.csv', index=False)