## Imports

In [1]:
import os
import pandas as pd

## Loading the data

In [2]:
reddit = pd.read_csv('../data/reddit_clean.csv')

In [3]:
us = pd.read_csv('../data/us.csv')
us.set_index('date', inplace=True)

In [4]:
states = pd.read_csv('../data/us-states.csv')
states.set_index('date', inplace=True)

## Amalgamate us data

In [5]:
# Create dataframe of only reddit posts from CoronavirusUS
reddit_us = reddit.loc[reddit['subreddit'] == 'CoronavirusUS', :]

# Add subreddit column to us dataframe
us['subreddit'] = 'CoronavirusUS'

# Add total number of subreddit subscribers by date to us dataframe
us = us.merge(reddit_us.groupby('date')[['subreddit_subscribers']].max(), 
              left_index=True,
              right_index=True)

# Add total number of subreddit posts by date to us dataframe
us = us.merge(reddit_us.groupby('date')[['title']].count(),
              left_index=True,
              right_index=True).rename(columns={'title': 'num_posts'})

# Add total number of subreddit comments by date to us dataframe
us = us.merge(reddit_us.groupby('date')[['num_comments']].sum(), 
              left_index=True,
              right_index=True)

# Add total score of subreddit posts by date to us dataframe
us = us.merge(reddit_us.groupby('date')[['score']].sum(), 
              left_index=True,
              right_index=True).rename(columns={'score': 'tot_score'})

## Amalgamate state data

In [6]:
# Create dict of states and their corresponding subreddits
states_and_subreddits = {
    'Pennsylvania': 'CoronaVirusPA',
    'South Carolina': 'coronavirusSC',
    'New Jersey': 'CoronavirusNJ',
    'Alabama': 'CoronavirusAlabama',
    'Virginia': 'coronavirusVA',
    'West Virginia': 'CoronaVirusWV',
    'California': 'CoronavirusCA',
    'Florida': 'FloridaCoronavirus',
    'New York': 'CoronavirusNewYork',
    'Texas': 'CoronaVirusTX',
    'Washington': 'CoronavirusWA',
    'Michigan': 'CoronavirusMichigan',
    'Colorado': 'CoronavirusColorado',
    'Illinois': 'CoronavirusIllinois',
    'Louisiana': 'Coronaviruslouisiana',
    'Massachusetts': 'CoronavirusMa',
    'Georgia': 'CoronavirusGA',
}

In [7]:
state_dataframes = []

for state in states_and_subreddits.keys():
    # Create dataframe of only reddit posts from the given state
    state_reddit = reddit.loc[reddit['subreddit'] == states_and_subreddits[state], :]

    # Create dataframe of only cases and deaths from the given state.
    state_df = states.loc[states['state'] == state, :]
    
    # Add subreddit column to state dataframe
    state_df['subreddit'] = states_and_subreddits[state]

    # Add total number of subreddit subscribers by date to state dataframe
    state_df = state_df.merge(state_reddit.groupby('date')[['subreddit_subscribers']].max(), 
                              left_index=True,
                              right_index=True)

    # Add total number of subreddit posts by date to state dataframe
    state_df = state_df.merge(state_reddit.groupby('date')[['title']].count(), 
                              left_index=True,
                              right_index=True).rename(columns={'title': 'num_posts'})

    # Add total number of subreddit comments by date to state dataframe
    state_df = state_df.merge(state_reddit.groupby('date')[['num_comments']].sum(), 
                              left_index=True,
                              right_index=True)

    # Add total score of subreddit posts by date to us dataframe
    state_df = state_df.merge(state_reddit.groupby('date')[['score']].sum(), 
                              left_index=True,
                              right_index=True).rename(columns={'score': 'tot_score'})
    
    state_dataframes.append(state_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [12]:
df = pd.concat(state_dataframes)

In [17]:
df[df['state'] == 'West Virginia'].tail()

Unnamed: 0_level_0,state,fips,cases,deaths,subreddit,subreddit_subscribers,num_posts,num_comments,tot_score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-04-30,West Virginia,54,1125,44,CoronaVirusWV,477,1,1,1
2020-05-04,West Virginia,54,1224,50,CoronaVirusWV,491,2,14,2
2020-05-05,West Virginia,54,1242,50,CoronaVirusWV,493,3,9,3
2020-05-06,West Virginia,54,1276,51,CoronaVirusWV,503,2,6,2
2020-05-07,West Virginia,54,1297,51,CoronaVirusWV,515,1,3,2


## Export data

In [8]:
try:
    os.mkdir('../data')
except FileExistsError:
    pass
finally:
    try:
        os.mkdir('../data/states')
    except FileExistsError:
        pass

In [9]:
us.to_csv('../data/us_with_reddit.csv')

In [10]:
for state_df in state_dataframes:
    filename = state_df['state'][0].lower().replace(' ', '_') + '.csv'
    state_df.to_csv(f'../data/states/{filename}')

In [11]:
pd.concat(state_dataframes).to_csv('../data/states_with_reddit.csv')