## Notebook 2 - Combine Data and Add Columns
The purpose of this notebook is twofold:  
1. Add some features to the data: the country, plus individual date components — year, month, day of week (where 0 = Monday, 1 = Tuesday, etc.) and hour of day adjusted from UTC to local time.
1. Join all the dataframes into one.

In [None]:
import pandas as pd

In [None]:
def adjust_hour(hour, offset):
    """Convert a UTC hour of day to the local hour of day.

    The API reports timestamps in UTC, so the hour is shifted back by
    ``offset`` hours and wrapped onto the 24-hour clock.

    Parameters
    ----------
    hour : int
        Hour of day in UTC (0-23).
    offset : int
        Number of hours the local zone is behind UTC (e.g. 7 for UTC-7).
        Negative values describe zones ahead of UTC.

    Returns
    -------
    int
        Local hour of day; for integer inputs this is always in 0-23.
    """
    # Modulo wraps in both directions, so zones ahead of UTC (negative
    # offset) and offsets larger than a day also land inside 0-23.
    return (hour - offset) % 24

In [None]:
def add_date_features(df, offset=0):
    """Split the ``date`` column into separate local-time features.

    Adds four columns to ``df`` in place: ``year``, ``month``, ``weekday``
    (0 = Monday ... 6 = Sunday) and ``local_hour``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column of datetime-like values.
    offset : int, default 0
        Number of hours the local zone is behind the timestamps in ``date``
        (e.g. 7 for UTC-7). With the default of 0 the features describe the
        timestamps unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four feature columns added.
    """
    # Shift the whole timestamp, not just the hour: otherwise a tweet sent
    # late in the local evening would keep the weekday/month/year of the
    # following UTC day while its hour is reported in local time.
    local_time = pd.to_datetime(df['date']) - pd.Timedelta(hours=offset)

    df['year'] = local_time.dt.year
    df['month'] = local_time.dt.month
    df['weekday'] = local_time.dt.weekday
    df['local_hour'] = local_time.dt.hour

    return df

In [None]:
# Load one pickled tweet DataFrame per account (presumably written by the
# previous notebook — confirm).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
tweet_pickles = (
    '../data/sfspca_tweets.p',
    '../data/pspca_tweets.p',
    '../data/houston_tweets.p',
    '../data/texas_tweets.p',
    '../data/tulsa_tweets.p',
    '../data/richmond_tweets.p',
    '../data/ontario_tweets.p',
    '../data/alberta_tweets.p',
    '../data/bc_tweets.p',
)
(
    df_sfspca, df_pspca, df_houston,
    df_texas, df_tulsa, df_richmond,
    df_ontario, df_alberta, df_bc,
) = map(pd.read_pickle, tweet_pickles)

In [None]:
# Attach the date features to every account's frame, grouped by offset.
# The second argument is the number of hours subtracted from the UTC hour;
# the values are fixed, so they do not vary with daylight saving time.
df_sfspca, df_bc = (
    add_date_features(frame, 7) for frame in (df_sfspca, df_bc)
)
df_alberta = add_date_features(df_alberta, 6)
df_houston, df_texas, df_tulsa = (
    add_date_features(frame, 5) for frame in (df_houston, df_texas, df_tulsa)
)
df_pspca, df_richmond, df_ontario = (
    add_date_features(frame, 4) for frame in (df_pspca, df_richmond, df_ontario)
)

In [None]:
# Stack the per-account frames into one frame per country; ignore_index
# discards the original row labels and renumbers the rows from 0.
usa_frames = [df_sfspca, df_pspca, df_houston, df_texas, df_tulsa, df_richmond]
canada_frames = [df_ontario, df_alberta, df_bc]

df_usa = pd.concat(usa_frames, ignore_index=True)
df_canada = pd.concat(canada_frames, ignore_index=True)

In [None]:
# add country to each dataframe
df_usa['country'] = 'usa'
df_canada['country'] = 'canada'
df_usa.shape, df_canada.shape

In [None]:
# Persist the per-country frames.
for frame, destination in (
    (df_usa, '../data/usa_tweets.p'),
    (df_canada, '../data/canada_tweets.p'),
):
    frame.to_pickle(destination)

In [None]:
# Combine both countries into a single frame with a fresh 0-based index,
# then display its shape.
country_frames = [df_usa, df_canada]
df = pd.concat(country_frames, ignore_index=True)
df.shape

In [None]:
# Persist the combined frame.
df.to_pickle(path='../data/all_tweets.p')