In [1]:
import os
import json


import matplotlib.pyplot as plt
import matplotlib.patches as patch
%matplotlib inline


import pandas as pd
import numpy as np

# Loading data and preprocessing

In [56]:
%%time
df = pd.DataFrame.from_csv("./labeled.control.dump.csv")
df["time"] = pd.to_datetime(df.created_at)
df = df.set_index("time")

CPU times: user 2min 18s, sys: 907 ms, total: 2min 19s
Wall time: 2min 19s


In [None]:
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
path = "/Users/JasonLiu/dump/predicted/"
files = os.listdir(path)

# os.listdir returns entries in arbitrary order, so the previous `files[1:]`
# dropped an unpredictable entry. Presumably the intent was to skip a hidden
# file such as .DS_Store -- TODO confirm. Hidden entries are skipped
# explicitly and the rest are sorted so the concatenated row order is
# reproducible across runs.
prediction_files = sorted(name for name in files if not name.startswith("."))
df = pd.concat([pd.read_csv(os.path.join(path, name)) for name in prediction_files])
df["time"] = pd.to_datetime(df["time"])

In [57]:
# "fp" is the product of the alcohol and first-person SVC prediction columns.
df["fp"] = df["prediction_alcohol_svc"] * df["prediction_firstperson_svc"]

# Each first-person level column is scaled by the same two SVC predictions
# and stored under a shorter name.
# NOTE(review): level_1 is not used here -- confirm that is intentional.
col = [
    "prediction_firstperson_level_0",
    "prediction_firstperson_level_2",
    "prediction_firstperson_level_3",
]
new_fp_cols = ["casual", "looking", "reflecting"]
for position, new_name in enumerate(new_fp_cols):
    old_name = col[position]
    df[new_name] = (
        df[old_name]
        * df["prediction_alcohol_svc"]
        * df["prediction_firstperson_svc"]
    )

In [58]:
# Two-letter code -> full name lookup. Besides the 50 states this includes
# DC, several territories (AS, GU, MP, PR, VI) and a non-postal "NA" entry
# that maps to "National".
states = {
    "AK": "Alaska",
    "AL": "Alabama",
    "AR": "Arkansas",
    "AS": "American Samoa",
    "AZ": "Arizona",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DC": "District of Columbia",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "GU": "Guam",
    "HI": "Hawaii",
    "IA": "Iowa",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "MA": "Massachusetts",
    "MD": "Maryland",
    "ME": "Maine",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MO": "Missouri",
    "MP": "Northern Mariana Islands",
    "MS": "Mississippi",
    "MT": "Montana",
    "NA": "National",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "NE": "Nebraska",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NV": "Nevada",
    "NY": "New York",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "PR": "Puerto Rico",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VA": "Virginia",
    "VI": "Virgin Islands",
    "VT": "Vermont",
    "WA": "Washington",
    "WI": "Wisconsin",
    "WV": "West Virginia",
    "WY": "Wyoming",
}

# Reverse lookup: full name -> two-letter code.
states_inverted = {full_name: code for code, full_name in states.items()}

In [59]:
def map2name(s):
    """Map a place full-name string to a full state name, or "<Other>".

    Two input shapes are recognised:

    * "<State name>" followed by a 5-character suffix, in a string that
      contains "USA" (e.g. "Texas, USA"): the part before the suffix is
      returned when it is a known name in ``states_inverted``.
    * Any string whose last two characters are a key of ``states``
      (e.g. "Austin, TX"): the corresponding full name is returned.

    Parameters
    ----------
    s : str
        Place name to classify. Non-string values yield "<Other>".

    Returns
    -------
    str
        A value from ``states`` or the sentinel "<Other>".
    """
    if not isinstance(s, str):
        return "<Other>"

    if "USA" in s:
        # Drop the trailing 5 characters (", USA") and check for a state name.
        state = s[:-5]
        if state in states_inverted:
            return state
        # Otherwise fall through: "USA" may just be part of a place name
        # (e.g. "USAF Academy, CO"), which still ends in a usable state code.

    # Look up the trailing two-letter code; unknown codes map to "<Other>".
    return states.get(s[-2:], "<Other>")

In [63]:
# Label every row with a state name (or "<Other>") derived from its place
# name; values are cast to str first so map2name always receives a string.
df["location"] = (
    df["place_fullname"]
    .astype(str)
    .apply(map2name)
)

# Alcohol Dependence in the Past Year, by Age Group and State: Percentages, Annual Averages Based on 2013 and 2014 NSDUHs 

In [64]:
# Survey table of alcohol dependence estimates, indexed by its "State" column.
alcohol_depend = (
    pd.read_csv("./academic_data/alcohol_dependence.csv")
    .set_index("State")
)

In [65]:
# Take the first four characters of each "18 or Older" estimate and parse
# them as floats.
# NOTE(review): the slice presumably strips a trailing marker after a
# "d.dd" value -- confirm; an estimate >= 10 would lose a digit.
dep = (
    alcohol_depend["18 or Older\rEstimate"]
    .apply(lambda text: text[:4])
    .astype(float)
)

In [66]:
# Group the rows by their derived state label.
location = df.groupby(by="location")

In [67]:
# Mean "fp" value per location, as a one-column DataFrame.
fp_aggregation = {"fp": "mean"}
dep_fp = location.agg(fp_aggregation)

In [68]:
# Put the per-location mean "fp" next to the survey estimate. This is a
# left join on the index, so locations missing from `dep` get NaN.
temp = dep_fp.join(dep, how="left")
temp.columns = ["predicted", "measured"]

In [69]:
import seaborn as sns

In [76]:
# Five locations with the lowest predicted value.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
temp.sort_values("predicted")["predicted"].head()

location
South Dakota            0.059812
Nebraska                0.086705
District of Columbia    0.104954
<Other>                 0.110625
Alabama                 0.146060
Name: predicted, dtype: float64

In [77]:
# Five locations with the lowest measured value.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
temp.sort_values("measured")["measured"].head()

location
Florida           2.94
Pennsylvania      2.97
New Jersey        2.99
North Carolina    3.11
Texas             3.14
Name: measured, dtype: float64