Current issue: prevalence data not working with to_timeseries() function.
1. Possibly because the determine_proper_dataset() function is not working.

In [68]:
import pandas as pd
import csv
import us
import matplotlib # for testing
%matplotlib inline

# A list of all the URLs to state and county newdx data sets
raw_newdx_urls = {
    2015: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_NewDX_2015.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_NewDX_2015.xlsx"},
    2014: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_NewDX_2014.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_NewDX_2014.xlsx"},
    2013: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_NewDX_2013.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_NewDX_2013.xlsx"},
    2012: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_NewDX_2012.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_NewDX_2012.xlsx"},
    2011: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_NewDX_2011.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_NewDX_2011.xlsx"},
    2010: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_NewDX_2010.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_NewDX_2010.xlsx"},
    2009: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_NewDX_2009.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_NewDX_2009.xlsx"},
    2008: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_NewDX_2008.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_NewDX_2008.xlsx"}
}

# A list of all the URLs to state and county prev data sets.
# Note that there is no prevalence data for 2008, 2009, or 2015.
raw_prev_urls = {
    2014: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_Prev_2014.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_Prev_2014.xlsx"},
    2013: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_Prev_2013.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_Prev_2013v2.xlsx"},
    2012: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_2012-2-1.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_2012-1.xls"},
    2011: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_20111.xlsx",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_2011-1-1.xlsx"},
    2010: {"state":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_State_2010-1.xls",
        "county":"https://github.com/jamiekasulis/aidsvu_data_grab/raw/master/AIDSVu_County_20101.xls"}
}

In [69]:
def make_raw_df(year, sc, dataset):
    """
    Returns a raw, uncleaned dataframe corresponding to year and state or county in either raw_newdx_urls or raw_prev_urls.
    year is a full four-digit year int, sc is the string "state" or "county", and dataset is the string "prev" or "newdx".
    """
    if dataset == "newdx":
        return pd.read_excel(raw_newdx_urls.get(year).get(sc), skiprows=2)
    elif dataset == "prev":
        if year == 2012:
            # 2012 is a special case where you should not skip any rows.
            return pd.read_excel(raw_prev_urls.get(year).get(sc), skiprows=0)
        elif year == 2011:
            return pd.read_excel(raw_prev_urls.get(year).get(sc), skiprows=0)
        elif year == 2010:
            return pd.read_excel(raw_prev_urls.get(year).get(sc), skiprows=0)
        else:
            return pd.read_excel(raw_prev_urls.get(year).get(sc), skiprows=2)
    else:
        return None

In [70]:
def omit_us_territories(countydf):
    """
    Takes a county data frame and returns a version of it that only includes the 50 U.S. states.
    This is a helper function for clean_newdx_dfs()
    """
    if 'GEO ID' in countydf.columns:
        countydf = countydf[countydf['GEO ID'].astype('int64') < 60000] # U.S. territories/non-states have geo IDs 60000 and above
    return countydf

In [19]:
# make_raw_df for all newdx data and store in newdx_dfs
def make_clean_newdx_dfs():
    """
    Makes and returns a dictionary of newdx data indexed by year and state/county.
    County data will have their U.S. territories removed.
    """
    newdx_dfs = {}
    for year in raw_newdx_urls.keys():
        # make the data frames
        # strip columns and set them to lower case
        newdx_dfs[year] = {"state": make_columns_lowercase(omit_us_territories(make_raw_df(year, 'state', 'newdx'))),
                       "county": make_columns_lowercase(omit_us_territories(make_raw_df(year, 'county', 'newdx')))}
        
    return newdx_dfs

In [28]:
# make_raw_df for all prev data and store in prev_dfs
def make_clean_prev_dfs():
    """
    Makes and returns a dictionary of prevalence data indexed by year and state/county.
    County data will have their U.S. territories removed.
    """
    prev_dfs = {}
    for year in raw_prev_urls.keys():
        print(year)
        prev_dfs[year] = {"state": make_raw_df(year, 'state', 'prev').rename(columns=lambda x: x.strip().lower()),
                       "county": omit_us_territories(make_raw_df(year, 'county', 'prev')).rename(columns=lambda x: x.strip().lower())}

    return prev_dfs

In [29]:
prev_dfs = make_clean_prev_dfs()

2014
2013
2012
2011
2010


In [30]:
prev_dfs[2010]['state'].columns

Index(['state', 'state fips code', 'state rate', 'state rate decile',
       'state rate stability', 'state cases', 'state cases decile',
       'male rate', 'male rate decile', 'male rate stability',
       ...
       'gonorrhea rank', 'gonorrhea cases', 'gonorrhea rate', 'syphilis rank',
       'syphilis cases', 'syphilis rate', 'black/white male rr',
       'hispanic/white male rr', 'black/white female rr',
       'hispanic/white female rr'],
      dtype='object', length=148)

In [31]:
newdx_dfs = make_clean_newdx_dfs()

In [11]:
newdx_dfs[2008]['county'].columns

Index(['geo id', 'year', 'state abbreviation', 'state', 'county name',
       'new diagnoses rate', 'new diagnoses rate stability',
       'new diagnoses cases', '2013 nchs urbanicity code'],
      dtype='object')

In [71]:
def state_or_county(loc_name):
    """
    A helper function for to_timeseries. Given the name of a location, this function
    will return 'county' or 'city' based on whether loc_name ends with 'county' or not.
    """
    print("state_or_county(" + loc_name + ")...")
    if 'county' in loc_name.lower():
        print("\treturning 'county'...")
        return 'county' # return 'county name' and not 'county' because this is the name of the column in county data
    else:
        print("\treturning 'state'...")
        return 'state'

In [72]:
def determine_proper_dataset(column_name, loc_type):
    """
    A helper function for determine_proper_years() and to_timeseries().
    Returns either newdx_dfs or prev_dfs, whichever one has a column called column_name.
    loc_type should be either 'city' or 'county'
    """
    print("determine_proper_dataset(" + column_name + ", " + loc_type + ")")
    
    # Check newdx_dfs first.
    for year in newdx_dfs.keys():
        print("\tLooking at newdx_dfs[" + year + "][" + loc_type + "]")
        df = newdx_dfs[year][loc_type]
        df_cols = df.columns.tolist()
        if column_name in df_cols:
            print("\t" + column_name + " found in newdx_dfs!")
            return newdx_dfs
    
    # Check prev_dfs
    for year in prev_dfs.keys():
        print("\tLooking at prev_dfs[" + year + "][" + loc_type + "]")
        df = newdx_dfs[year][loc_type]
        df_cols = df.columns.tolist()
        if column_name in df_cols:
            print("\t" + column_name + " found in prev_dfs!")
            return prev_dfs

    print("ERROR: column_name does not exist in newdx_dfs or prev_dfs based on argument loc_type (county or city).")
    return None

In [34]:
# Determine the proper start and end years to build the time series on.
# If start_year is not a key in the dictionary of data frames, do the closest year working DOWN
# For end_year, do the closest year working UP.
def determine_proper_years(start_year, end_year, data_dict):
    """
    A helper function for to_timeseries. It verifies that data_dict has data for start_year and end_year.
    If not, it will return the minimum start_year and maximum end_year to make a time series as wide as possible.
    Returns a list [start, end]
    data_dict should be prev_dfs or newdx_dfs.
    """
    print("determine_proper_years(" + str(start_year) + ", " + str(end_year))
    start = None
    end = None
    
    if start_year in data_dict.keys():
        start = start_year
    else:
        start = min(list(data_dict.keys()))
    if end_year in data_dict.keys():
        end = end_year
    else:
        end = max(list(data_dict.keys()))

    print("\tProper years: " + str([start, end]))
    return [start, end]

In [35]:
def get_valid_years(year_range, data_dict):
    """
    A helper function for get_timeseries.
    Given a 2-element list [start_year, end_year] and a data_dict that is either newdx_dfs or prev_dfs,
    return an ordered list (ascending) of years that are actually in data_dict.
    This function is meant to account for the fact that there may be missing years of data.
    """
    print("get_valid_years(" + str(year_range) + ")")
    start = year_range[0]
    end = year_range[1]
    
    years = list(range(start, end+1)) # all the years in year_range
    proper_years = [] # ordered list of the years in year_range that we have data for
    
    # add the years that we have data for to proper_years
    keys = data_dict.keys()
    for year in years:
        if year in keys:
            proper_years.append(year)
    
    # impose ascending order on proper_years
    proper_years.sort()
    print("\tValid years: " + str(proper_years))
    return proper_years

In [76]:
def to_timeseries(column_name, location, start_year=2008, end_year=2014, loc_header = 'default', data_set='newdx'):
    """
    Returns a 2D table (year x value) going from start_year to end_year, where the values are from whichever column in
    either newdx or prev data that matches the column_name argument.
    The argument data_set should be either "prev" or "newdx". Defaults to newdx if no value is passed.
    Defaults to 2008 and 2014 for start and end years.
    """
    # Determine if location is a city or county
    # city_or_county() returns 'city' or 'county'
    # we will switch that with the proper header name, loc_header
    # and either 'state' or 'county' to index in the dictionary.
    loc_type = loc_header # give an initial value assignment
    if loc_header == 'default':
        loc_type = state_or_county(location)
        if loc_type == 'county':
            loc_header = 'county name'
            loc_type = 'county'
        elif loc_type == 'state':
            loc_header = 'state'
            loc_type = 'state'
        else:
            loc_type = loc_header
    
    # Grab the dictionary of data frames that we should be looking for column_name in
    data_dict = None
    if data_set == "prev":
        data_dict = prev_dfs
    elif data_set == "newdx":
        data_dict = newdx_dfs
    else:
        # Determine what the proper data set is.
        data_dict = determine_proper_dataset(column_name, loc_type)
        # If column_name does not exist in any data set, return from this function.
        if data_dict == None:
            return
    
    if data_set == "newdx":
        print("newdx")
    elif data_set == "prev":
        print("prev")
    else:
        print("Something is wrong with selecting the data set!")
    
    # Verify that the time range is valid
    time_range = determine_proper_years(start_year, end_year, data_dict)
    
    # Make an ordered list of the valid years in the time range (there may be holes), which we will iterate through later
    data_years = get_valid_years(time_range, data_dict)
    
    # Create the time series.
    time_series = pd.DataFrame(columns = ['year', column_name])
    print(time_series.head())
    
    # Iterate through every year in data_years, grabbing the value for column_name and storing it to the dataframe
    # called time_series.
    for dy in data_years:
        current_df = data_dict[dy][loc_type]
        value = current_df[current_df[loc_header] == location][column_name].iloc[0]
        time_series = time_series.append({'year': dy, column_name: value}, ignore_index=True)
    
    return time_series[['year', column_name]]

In [77]:
to_timeseries('new diagnoses state rate', 'Connecticut', 2008, 2014)

state_or_county(Connecticut)...
	returning 'state'...
newdx
determine_proper_years(2008, 2014
	Proper years: [2008, 2014]
get_valid_years([2008, 2014])
	Valid years: [2008, 2009, 2010, 2011, 2012, 2013, 2014]
Empty DataFrame
Columns: [year, new diagnoses state rate]
Index: []


Unnamed: 0,year,new diagnoses state rate
0,2008,12
1,2009,12
2,2010,13
3,2011,12
4,2012,10
5,2013,11
6,2014,10


In [78]:
to_timeseries('new diagnoses black rate', 'Connecticut', 2008, 2016)

state_or_county(Connecticut)...
	returning 'state'...
newdx
determine_proper_years(2008, 2016
	Proper years: [2008, 2015]
get_valid_years([2008, 2015])
	Valid years: [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
Empty DataFrame
Columns: [year, new diagnoses black rate]
Index: []


Unnamed: 0,year,new diagnoses black rate
0,2008,51
1,2009,56
2,2010,53
3,2011,47
4,2012,42
5,2013,44
6,2014,46
7,2015,33


In [79]:
newdx_dfs[2008]['county']['county name']

0           Autauga County
1           Baldwin County
2           Barbour County
3              Bibb County
4            Blount County
5           Bullock County
6            Butler County
7           Calhoun County
8          Chambers County
9          Cherokee County
10          Chilton County
11          Choctaw County
12           Clarke County
13             Clay County
14         Cleburne County
15           Coffee County
16          Colbert County
17          Conecuh County
18            Coosa County
19        Covington County
20         Crenshaw County
21          Cullman County
22             Dale County
23           Dallas County
24           Dekalb County
25           Elmore County
26         Escambia County
27           Etowah County
28          Fayette County
29         Franklin County
               ...        
3114       Washburn County
3115     Washington County
3116       Waukesha County
3117        Waupaca County
3118       Waushara County
3119      Winnebago County
3

In [80]:
for c in prev_dfs[2010]['state'].columns:
    print(c)

state
state fips code
state rate
state rate decile
state rate stability
state cases
state cases decile
male rate
male rate decile
male rate stability
male cases
male cases decile
female rate
female rate decile
female rate stability
female cases
female cases decile
white rate
white rate decile
white rate stability
white cases
white cases decile
black rate
black rate decile
black rate stability
black cases
black cases decile
hispanic rate
hispanic rate decile
hispanic rate stability
hispanic cases
hispanic cases decile
native american rate
native american rate decile
native american rate stability
native american cases
native american cases decile
asian rate
asian rate decile
asian rate stability
asian cases
asian cases decile
native hawaiian rate
native hawaiian rate decile
native hawaiian rate stability
native hawaiian cases
native hawaiian cases decile
age 13-24 rate
age 13-24 rate decile
age 13-24 rate stability
age 13-24 cases
age 13-24 cases decile
age 25-34 rate
age 25-34 rate dec

In [81]:
# Test: state newdx
to_timeseries('new diagnoses black rate', 'Connecticut', 2008, 2018)

state_or_county(Connecticut)...
	returning 'state'...
newdx
determine_proper_years(2008, 2018
	Proper years: [2008, 2015]
get_valid_years([2008, 2015])
	Valid years: [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
Empty DataFrame
Columns: [year, new diagnoses black rate]
Index: []


Unnamed: 0,year,new diagnoses black rate
0,2008,51
1,2009,56
2,2010,53
3,2011,47
4,2012,42
5,2013,44
6,2014,46
7,2015,33


In [82]:
# Test: county newdx
newdx_dfs[2010]['county'].columns
to_timeseries('new diagnoses rate', 'Fairfield County', 2008, 2018)

state_or_county(Fairfield County)...
	returning 'county'...
newdx
determine_proper_years(2008, 2018
	Proper years: [2008, 2015]
get_valid_years([2008, 2015])
	Valid years: [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
Empty DataFrame
Columns: [year, new diagnoses rate]
Index: []


Unnamed: 0,year,new diagnoses rate
0,2008,15
1,2009,14
2,2010,14
3,2011,13
4,2012,10
5,2013,12
6,2014,13
7,2015,8


In [86]:
# Test: state prev
to_timeseries('black rate', 'Connecticut', 2010, 2012, data_set='prev')

state_or_county(Connecticut)...
	returning 'state'...
prev
determine_proper_years(2010, 2012
	Proper years: [2010, 2012]
get_valid_years([2010, 2012])
	Valid years: [2010, 2011, 2012]
Empty DataFrame
Columns: [year, black rate]
Index: []


IndexError: single positional indexer is out-of-bounds

In [85]:
# Test: county prev
prev_dfs[2010]['county'].columns
to_timeseries('black rate', 'Hartford County', 2010, 2014, 'prev')

newdx
determine_proper_years(2010, 2014
	Proper years: [2010, 2014]
get_valid_years([2010, 2014])
	Valid years: [2010, 2011, 2012, 2013, 2014]
Empty DataFrame
Columns: [year, black rate]
Index: []


KeyError: 'prev'