In [5]:
import re
import math
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly
from plotly.offline import iplot, init_notebook_mode

init_notebook_mode(connected=True)

def plot(figure):
    """Render ``figure`` inline in the notebook through plotly's offline ``iplot``."""
    render = plotly.offline.iplot
    render(figure)

In [6]:
# Two-column, header-less CSV: state abbreviation ('alpha') and its numeric code ('num').
states = pd.read_csv('state_alpha_to_numeric.csv', sep=',', header=None, names=['alpha','num'])
alphas = list(states['alpha'])
nums = list(states['num'])

# statenum maps each abbreviation to its code rendered as a zero-padded, two-character string.
statenum = {}
for state, code in zip(alphas, nums):
    statenum[state] = str(code).zfill(2)
    

In [7]:
# Load the full tab-separated census table. low_memory=False makes pandas read the file in
# one pass instead of chunk-by-chunk type inference.
agCensus = pd.read_csv('2017_cdqt_data.txt', sep='\t', header=0, low_memory=False)

# Keep chapter-2, county-level rows of the 'ANIMALS & PRODUCTS' sector, excluding table 1.
# .copy() detaches the subset from agCensus so the column assignments below act on an
# independent frame rather than on a slice (avoids SettingWithCopyWarning).
agCensusAnimals = agCensus[
        (agCensus['CENSUS_CHAPTER'] == 2) &
        (agCensus['SECTOR_DESC'] == 'ANIMALS & PRODUCTS') &
        (agCensus['AGG_LEVEL_DESC'] == 'COUNTY') &
        (agCensus['CENSUS_TABLE'] != 1)
    ].copy()

# FIPS = 2-digit state code followed by 3-digit county code, both zero-padded strings.
agCensusAnimals['FIPS'] = (
    agCensusAnimals['STATE_FIPS_CODE'].apply(lambda x: str(int(x)).zfill(2))
    + agCensusAnimals['COUNTY_CODE'].apply(lambda x: str(int(x)).zfill(3))
)

# The filter columns are constant after the selection above and the two code columns are
# now folded into FIPS, so drop them. Rebinding (instead of inplace=True) keeps the cell
# safe to re-run.
agCensusAnimals = agCensusAnimals.drop(
    columns=['CENSUS_CHAPTER', 'CENSUS_TABLE', 'SECTOR_DESC',
             'AGG_LEVEL_DESC', 'STATE_FIPS_CODE', 'COUNTY_CODE'])
agCensusAnimals.columns

Index(['CENSUS_ROW', 'CENSUS_COLUMN', 'SHORT_DESC', 'COMMODITY_DESC',
       'STATE_ALPHA', 'STATE_NAME', 'COUNTY_NAME', 'DOMAINCAT_DESC', 'VALUE',
       'FIPS'],
      dtype='object')

In [4]:
# Rows whose COUNTY_NAME is a non-empty string (missing names count as no match).
has_county_name = agCensusAnimals['COUNTY_NAME'].str.contains('.+', na=False)

# animalCensus[state] holds that state's rows of agCensusAnimals that have a county name.
animalCensus = {}
for state in statenum.keys():
    in_state = agCensusAnimals['STATE_ALPHA'] == state
    animalCensus[state] = agCensusAnimals[in_state & has_county_name]

In [None]:
animalCensus['TX']['STATE_ALPHA'].unique()

In [5]:
# Collect every distinct SHORT_DESC seen for Texas, then keep those containing ' - INVENTORY'.
inventory_filter = re.compile('^.* - INVENTORY')
shortdescs = list(animalCensus['TX']['SHORT_DESC'].unique())
animal_keys = [desc for desc in shortdescs if inventory_filter.match(desc)]
animal_keys

['CATTLE, INCL CALVES - INVENTORY',
 'CATTLE, COWS - INVENTORY',
 'CATTLE, COWS, BEEF - INVENTORY',
 'CATTLE, COWS, MILK - INVENTORY',
 'CATTLE, (EXCL COWS) - INVENTORY',
 'CATTLE, ON FEED - INVENTORY',
 'HOGS - INVENTORY',
 'SHEEP, INCL LAMBS - INVENTORY',
 'GOATS - INVENTORY',
 'GOATS, MILK - INVENTORY',
 'GOATS, ANGORA - INVENTORY',
 'GOATS, MEAT & OTHER - INVENTORY',
 'EQUINE, HORSES & PONIES - INVENTORY',
 'EQUINE, MULES & BURROS & DONKEYS - INVENTORY',
 'CHICKENS, LAYERS - INVENTORY',
 'CHICKENS, PULLETS, REPLACEMENT - INVENTORY',
 'CHICKENS, BROILERS - INVENTORY',
 'TURKEYS - INVENTORY',
 'CHUKARS - INVENTORY',
 'DUCKS - INVENTORY',
 'EMUS - INVENTORY',
 'GEESE - INVENTORY',
 'GUINEAS - INVENTORY',
 'PARTRIDGES, HUNGARIAN - INVENTORY',
 'OSTRICHES - INVENTORY',
 'PEAFOWL, HENS & COCKS - INVENTORY',
 'PHEASANTS - INVENTORY',
 'PIGEONS & SQUAB - INVENTORY',
 'QUAIL - INVENTORY',
 'RHEAS - INVENTORY',
 'CHICKENS, ROOSTERS - INVENTORY',
 'POULTRY, OTHER - INVENTORY',
 'HONEY, BEE CO

In [6]:
# Columns removed from every per-state, per-key inventory frame once it has been filtered.
_INVENTORY_DROP_COLUMNS = ['CENSUS_ROW', 'CENSUS_COLUMN', 'DOMAINCAT_DESC', 'STATE_ALPHA',
                           'STATE_NAME', 'SHORT_DESC', 'COMMODITY_DESC']


def _clean_inventory(state_frame, short_desc):
    """Return the cleaned inventory rows of ``state_frame`` for one SHORT_DESC value.

    Parameters
    ----------
    state_frame : pandas.DataFrame
        One state's rows (an entry of ``animalCensus``).
    short_desc : str
        The SHORT_DESC value to select (an entry of ``animal_keys``).

    Returns
    -------
    pandas.DataFrame
        Rows whose SHORT_DESC equals ``short_desc`` and whose DOMAINCAT_DESC is null,
        with VALUE converted to int and the columns in ``_INVENTORY_DROP_COLUMNS`` removed.
        Rows whose VALUE is not a number after stripping thousands separators
        (for example values containing 'D', or missing values) are dropped.
    """
    rows = state_frame[
        (state_frame['SHORT_DESC'] == short_desc)
        & state_frame['DOMAINCAT_DESC'].isnull()
    ]
    # Strip ',' separators, then coerce: anything that is not a number becomes NaN and is
    # dropped below, instead of crashing int() or the boolean negation on NaN.
    counts = pd.to_numeric(
        rows['VALUE'].astype(str).str.replace(',', '', regex=False),
        errors='coerce',
    )
    # assign/dropna/astype/drop each return a new frame, so nothing is written into a slice.
    return (
        rows.assign(VALUE=counts)
            .dropna(subset=['VALUE'])
            .astype({'VALUE': int})
            .drop(columns=_INVENTORY_DROP_COLUMNS)
    )


# animalInventories[state][key] -> cleaned frame for that state and inventory key.
animalInventories = {
    state: {key: _clean_inventory(animalCensus[state], key) for key in animal_keys}
    for state in statenum.keys()
}

In [7]:
# Index every inventory frame by its FIPS column.
for state in statenum.keys():
    for key in animal_keys:
        frame = animalInventories[state][key]
        # set_index moves FIPS out of the columns, so a second run of this cell would
        # raise KeyError; the guard makes re-running a no-op.
        if 'FIPS' in frame.columns:
            animalInventories[state][key] = frame.set_index('FIPS')

In [None]:
animalInventories['NY']['CATTLE, INCL CALVES - INVENTORY']

# Should now try to reindex by county code here.

In [38]:
disease_data_states = ['TX', 'NY', 'WA']
disease_data_diseases = ['Campylobacteriosis', 'Salmonellosis', 'STEC']

# disease_data[state][disease] -> frame read from 'Disease Data/<state>/<state>_<disease>.csv'.
disease_data = {}
for state in disease_data_states:
    disease_data[state] = {}
    for disease in disease_data_diseases:
        csv_path = f'Disease Data/{state}/{state}_{disease}.csv'
        disease_data[state][disease] = pd.read_csv(csv_path, header=0, sep=',')

In [1]:
def rate_columns(state_alpha, disease):
    """Return, in column order, the names of the ``'<four digits> RATE'`` columns.

    The columns are read from ``disease_data[state_alpha][disease]``.
    """
    is_rate_column = re.compile(r'^\d\d\d\d RATE$').search
    return [name for name in disease_data[state_alpha][disease].columns if is_rate_column(name)]

def corr_count(s):
    """Return the first whitespace-separated token of ``s`` followed by ``' COUNT'``.

    Example: ``'2017 RATE'`` -> ``'2017 COUNT'``.
    """
    leading_token = s.split(maxsplit=1)[0]
    return f'{leading_token} COUNT'

# County reference table, indexed by FIPS while also keeping FIPS as a regular column.
countyInfo = pd.read_csv('US_County_Info.csv', sep=',', header=0).set_index('FIPS', drop=False)

def FIPS(state_alpha, county_name):
    """Look up a county's FIPS code in ``countyInfo`` and return it as a string.

    Parameters
    ----------
    state_alpha : str
        Value compared (exactly) against the 'State' column.
    county_name : str
        County name; compared case-insensitively against the 'County' column.

    Returns
    -------
    str
        ``str()`` of the 'FIPS' value of the first matching row.

    Raises
    ------
    IndexError
        If no row matches. The exception type is the one the bare ``[0]`` lookup used
        to raise; the message now names the county and state that failed.
    """
    in_state = countyInfo['State'] == state_alpha
    same_name = countyInfo['County'].str.upper() == county_name.upper()
    matches = countyInfo.loc[in_state & same_name, 'FIPS']
    if matches.empty:
        raise IndexError(
            f'no county named {county_name!r} found for state {state_alpha!r} in countyInfo'
        )
    return str(matches.iloc[0])

def infer_rate(state_alpha, county_name, count, rate):
    """Return ``rate`` as a float, or derive it from ``count`` when ``rate`` is NaN.

    When ``rate`` is NaN the result is ``count`` divided by the county's
    'Population\\n(2010)' value (looked up in ``countyInfo`` by FIPS) expressed in
    units of 100,000.
    """
    if not np.isnan(rate):
        return float(rate)
    fips = int(FIPS(state_alpha, county_name))
    population = float(countyInfo['Population\n(2010)'][fips])
    return float(count) / (population / 100000)

NameError: name 'pd' is not defined

In [40]:
countyInfo

Unnamed: 0_level_0,State,FIPS,County,County Seat(s),Population\n(2010),Land Area\nkm²,Land Area\nmi²,Water Area\nkm²,Water Area\nmi²,Total Area\nkm²,Total Area\nmi²,Latitude,Longitude
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1001,AL,1001,Autauga,Prattville,54571,1539.582,594.436,25.776,9.952,1565.358,604.388,+32.536382°,–86.644490°
1003,AL,1003,Baldwin,Bay Minette,182265,4117.522,1589.784,1133.19,437.527,5250.712,2027.311,+30.659218°,–87.746067°
1005,AL,1005,Barbour,Clayton,27457,2291.819,884.876,50.865,19.639,2342.684,904.515,+31.870670°,–85.405456°
1007,AL,1007,Bibb,Centreville,22915,1612.481,622.582,9.289,3.587,1621.770,626.169,+33.015893°,–87.127148°
1009,AL,1009,Blount,Oneonta,57322,1669.962,644.776,15.157,5.852,1685.119,650.628,+33.977448°,–86.567246°
1011,AL,1011,Bullock,Union Springs,10914,1613.057,622.805,6.057,2.338,1619.113,625.143,+32.101759°,–85.717261°
1013,AL,1013,Butler,Greenville,20947,2011.977,776.829,2.727,1.053,2014.704,777.882,+31.751667°,–86.681969°
1015,AL,1015,Calhoun,Anniston,118572,1569.190,605.868,16.624,6.419,1585.814,612.287,+33.771706°,–85.822513°
1017,AL,1017,Chambers,Lafayette,34215,1545.009,596.531,17.048,6.582,1562.057,603.113,+32.917943°,–85.391812°
1019,AL,1019,Cherokee,Centre,25989,1434.076,553.700,119.859,46.278,1553.935,599.978,+34.069515°,–85.654242°


In [41]:
disease_data['NY']['STEC']

Unnamed: 0,COUNTY,2017 COUNT,2017 RATE,2016 COUNT,2016 RATE,2015 COUNT,2015 RATE,2014 COUNT,2014 RATE
0,ALBANY,5,1.6,9,2.9,3,0.9,7,2.3
1,ALLEGANY,0,0.0,1,2.1,1,2.1,0,0.0
2,BROOME,3,1.5,5,2.5,4,2.0,6,3.0
3,CATTARAUGUS,2,2.6,1,1.3,1,1.3,0,0.0
4,CAYUGA,3,3.9,2,2.6,0,0.0,0,0.0
5,CHAUTAUQUA,2,1.5,3,2.3,4,3.0,3,2.3
6,CHEMUNG,0,0.0,1,1.1,1,1.1,1,1.1
7,CHENANGO,1,2.1,0,0.0,1,2.0,1,2.0
8,CLINTON,1,1.2,1,1.2,0,0.0,1,1.2
9,COLUMBIA,0,0.0,2,3.3,1,1.6,0,0.0


In [42]:
# Convert every '<year> RATE' column to numeric; entries that cannot be parsed become NaN.
for state in disease_data_states:
    for disease in disease_data_diseases:
        frame = disease_data[state][disease]
        for col in rate_columns(state, disease):
            frame[col] = pd.to_numeric(frame[col], errors='coerce')

In [43]:
disease_data['WA']['STEC']

Unnamed: 0,COUNTY,2013 COUNT,2013 RATE,2014 COUNT,2014 RATE,2015 COUNT,2015 RATE,2016 COUNT,2016 RATE,2017 COUNT,2017 RATE
0,Adams,0,0.0,1,5.2,3,,0,0.0,0,0.0
1,Asotin,2,,1,4.6,1,,0,0.0,1,
2,Benton,12,6.5,9,4.9,8,4.2,12,6.3,11,5.7
3,Chelan,5,6.8,3,4.0,4,,1,,2,
4,Clallam,2,,0,0.0,2,,0,0.0,4,
5,Clark,51,11.7,27,6.1,45,10.0,25,5.4,38,8.1
6,Columbia,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
7,Cowlitz,0,0.0,3,,8,7.7,3,,6,5.7
8,Douglas,0,0.0,0,0.0,1,,0,0.0,0,0.0
9,Ferry,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [44]:
# Replace each NaN rate with one inferred from the matching '<year> COUNT' column and the
# county population (see infer_rate); non-NaN rates pass through as floats.
for state in disease_data_states:
    for disease in disease_data_diseases:
        frame = disease_data[state][disease]
        for col in rate_columns(state, disease):
            count_col = corr_count(col)
            frame[col] = frame.apply(
                lambda row: infer_rate(state, row['COUNTY'], row[count_col], row[col]),
                axis=1,
            )

In [45]:
animalCensus['TX']['STATE_ALPHA'].unique()

array(['TX'], dtype=object)

In [46]:
disease_data['WA']['STEC']

Unnamed: 0,COUNTY,2013 COUNT,2013 RATE,2014 COUNT,2014 RATE,2015 COUNT,2015 RATE,2016 COUNT,2016 RATE,2017 COUNT,2017 RATE
0,Adams,0,0.0,1,5.2,3,16.018795,0,0.0,0,0.0
1,Asotin,2,9.24941,1,4.6,1,4.624705,0,0.0,1,4.624705
2,Benton,12,6.5,9,4.9,8,4.2,12,6.3,11,5.7
3,Chelan,5,6.8,3,4.0,4,5.52082,1,1.380205,2,2.76041
4,Clallam,2,2.800964,0,0.0,2,2.800964,0,0.0,4,5.601927
5,Clark,51,11.7,27,6.1,45,10.0,25,5.4,38,8.1
6,Columbia,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
7,Cowlitz,0,0.0,3,2.929401,8,7.7,3,2.929401,6,5.7
8,Douglas,0,0.0,0,0.0,1,2.602066,0,0.0,0,0.0
9,Ferry,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [47]:
# AVG_RATE = sum of the yearly rate columns divided by the number of those columns.
for state in disease_data_states:
    for disease in disease_data_diseases:
        frame = disease_data[state][disease]
        yearly_rates = rate_columns(state, disease)
        running_total = pd.Series(0.0, index=frame.index)
        for column in yearly_rates:
            running_total = running_total + frame[column].astype(float)
        frame['AVG_RATE'] = running_total / len(yearly_rates)

In [48]:
disease_data['WA']['STEC']

Unnamed: 0,COUNTY,2013 COUNT,2013 RATE,2014 COUNT,2014 RATE,2015 COUNT,2015 RATE,2016 COUNT,2016 RATE,2017 COUNT,2017 RATE,AVG_RATE
0,Adams,0,0.0,1,5.2,3,16.018795,0,0.0,0,0.0,4.243759
1,Asotin,2,9.24941,1,4.6,1,4.624705,0,0.0,1,4.624705,4.619764
2,Benton,12,6.5,9,4.9,8,4.2,12,6.3,11,5.7,5.52
3,Chelan,5,6.8,3,4.0,4,5.52082,1,1.380205,2,2.76041,4.092287
4,Clallam,2,2.800964,0,0.0,2,2.800964,0,0.0,4,5.601927,2.240771
5,Clark,51,11.7,27,6.1,45,10.0,25,5.4,38,8.1,8.26
6,Columbia,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0.0
7,Cowlitz,0,0.0,3,2.929401,8,7.7,3,2.929401,6,5.7,3.851761
8,Douglas,0,0.0,0,0.0,1,2.602066,0,0.0,0,0.0,0.520413
9,Ferry,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0.0


In [None]:
# Exploratory cell: groups inventory keys by animal type and prints per-county layer and
# broiler counts for New York.
cow_keys = ['CATTLE, COWS, BEEF - INVENTORY',
                 'CATTLE, COWS, MILK - INVENTORY',
                 'CATTLE, (EXCL COWS) - INVENTORY']

pig_keys = ['HOGS - INVENTORY']

chicken_keys = ['CHICKENS, LAYERS - INVENTORY',
                'CHICKENS, PULLETS, REPLACEMENT - INVENTORY',
                'CHICKENS, BROILERS - INVENTORY',
                'CHICKENS, ROOSTERS - INVENTORY']

turkey_keys =  ['TURKEYS - INVENTORY']

# Short aliases for the six cattle inventory keys.
s1 = 'CATTLE, COWS - INVENTORY'
s2 = 'CATTLE, COWS, BEEF - INVENTORY'
s3 = 'CATTLE, COWS, MILK - INVENTORY'
s4 = 'CATTLE, (EXCL COWS) - INVENTORY'
s5 = 'CATTLE, INCL CALVES - INVENTORY'
s6 = 'CATTLE, ON FEED - INVENTORY'

d = animalInventories['NY']

# NOTE(review): this expression's value is not assigned, and it is not the last statement
# of the cell, so the result is neither stored nor displayed.
d[s1]['VALUE'] + d[s2]['VALUE'] + d[s3]['VALUE'] + d[s4]['VALUE'] - d[s5]['VALUE']

things = [s1, s2, s3, s4, s5, s6]

ck = chicken_keys

# Running totals over the counties that report both layers and broilers.
layers = 0
broilers = 0

# Iterates the index labels of the 'CATTLE, COWS' frame (presumably FIPS codes, set by
# the earlier set_index cell — confirm).
for c in d[s1].index:
    print(d[s1]['COUNTY_NAME'][c])
    #for s in chicken_keys:
        #if c in d[s].index:
        #    print(d[s]['VALUE'][c])
    # ck[0] is the layers key and ck[2] the broilers key; only counties present in both
    # frames contribute to the totals.
    if (c in d[ck[0]].index) and (c in d[ck[2]].index):
        layers = layers + d[ck[0]]['VALUE'][c]
        broilers = broilers + d[ck[2]]['VALUE'][c]
        print('LAYERS:', d[ck[0]]['VALUE'][c], 'BROILERS:', d[ck[2]]['VALUE'][c])
        
print('TOTAL LAYERS:', layers, 'TOTAL BROILERS:', broilers)

In [None]:
# NOTE(review): this bare expression is not the last statement of the cell, so it is not displayed.
animalInventories['NY']['CATTLE, INCL CALVES - INVENTORY']

# Prints, for each state and inventory key, how many distinct COMMODITY_DESC values occur.
# NOTE(review): the inventory-cleaning cell drops 'COMMODITY_DESC' from these frames, so
# this loop raises KeyError when run after it — confirm which frame was intended.
for state in statenum.keys():
    for key in animal_keys:
        print(animalInventories[state][key]['COMMODITY_DESC'].unique().size)

In [None]:
# For each inventory key, print how many distinct CENSUS_ROW / CENSUS_COLUMN values the
# Texas frame has, followed by the distinct CENSUS_ROW values themselves.
# NOTE(review): the inventory-cleaning cell drops both columns from these frames, so this
# raises KeyError when run after it.
for key in animal_keys:
    tx_frame = animalInventories['TX'][key]
    census_rows = tx_frame['CENSUS_ROW'].unique()
    print(key, census_rows.size, tx_frame['CENSUS_COLUMN'].unique().size)
    print(census_rows)

In [None]:
# Print the per-column non-null counts of each Texas inventory frame.
for key in animal_keys:
    non_null_counts = animalInventories['TX'][key].count()
    print(key, non_null_counts)

In [None]:
def print_stuff(state_alpha, key):
    """Print the distinct CENSUS_COLUMN values of ``animalInventories[state_alpha][key]``.

    NOTE(review): the inventory-cleaning cell drops 'CENSUS_COLUMN' from these frames, so
    this raises KeyError when called after that cell has run.
    """
    frame = animalInventories[state_alpha][key]
    distinct_columns = frame['CENSUS_COLUMN'].unique()
    print(distinct_columns)
    
def printy(key):
    """Call ``print_stuff`` for ``key`` with the state fixed to ``'TX'``."""
    print_stuff(state_alpha='TX', key=key)

# For every inventory key, print the key, then for each state a ':::<state>' marker line
# followed by whatever print_stuff prints for that state and key.
for key in animal_keys:
    print(key)
    for state in statenum.keys():
        print(':::' + state)
        print_stuff(state, key)

In [None]:
# Re-read the full census table, then narrow it in three steps:
# Texas rows -> rows with a non-empty county name -> 'ANIMALS & PRODUCTS' sector.
agCensus = pd.read_csv('2017_cdqt_data.txt', sep='\t', header=0, low_memory=False)

is_texas = agCensus['STATE_ALPHA'] == "TX"
texasAnimals_pre1 = agCensus[is_texas]

has_county_name = texasAnimals_pre1['COUNTY_NAME'].str.contains('.+', na=False)
texasAnimals_pre2 = texasAnimals_pre1[has_county_name]

in_animal_sector = texasAnimals_pre2['SECTOR_DESC'] == 'ANIMALS & PRODUCTS'
texasAnimals = texasAnimals_pre2[in_animal_sector]
#texasAnimals = texasAnimals_pre3[(texasAnimals_pre3['CENSUS_CHAPTER']==2) & (texasAnimals_pre3['CENSUS_TABLE']==19)]

In [None]:
# Texas rows for the hog inventory; .copy() so VALUE can be rewritten without assigning
# into a slice of texasAnimals.
hogs_pre1 = texasAnimals[texasAnimals['SHORT_DESC'] == 'HOGS - INVENTORY'].copy()

# Strip thousands separators BEFORE converting: to_numeric(errors='coerce') turns a string
# such as '1,234' into NaN, which would silently discard every county with 1,000+ hogs.
# Entries that are still not numbers afterwards become NaN.
hogs_pre1['VALUE'] = pd.to_numeric(
    hogs_pre1['VALUE'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

# Keep only rows with a numeric VALUE. (The previous isinstance(x, float) test added
# nothing after to_numeric and would have removed every row if the column parsed as ints.)
hogs_pre2 = hogs_pre1[hogs_pre1['VALUE'].notnull()]

# astype returns a new frame, so the integer conversion does not write into a slice.
hogs = hogs_pre2.astype({'VALUE': int})
hogs

In [None]:
# `texasChickenInventory` was only defined in a commented-out line (from an `agCensusTexas3`
# frame that does not exist in this notebook), so this cell raised NameError.
# NOTE(review): assumes `texasAnimals` is the intended source frame — confirm.
texasChickenInventory = texasAnimals[texasAnimals['COMMODITY_DESC'] == 'CHICKENS']

# Chapter 2, table 19 rows only.
texasChickens = texasChickenInventory[
    (texasChickenInventory['CENSUS_CHAPTER'] == 2) & (texasChickenInventory['CENSUS_TABLE'] == 19)
]

# .copy() because later cells add and rewrite columns on these two frames.
texasBroilers = texasChickens[texasChickens['SHORT_DESC'] == 'CHICKENS, BROILERS - INVENTORY'].copy()
texasLayers = texasChickens[texasChickens['SHORT_DESC'] == 'CHICKENS, LAYERS - INVENTORY'].copy()

In [None]:
# Work on an independent copy so the column assignments below do not write into a slice
# of texasChickens (avoids SettingWithCopyWarning).
texasLayers = texasLayers.copy()
texasLayers['COUNTY_CODE'] = texasLayers['COUNTY_CODE'].astype(int)
# FIPS string = 48000 + county code (48 is hard-coded as the state part of the code).
texasLayers['FIPS'] = (texasLayers['COUNTY_CODE'] + 1000*48).apply(str)
texasLayers = texasLayers.set_index('FIPS', drop=False)
# Remove thousands separators; VALUE stays a string column on texasLayers.
texasLayers['VALUE'] = texasLayers['VALUE'].str.replace(',', '', regex=False)

# texasLayers2: only rows whose VALUE is a number, with VALUE as int. Coercing drops
# '(D)' rows and any other non-numeric entry instead of crashing astype(int).
texasLayers2 = (
    texasLayers
    .assign(VALUE=pd.to_numeric(texasLayers['VALUE'], errors='coerce'))
    .dropna(subset=['VALUE'])
    .astype({'VALUE': int})
)

In [None]:
# Work on an independent copy so the column assignments below do not write into a slice
# of texasChickens (avoids SettingWithCopyWarning).
texasBroilers = texasBroilers.copy()
texasBroilers['COUNTY_CODE'] = texasBroilers['COUNTY_CODE'].astype(int)
# FIPS string = 48000 + county code (48 is hard-coded as the state part of the code).
texasBroilers['FIPS'] = (texasBroilers['COUNTY_CODE'] + 1000*48).apply(str)
texasBroilers = texasBroilers.set_index('FIPS', drop=False)
# Remove thousands separators; VALUE stays a string column on texasBroilers.
texasBroilers['VALUE'] = texasBroilers['VALUE'].str.replace(',', '', regex=False)

# texasBroilers2: only rows whose VALUE is a number, with VALUE as int. Coercing drops
# '(D)' rows and any other non-numeric entry instead of crashing astype(int).
texasBroilers2 = (
    texasBroilers
    .assign(VALUE=pd.to_numeric(texasBroilers['VALUE'], errors='coerce'))
    .dropna(subset=['VALUE'])
    .astype({'VALUE': int})
)

In [None]:
# Code to map increasing list of values to RGB colors
# start is a list of initial r, g, b values
# end is a list of final r, g, b values
# Color is linearly interpolated

def valsToColors(values, start, end):
    """Map a sequence of values to ``'rgb(r, g, b)'`` strings by linear interpolation.

    Parameters
    ----------
    values : sequence of numbers
        The first element is treated as the minimum and the last as the maximum,
        so the sequence is expected to be in increasing order.
    start : sequence of three numbers
        r, g, b assigned to the first value.
    end : sequence of three numbers
        r, g, b assigned to the last value.

    Returns
    -------
    list of str
        One ``'rgb(r, g, b)'`` string per input value, each channel rounded to an int.
        An empty input gives an empty list. If the first and last values are equal,
        every value gets the ``start`` colour (previously this divided by zero).
    """
    if len(values) == 0:
        return []

    minValue, maxValue = values[0], values[-1]
    span = maxValue - minValue
    # Interpolation factor in [0, 1] for increasing input; 0.0 when there is no spread.
    factors = [(value - minValue) / span if span != 0 else 0.0 for value in values]

    def blend(channel):
        """Interpolate one colour channel (0=r, 1=g, 2=b) for every factor."""
        return [int(round((1 - t) * start[channel] + t * end[channel])) for t in factors]

    return [f'rgb({r}, {g}, {b})' for r, g, b in zip(blend(0), blend(1), blend(2))]

In [None]:
fips = list(texasBroilers2['FIPS'])
values = list(texasBroilers2['VALUE'])

# Seven evenly spaced points from the smallest to the largest value (both included).
# All seven are used to build the colour list; only the five interior points are passed
# as bin boundaries.
all_endpts = list(np.mgrid[min(values):max(values):7j])
colorscale = valsToColors(all_endpts, [200,255,0], [255,50,0])
endpts = all_endpts[1:-1]

fig = ff.create_choropleth(
    fips=fips,
    values=values,
    scope=['TX'],
    binning_endpoints=endpts,
    colorscale=colorscale,
    county_outline={'color': 'rgb(0,0,0)', 'width': 0.5},
    legend_title='Number of Broiler Chickens',
)

fig.layout.template = None
fig.show()

In [None]:
texasSTEC = pd.read_csv('Texas_STEC_By_County.csv', sep=',', header=0)

In [None]:
# County reference table, indexed by FIPS while also keeping FIPS as a regular column.
countyInfo = pd.read_csv('US_County_Info.csv', sep=',', header=0).set_index('FIPS', drop=False)

def tx_fips(county):
    """Return the FIPS code (as a string) of the Texas county named ``county``.

    Delegates to ``FIPS`` with the state fixed to ``'TX'`` rather than repeating the same
    ``countyInfo`` lookup; the county name is matched case-insensitively there.

    Raises
    ------
    IndexError
        If ``countyInfo`` has no Texas county with that name.
    """
    return FIPS('TX', county)

In [None]:
# Attach each county's FIPS string, then index the table by it (FIPS stays a column too).
county_names = list(texasSTEC['County'])
texasSTEC['FIPS'] = list(map(tx_fips, county_names))
texasSTEC = texasSTEC.set_index('FIPS', drop=False)

In [None]:
def agg_rate(fips):
    """Return the sum of the '2008 IR' through '2017 IR' columns of ``texasSTEC`` at ``fips``.

    ``fips`` is an index label of ``texasSTEC``.
    """
    yearly_columns = (f'{year} IR' for year in range(2008, 2018))
    return sum(texasSTEC[column][fips] for column in yearly_columns)

In [None]:
texasSTEC['AGG_RATE'] = list(map(agg_rate, texasSTEC.index))

In [None]:
# Intersect the two frames so only counties present in both remain.
texasSTEC3 = texasSTEC[texasSTEC['FIPS'].isin(texasBroilers2['FIPS'])]
texasBroilers3 = texasBroilers2[texasBroilers2['FIPS'].isin(texasSTEC3['FIPS'])]

# Both frames are indexed by FIPS. Read x and y through the SAME list of labels so that
# x_vals[i] and y_vals[i] always belong to the same county; reading each frame in its own
# row order pairs up different counties whenever the two tables are sorted differently.
# (Assumes each FIPS occurs once per frame.)
paired_fips = list(texasBroilers3['FIPS'])
x_vals = np.asarray(texasBroilers3.loc[paired_fips, 'VALUE']).astype('float64')
y_vals = np.asarray(texasSTEC3.loc[paired_fips, 'AGG_RATE']).astype('float64')

In [None]:
def errors(m, b, xs, ys):
    """Return the residuals ``ys - (m*xs + b)`` of the line with slope ``m`` and intercept ``b``."""
    predicted = m*xs + b
    return ys - predicted

def MSE(m, b, xs, ys):
    """Return the mean squared error of the line ``y = m*x + b`` on the points ``(xs, ys)``.

    ``xs`` and ``ys`` are numpy arrays of equal length. The squared residuals are summed
    with ``np.sum`` (vectorised) rather than the built-in ``sum``, which would iterate the
    array element by element in Python.
    """
    residuals = errors(m, b, xs, ys)
    return np.sum(residuals**2)/xs.size

def dMSE_dm(m, b, xs, ys):
    """Return the partial derivative of the MSE with respect to the slope ``m``.

    Computed as ``-2 * sum(xs * residuals) / n``. Uses ``np.sum`` (vectorised) instead of
    the built-in ``sum``, since this is evaluated once per gradient-descent step.
    """
    residuals = errors(m, b, xs, ys)
    return -2*np.sum(xs*residuals)/xs.size
    
def dMSE_db(m, b, xs, ys):
    """Return the partial derivative of the MSE with respect to the intercept ``b``.

    Computed as ``-2 * sum(residuals) / n``. Uses ``np.sum`` (vectorised) instead of the
    built-in ``sum``, since this is evaluated once per gradient-descent step.
    """
    residuals = errors(m, b, xs, ys)
    return -2*np.sum(residuals)/xs.size

def RMSE(m, b, xs, ys):
    """Return the root mean squared error: the square root of ``MSE(m, b, xs, ys)``."""
    mean_squared = MSE(m, b, xs, ys)
    return math.sqrt(mean_squared)

In [None]:
def step(m, b, x_values, y_values, learn_rate):
    """Take one gradient-descent step on the MSE and return the updated ``(m, b)``.

    Both gradients are evaluated at the current ``(m, b)`` before either parameter moves.
    """
    gradient_m = dMSE_dm(m, b, x_values, y_values)
    gradient_b = dMSE_db(m, b, x_values, y_values)
    return (m - learn_rate * gradient_m, b - learn_rate * gradient_b)

def learn(m_initial, b_initial, x_values, y_values, learn_rate, steps):
    """Run ``steps`` gradient-descent steps from ``(m_initial, b_initial)``.

    Returns the final ``(m, b)`` pair; with ``steps == 0`` the initial pair is returned.
    """
    params = (m_initial, b_initial)
    for _ in range(steps):
        params = step(params[0], params[1], x_values, y_values, learn_rate)
    return (params[0], params[1])

In [None]:
# Step-size scale: the inverse of 2*max(x)^2, which bounds the second derivative of the
# MSE with respect to m (2*mean(x^2)). The learning rate is a tenth of that.
largest_x = max(x_vals)
scale = 1/(2*largest_x**2)
m, b = learn(1, 25, x_vals, y_vals, scale*0.1, 100000)

In [None]:
(m,b)

In [None]:
def line_trace(m,b, max_val):
    """Build a plotly 'lines' trace dict for ``y = m*x + b`` on 100 points from 0 to ``max_val``."""
    sample_xs = np.linspace(0.0, max_val, 100)
    sample_ys = m*sample_xs + b
    return dict(x=list(sample_xs), y=list(sample_ys), mode='lines', name='Best Fit Line')

In [None]:
# Scatter of the per-county points plus the fitted line drawn out to the largest x value.
trace = dict(x=x_vals, y=y_vals, mode='markers', name='Counties')
best_fit = line_trace(m, b, max(x_vals))
figure2 = {
    'data': [trace, best_fit],
    'layout': {'title': 'STEC Incidence Rate versus Broiler Chicken Population'},
}
plot(figure2)

This is surprising! STEC cases seem to be *negatively* correlated with broiler chicken population. We can check that this correlation is not especially weak by computing the normalized root-mean-square error of the fit (the RMSE divided by the range of the observed rates):

In [None]:
RMSE(m, b, x_vals, y_vals) / (max(y_vals) - min(y_vals))

## What is going on here?

It appears that the correlation between STEC and broiler chicken population runs in the opposite direction to the correlation between STEC and cattle population. This is not what we expected, so we should try to find an explanation. There might be some third factor which influences both cattle and chicken populations. It may be that chicken population is negatively correlated with the presence of another factor which more strongly influences STEC rates (for example, the presence of another type of animal which is a more favorable carrier of STEC). Ultimately, this indicates a need for multivariate regression. It is hard to draw a meaningful conclusion about the effect chickens have on STEC incidence without controlling for other variables.

We can also check to see if there is a relationship between STEC rates and layer chickens:

In [None]:
# Intersect the two frames so only counties present in both remain.
texasSTEC4 = texasSTEC[texasSTEC['FIPS'].isin(texasLayers2['FIPS'])]
texasLayers4 = texasLayers2[texasLayers2['FIPS'].isin(texasSTEC4['FIPS'])]

# Both frames are indexed by FIPS. Read x and y through the SAME list of labels so that
# x_vals2[i] and y_vals2[i] always belong to the same county; reading each frame in its
# own row order pairs up different counties whenever the two tables are sorted differently.
# (Assumes each FIPS occurs once per frame.)
paired_fips2 = list(texasLayers4['FIPS'])
x_vals2 = np.asarray(texasLayers4.loc[paired_fips2, 'VALUE']).astype('float64')
y_vals2 = np.asarray(texasSTEC4.loc[paired_fips2, 'AGG_RATE']).astype('float64')

In [None]:
# Step-size scale for the layer data: the inverse of 2*max(x)^2, which bounds the second
# derivative of the MSE with respect to m (2*mean(x^2)).
scale2 = 1/(2*max(x_vals2)**2)
# Use scale2 here: the learning rate must be derived from the layer x-values, not from
# `scale`, which was computed from the broiler data.
(m2,b2) = learn(1, 25, x_vals2, y_vals2, scale2*0.1, 100000)

In [None]:
# Scatter of the per-county points plus the fitted line drawn out to the largest x value.
trace2 = dict(x=x_vals2, y=y_vals2, mode='markers', name='Counties')
best_fit2 = line_trace(m2, b2, max(x_vals2))
figure3 = {
    'data': [trace2, best_fit2],
    'layout': {'title': 'STEC Incidence Rate versus Layer Chicken Population'},
}
plot(figure3)

In [None]:
RMSE(m2, b2, x_vals2, y_vals2) / (max(y_vals2) - min(y_vals2))

## Conclusion

We see here a somewhat weaker correlation, this time positive. All else being equal, we would expect the direction of the relationship between chickens and STEC rate not to depend on whether those chickens were raised for meat or for eggs, yet our simple model suggests that it does. This indicates that our model is inadequate and that there are hidden variables it does not account for.