In [19]:
import csv
import math
import plotly
import numpy
import sklearn.decomposition

In [2]:
def getPopByFips(path="PEP_2015_PEPANNRES.csv"):
    """Map each county FIPS code to the integer in its ``respop72015`` column.

    Args:
        path: CSV file to read. Defaults to the file name this notebook has
            always used, so existing zero-argument calls behave as before.

    Returns:
        dict of ``int(row['GEO.id2'])`` -> ``int(row['respop72015'])``, one
        entry per data row.

    Raises:
        KeyError: a data row lacks one of the two columns.
        ValueError: a value in either column is not integer text.
    """
    with open(path) as fh:
        reader = csv.DictReader(fh)
        # The file carries a second, descriptive header row directly under the
        # real one. Built-in next() works on Python 2.6+ and 3 (the old
        # reader.next() method is Python-2-only); the default avoids a
        # StopIteration on a header-only file.
        next(reader, None)
        return {int(row['GEO.id2']): int(row['respop72015']) for row in reader}

In [3]:
def getNameByFips(path="PEP_2015_PEPANNRES.csv"):
    """Map each county FIPS code to its ``GEO.display-label`` text.

    Args:
        path: CSV file to read. Defaults to the file name this notebook has
            always used, so existing zero-argument calls behave as before.

    Returns:
        dict of ``int(row['GEO.id2'])`` -> ``row['GEO.display-label']``. The
        label is returned exactly as the csv module yields it (a byte string
        on Python 2).

    Raises:
        KeyError: a data row lacks one of the two columns.
        ValueError: ``GEO.id2`` is not integer text.
    """
    with open(path) as fh:
        reader = csv.DictReader(fh)
        # Skip the second, descriptive header row. Built-in next() replaces
        # the Python-2-only reader.next(); the default keeps a header-only
        # file from raising StopIteration.
        next(reader, None)
        return {int(row['GEO.id2']): row['GEO.display-label'] for row in reader}

In [4]:
# Column names read from the topic CSVs. The list order also fixes the column
# order of any matrix built by iterating over `topics`.
topics = [
    u'The Economy',
    u'Religion',
    u'Racial Issues',
    u'Homeland Security & Terrorism',
    u'Government Ethics',
    u'Taxes',
    u'Iraq, Syria & ISIS',
    u'LGBT Issues',
    u'Crime & Criminal Justice',
    u'Education',
    u'Immigration',
    u'Campaign Finance',
    u'Healthcare',
    u'The Environment, EPA & Energy Policy',
    u'Guns',
    u'Judicial Issues',
    u'Government Assistance',
    u'Russia & Ukraine',
    u'Abortion',
    u'Israel & Palestine',
    u'North Korea',
    u'Voting Law',
    u'China',
    u'Gender Equality',
    u'Regulations & The Size of Government',
    u'Iran',
    u'Government Spending',
    u'Intelligence Gathering, The NSA & Privacy',
    u'Trade',
    u'Drug Policy',
    u'Cuba',
    u'Public Health',
]

In [5]:
def getTopicsByFips(path="FB_Policy_Issues_Conversation_by_County_Feb_May.csv"):
    """Read the per-county topic table into a nested dict.

    Args:
        path: CSV file to read. Defaults to the file name this notebook has
            always used, so existing zero-argument calls behave as before.

    Returns:
        dict of ``int(row['county_fips'])`` -> ``{topic: float}``, with one
        inner entry for every name in the module-level ``topics`` list.

    Raises:
        KeyError: the file lacks ``county_fips`` or one of the topic columns.
        ValueError: a cell holds text that is neither empty nor numeric.
    """
    with open(path) as fh:
        by_fips = {}
        for row in csv.DictReader(fh):
            # An empty (or missing) cell counts as 0 rather than failing float('').
            by_fips[int(row['county_fips'])] = {t: float(row[t] or '0') for t in topics}
        return by_fips

In [6]:
def getNationalTopics(path="FB_Policy_Issues_Conversation_by_National_Feb_May.csv"):
    """Read the national topic values from the first data row of the CSV.

    Args:
        path: CSV file to read. Defaults to the file name this notebook has
            always used, so existing zero-argument calls behave as before.

    Returns:
        dict of topic -> float, with one entry for every name in the
        module-level ``topics`` list. Rows after the first are ignored.

    Raises:
        StopIteration: the file has no data row.
        KeyError: a topic column is missing.
        ValueError: a cell holds text that is neither empty nor numeric.
    """
    with open(path) as fh:
        reader = csv.DictReader(fh)
        # Built-in next() replaces the Python-2-only reader.next() method.
        usa = next(reader)
        # An empty cell counts as 0 rather than failing float('').
        return {t: float(usa[t] or '0') for t in topics}

In [80]:
def l2norm(a,b):
    """Return the Euclidean distance between two topic mappings.

    Both ``a`` and ``b`` must be indexable by every name in the module-level
    ``topics`` list; any other keys they hold are ignored.
    """
    squared_total = 0
    for topic in topics:
        squared_total += math.pow(a[topic] - b[topic], 2)
    return math.sqrt(squared_total)

In [8]:
# Load both census lookups once; each is a dict keyed by integer FIPS code.
pop_by_fips = getPopByFips()
name_by_fips = getNameByFips()

In [9]:
# Per-county topic values: {fips: {topic: float}}.
topics_by_fips = getTopicsByFips()

In [10]:
# National topic values ({topic: float}); the baseline counties are compared against.
national_topics = getNationalTopics()

In [81]:
# One (population, distance) pair per county: the county's population and the
# l2norm distance between its topic values and the national ones.
# .items() replaces the Python-2-only .iteritems() and yields the same pairs.
# Raises KeyError if a county in the topic table is absent from pop_by_fips.
points = [(pop_by_fips[fips], l2norm(county_topics, national_topics))
          for fips, county_topics in topics_by_fips.items()]

In [82]:
# Scatter of county population (log x-axis) against the county's l2norm
# distance from the national topic values. A title and axis labels are set so
# the figure can be read on its own.
data = [plotly.graph_objs.Scatter(
    x=[point[0] for point in points],
    y=[point[1] for point in points],
    mode='markers'
)]
layout = {
    'title': 'County topic mix vs. national mix, by population',
    'xaxis': {
        'type': 'log',
        'title': 'County population'
    },
    'yaxis': {
        # Fixed linear range: counties farther than 20 from the national
        # values fall outside the visible area.
        'range': [0, 20],
        'title': 'L2 distance from national topic values'
    }
}

fig = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.plotly.iplot(fig)

In [107]:
# FIPS code(s) whose display label contains "Harlan County, Kentucky".
# .items() replaces the Python-2-only .iteritems() and yields the same pairs.
[fips for fips, name in name_by_fips.items() if "Harlan County, Kentucky" in name]

[21095]

In [99]:
# Names of the ten counties whose topic values lie closest (by l2norm) to those
# of FIPS 12086. The reference county itself is at distance 0, so it sorts first.
reference_topics = topics_by_fips[12086]
fips_by_distance = sorted(
    topics_by_fips,
    key=lambda fips: l2norm(topics_by_fips[fips], reference_topics),
)
[name_by_fips[fips] for fips in fips_by_distance[:10]]

['Miami-Dade County, Florida',
 'Broward County, Florida',
 'Taliaferro County, Georgia',
 'Logan County, North Dakota',
 'Hendry County, Florida',
 'Mower County, Minnesota',
 'Hillsborough County, Florida',
 'Monroe County, Florida',
 'Osceola County, Florida',
 'Decatur County, Georgia']

In [108]:
# For FIPS 21095 (the code the "Harlan County, Kentucky" lookup returns), the
# ratio of each topic's county value to the national value, smallest first.
harlan_topics = topics_by_fips[21095]
topic_ratios = [(harlan_topics[t] / national_topics[t], t) for t in topics]
topic_ratios.sort()
topic_ratios

[(0.4466666666666667, u'Gender Equality'),
 (0.4545454545454546, u'Public Health'),
 (0.63, u'Intelligence Gathering, The NSA & Privacy'),
 (0.6633663366336634, u'Voting Law'),
 (0.7053726169844022, u'Racial Issues'),
 (0.712, u'Cuba'),
 (0.7165775401069518, u'Iran'),
 (0.7633928571428571, u'Healthcare'),
 (0.7644927536231885, u'Campaign Finance'),
 (0.8373751783166905, u'Taxes'),
 (0.8985313751668892, u'Homeland Security & Terrorism'),
 (0.927404718693285, u'Judicial Issues'),
 (0.9386792452830188, u'The Environment, EPA & Energy Policy'),
 (0.9414893617021277, u'Russia & Ukraine'),
 (0.9473684210526316, u'Government Spending'),
 (0.9487603305785125, u'Iraq, Syria & ISIS'),
 (1.0076687116564418, u'Crime & Criminal Justice'),
 (1.0358592692828148, u'The Economy'),
 (1.0597014925373134, u'Trade'),
 (1.0762331838565022, u'Religion'),
 (1.1081081081081081, u'Israel & Palestine'),
 (1.1111111111111112, u'China'),
 (1.1115879828326178, u'Education'),
 (1.1164383561643836, u'Immigration'),
 

In [84]:
# Keep only counties whose l2norm distance from the national topic values is
# below 20 (the same bound used as the y-axis limit of the scatter plot).
# .items() replaces the Python-2-only .iteritems() and yields the same pairs.
non_outliers = [fips for fips, county_topics in topics_by_fips.items()
                if l2norm(county_topics, national_topics) < 20]
# One row per retained county (in non_outliers order), one column per topic
# (in topics order).
share_array = numpy.array([[topics_by_fips[fips][topic] for topic in topics]
                           for fips in non_outliers])

In [85]:
# Reduce the county-by-topic matrix to two principal components and plot the
# counties in that plane, with Alabama counties in red and the rest in blue.
pca = sklearn.decomposition.PCA(n_components=2)
pcad = pca.fit_transform(share_array)

# Hover labels must be text. Names read on Python 2 are byte strings and get
# decoded (undecodable bytes dropped); names that are already text are used
# as-is instead of failing on a missing .decode().
hover_names = [
    name.decode('utf-8', 'ignore') if isinstance(name, bytes) else name
    for name in (name_by_fips[fips] for fips in non_outliers)
]

data = [plotly.graph_objs.Scatter(
    x=pcad[:, 0],
    y=pcad[:, 1],
    text=hover_names,
    marker={'color': ["red" if "Alabama" in name_by_fips[fips] else "blue" for fips in non_outliers]},
    mode='markers'
)]
layout = {
    'title': 'Counties projected onto the first two principal components',
    'hovermode': 'closest',
    'xaxis': {'title': 'Principal component 1'},
    'yaxis': {'title': 'Principal component 2'}
}

fig = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.plotly.iplot(fig)

In [67]:
# Probe vectors in PCA component space. inverse_transform expects one value per
# fitted component, so size them from the fitted model instead of a hard-coded
# 32 (which only matches a PCA that kept every component, and fails against
# the 2-component fit above). This also drops the Python-2-only xrange.
n_fitted = pca.components_.shape[0]
o = [0] * n_fitted   # origin
x = [0] * n_fitted   # unit step along component 1 (swap into the line below to inspect it)
y = [0] * n_fitted   # unit step along component 2
x[0] = 1
y[1] = 1  # IndexError if the PCA was fitted with a single component
# Subtracting the image of the origin leaves only the change caused by the unit
# step, i.e. that component's direction expressed in the original topic columns.
pca.inverse_transform([y]) - pca.inverse_transform([o])

array([[ 0.3780921 , -0.22779477, -0.73293213, -0.0406153 ,  0.32654345,
         0.14169389, -0.00181568, -0.10488187, -0.15182312, -0.03477542,
         0.03354955,  0.19286954,  0.09108645,  0.13297806,  0.1264966 ,
         0.04174307,  0.0778951 ,  0.01886383,  0.03666358, -0.02532336,
        -0.01163777,  0.01440576,  0.06612379, -0.02620954,  0.05642378,
        -0.04052577,  0.02025383,  0.01257665,  0.02814426,  0.03363559,
        -0.03405212,  0.00304777]])

In [110]:
# Sum of all topic values for each county, to see how close the totals are to
# one another.
county_totals = []
for county_topics in topics_by_fips.values():
    county_totals.append(sum(county_topics.values()))
county_totals

[150.8,
 146.86,
 149.34,
 140.74,
 149.82999999999998,
 141.40999999999997,
 148.06000000000003,
 149.08000000000004,
 151.44000000000003,
 147.38,
 158.3,
 146.05,
 141.35000000000002,
 148.58999999999997,
 146.85000000000002,
 144.33000000000004,
 147.28999999999994,
 188.88000000000002,
 156.36999999999998,
 149.82999999999998,
 141.39000000000007,
 146.38000000000005,
 143.59000000000003,
 150.56000000000006,
 146.70000000000002,
 143.57,
 148.58000000000004,
 144.79000000000002,
 144.70000000000005,
 153.99999999999997,
 142.32999999999998,
 145.47999999999996,
 149.57000000000002,
 151.04,
 150.37999999999997,
 186.00999999999996,
 142.06,
 147.32999999999998,
 148.50000000000003,
 157.48,
 145.92000000000002,
 148.65999999999997,
 148.33999999999995,
 138.01,
 146.82,
 144.06000000000003,
 142.10999999999999,
 146.96,
 166.66000000000003,
 146.26,
 141.96,
 139.68999999999997,
 145.03000000000003,
 149.61999999999998,
 145.11000000000004,
 144.56000000000003,
 141.6200000000000