In [None]:
import pandas as pd
import numpy as np

In [None]:
# read the voter address lat/long table from a tab-separated csv
voterlatlong = pd.read_csv('voter_address_lat_long.csv', sep='\t')

In [None]:
# preview the first rows of the freshly loaded table
voterlatlong.head()
# (kept for reference) look up a single voter by id:
# voterlatlong[voterlatlong.VOTER_ID==300427492]

In [None]:
# Only the year of each move is kept: take the text before the first '-'
# in every 'dates' string, convert it to a number (downcast to the smallest
# integer dtype that fits), and then discard the raw 'dates' column.
year_text = voterlatlong['dates'].apply(lambda stamp: stamp.split('-')[0])
voterlatlong['moveyear'] = pd.to_numeric(year_text, downcast='integer')
voterlatlong = voterlatlong.drop(columns=['dates'])

In [None]:
# preview the table now that 'dates' has been replaced by 'moveyear'
voterlatlong.head()

In [None]:
# keep one row per (voter, address) pair: when an address repeats for a voter,
# only its first occurrence in the current row order is kept
# NOTE(review): "first" is positional, not chronological -- presumably the file
# is already ordered by date; confirm, or sort by 'moveyear' beforehand
voterlatlong = voterlatlong.drop_duplicates(subset=['VOTER_ID', 'addresses'],keep='first')

In [None]:
# Coerce both coordinate columns to (downcast) floats; any entry that cannot
# be parsed as a number becomes NaN instead of raising.
for coord_col in ('Lat', 'Long'):
    voterlatlong[coord_col] = pd.to_numeric(
        voterlatlong[coord_col], downcast='float', errors='coerce'
    )

In [None]:
# drop every row that has a NaN in ANY column
# (this includes coordinates that could not be coerced to a number above)
voterlatlong = voterlatlong.dropna(how='any')

In [None]:
# need to get age: load the voter info table (tab-separated) and preview it
voterages = pd.read_csv('voter_info.csv', sep='\t')
voterages.head()

In [None]:
# only the voter id and the 'year' column are needed
# ('year' is subtracted from 'moveyear' further down to get the age at each move)
voterages = voterages[['VOTER_ID', 'year']]

In [None]:
# Attach each voter's 'year' to the address rows: a left join on VOTER_ID
# that keeps voterlatlong's own index and row order.
ages_by_voter = voterages.set_index('VOTER_ID')
voterlatlong = voterlatlong.join(ages_by_voter, on='VOTER_ID')
voterlatlong.head()

In [None]:
# keep only voters that have at least 2 rows: duplicated(keep=False) marks every
# row whose VOTER_ID occurs more than once, so voters with a single row are removed
voterlatlong = voterlatlong[voterlatlong.duplicated(subset=['VOTER_ID'], keep=False)]

In [None]:
#!/usr/bin/env python

# Haversine formula example in Python
# Author: Wayne Dyck

import math

def f(x):
    """Return ``math.atan2(x[0], x[1])`` for an indexable pair ``x``.

    ``x[0]`` is passed as the first (y) argument of ``atan2`` and ``x[1]`` as
    the second (x) argument. The result is an angle in radians.
    """
    y_part, x_part = x[0], x[1]
    angle = math.atan2(y_part, x_part)
    return angle

def distance(lat1, lon1, destination):
    """Great-circle (haversine) distance in miles from point(s) to ``destination``.

    Parameters
    ----------
    lat1, lon1 : pandas.Series, array-like or float
        Latitude(s) and longitude(s) of the origin point(s), in degrees.
        They must have the same length and are paired positionally.
    destination : sequence of two floats
        ``(lat2, lon2)`` of the destination point, in degrees.

    Returns
    -------
    pandas.Series, numpy.ndarray or numpy.float64
        Distance(s) in miles. When ``lat1`` is a Series the result is a
        Series carrying ``lat1``'s index; otherwise a plain NumPy value.
        NaN coordinates propagate as NaN.
    """
    lat2, lon2 = destination
    radius = 6371  # km
    km_to_miles = 0.621371  # 1 km = 0.621371 miles

    # Promote to float64 up front: the coordinate columns may have been
    # downcast to float32, which is too coarse for short distances.
    lat1_deg = np.asarray(lat1, dtype='float64')
    lon1_deg = np.asarray(lon1, dtype='float64')

    dlat = np.radians(lat2 - lat1_deg)
    dlon = np.radians(lon2 - lon1_deg)

    # haversine term, computed on whole arrays instead of element-wise
    # .apply(math.*) calls and a row-wise DataFrame.apply
    a = (np.sin(dlat / 2) ** 2
         + np.cos(np.radians(lat1_deg)) * np.cos(np.radians(lat2))
         * np.sin(dlon / 2) ** 2)
    # rounding can push `a` marginally outside [0, 1], which would make the
    # square roots below invalid; NaN passes through clip unchanged
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    d = radius * c * km_to_miles

    if isinstance(lat1, pd.Series):
        return pd.Series(d, index=lat1.index)
    return d

In [None]:
# find distance from center (burnside bridge)
# center is [latitude, longitude] in degrees
center = [45.523049 , -122.667385]

# calculate distance (miles) from center for each address row
voterlatlong['distcenter'] = distance(voterlatlong['Lat'], voterlatlong['Long'], center)

In [None]:
# list the addresses that lie less than 0.1 miles from the center
voterlatlong[voterlatlong['distcenter']<0.1].addresses

In [None]:
# throw out rows that are not more than 0.1 miles from the center
# (original author's note: this filter is applied to the wrong column)
# NOTE(review): presumably the intended target is the per-move distance
# ('consecdist'), which is filtered with the same 0.1 threshold later on -- confirm
# NOTE: the .min() below is neither displayed nor stored, because it is not the
# last expression of the cell
voterlatlong['distcenter'].min()
voterlatlong = voterlatlong[voterlatlong['distcenter']>0.1]
voterlatlong.head()

In [None]:
# need the following:
# column for the displacement from previous address
# column for the angle from the previous to new address
# column for x coord based on previous address (trying to answer whether people are moving away from or towards the center)
# column for y coord based on previous address (trying to answer whether people are moving away from or towards the center)
# column for age at the time of move

# age at the time of the move = year of the move minus the voter's 'year' column
# (the column joined in from voter_info.csv)
# this is correct, there are in fact some individuals with ages below 18 at the time of relocation
voterlatlong['relocationage'] = voterlatlong.moveyear - voterlatlong.year
# voterlatlong[voterlatlong.relocationage<18]

In [None]:
# preview the table with the new 'relocationage' column
voterlatlong.head()

In [None]:
# Express every address as a latitude/longitude offset from the city center
# (burnside bridge), so the center itself sits at (0, 0).
center_lat, center_long = center[0], center[1]
voterlatlong['cLat'] = voterlatlong['Lat'] - center_lat
voterlatlong['cLong'] = voterlatlong['Long'] - center_long
voterlatlong.head()

In [None]:
# df has age at move, dist away or toward center, and angle
# (it starts out as an empty placeholder; the rows are put into it further down)
df = pd.DataFrame()
# distinct voter ids; the bare name on the last line displays the array
voters = voterlatlong.VOTER_ID.unique()
voters

In [None]:
from ipywidgets import FloatProgress
from IPython.display import display

In [None]:
# spot check on a single voter (the second id in `voters`): show that voter's
# rows ordered by move year
# the sorted frame is only displayed; `temp` itself stays in its original order
temp = voterlatlong[voterlatlong.VOTER_ID==voters[1]]
temp.sort_values(by='moveyear', ascending=True)

In [None]:
# Build `df`: every address row of `voterlatlong` plus two derived columns.
#   consecdist - change in distance-from-center (miles) relative to the same
#                voter's previous address in move-year order; positive means
#                the voter moved farther from the center, negative means
#                closer, NaN marks each voter's earliest row
#   radians    - angle of the address around the center, arctan2(cLat, cLong)
#
# This replaces a per-voter Python loop that filtered the whole frame once per
# voter and grew `df` with DataFrame.append (quadratic, and removed in
# pandas 2.0). The progress bar is gone because the grouped version needs no
# loop.

# stable sort, so moves that share a moveyear keep their original relative
# order and the diff below is deterministic
chronological = voterlatlong.sort_values(by='moveyear', ascending=True, kind='mergesort')
consecdist = chronological.groupby('VOTER_ID', sort=False)['distcenter'].diff()

# same row layout the loop produced: voters in order of first appearance,
# each voter's rows in their original (unsorted) order
first_seen_rank = pd.factorize(voterlatlong['VOTER_ID'])[0]
df = voterlatlong.iloc[np.argsort(first_seen_rank, kind='mergesort')].copy()

# column assignment aligns on the index, so every diff lands on its own row
df['consecdist'] = consecdist
df['radians'] = np.arctan2(df['cLat'], df['cLong'])

In [None]:
# preview the per-move table ('consecdist' and 'radians' are the new columns)
df.head()

In [None]:
# Turn the (consecdist, radians) pair into cartesian components:
# x = consecdist * cos(angle), y = consecdist * sin(angle).
df = df.assign(
    x=df['consecdist'] * np.cos(df['radians']),
    y=df['consecdist'] * np.sin(df['radians']),
)

In [None]:
# preview the table with the new 'x' and 'y' columns
df.head()

In [None]:
# relabel 'year' (the column joined in from voter_info.csv) as 'birthyear'
df = df.rename(columns={'year':'birthyear'})

In [None]:
# dump the full per-move table into a tab-separated csv (index included),
# because it took way too long to compute
df.to_csv('move_dist_angle_df.csv', sep='\t')

In [None]:
# preview the table that was just written to csv
df.head()

In [None]:
# Narrow the table down to the columns used by the analysis below.
analysis_columns = [
    'relocationage',
    'consecdist',
    'x',
    'y',
    'cLat',
    'cLong',
    'moveyear',
    'radians',
]
df2 = df[analysis_columns]
df2.head()

In [None]:
# drop rows with a NaN in any column; this removes each voter's earliest row,
# whose 'consecdist' is NaN because there is no previous address to diff against
df2 = df2.dropna()
df2.head()

In [None]:
# convert the angle from radians to degrees, wrapped into the 0-360 range:
# adding 2*pi first shifts the arctan2 output (between -pi and pi) to positive values
df2['degrees'] = np.mod((df2.radians + 2*np.pi)*180/np.pi, 360)

In [None]:
# 'radians' is superseded by 'degrees', so drop it
df2 = df2.drop('radians', axis=1)

In [None]:
# preview the analysis subset ('degrees' in place of 'radians')
df2.head()

In [None]:
# dump the analysis subset into its own tab-separated csv (index column not written)
df2.to_csv('subset_move_dist_angle_df.csv', sep='\t', index=False)

In [None]:
# reload the saved subset from csv; df2 gets a fresh default index here
df2 = pd.read_csv('subset_move_dist_angle_df.csv', sep='\t')

In [None]:
# Discard moves whose distance-from-center changed by less than 0.1 miles
# in either direction.
mask = df2['consecdist'].abs() < 0.1
df2 = df2.loc[~mask]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Label every move with an age bracket, identified by the bracket's lower bound:
# [18, 26), [26, 33), [33, 40), [40, 50), [50, 150).
# Ages that fall outside every bracket (e.g. below 18) keep the default label 0.
df = df2.copy()
df = df.dropna(how='any')
age_groups = [18, 26, 33, 40, 50, 150]
df['age_group'] = 0
for lower, upper in zip(age_groups[:-1], age_groups[1:]):
    in_bracket = (df.relocationage >= lower) & (df.relocationage < upper)
    df.loc[in_bracket, 'age_group'] = lower

# The plotting cells below read `df2.age_group`, so the labels have to exist on
# df2 as well. Previously they were written only to the copy `df`, and df2
# (re-read from csv) had no such column, so those cells failed on a fresh run.
# assign() aligns on the index and returns a new frame.
df2 = df2.assign(age_group=df['age_group'])

In [None]:
# preview df2 before plotting
df2.head()

In [None]:
# Histogram and summary of 'consecdist' (change in distance from the center,
# in miles) for each age group. One loop replaces five copy-pasted cells; the
# per-group histogram ranges are the ones those cells used.
consecdist_hist_ranges = [
    (18, [-15, 15]),
    (26, [-15, 15]),
    (33, [-5, 5]),
    (40, [-5, 5]),
    (50, [-15, 15]),
]
for age_group, hist_range in consecdist_hist_ranges:
    temp = df2.loc[df2.age_group == age_group, 'consecdist']
    plt.hist(temp, bins=50, range=hist_range)
    # the figures now share one output area, so label each one
    plt.title('consecdist, age group {}'.format(age_group))
    plt.show()
    # share of moves that ended closer to (negative) / farther from (positive)
    # the center; .mean() of a boolean Series is the fraction of True values
    # and gives NaN rather than ZeroDivisionError for an empty group
    print('negative: {}'.format((temp < 0).mean()))
    print('positive: {}'.format((temp > 0).mean()))
    print(temp.describe())

In [None]:
# Histogram of 'degrees' per age group, split by the sign of 'consecdist':
# first the moves that ended farther from the center (consecdist > 0), then
# the moves that ended closer (consecdist < 0). One loop replaces eight
# copy-pasted cells and produces the same plots in the same order.
# The flag says whether describe() is printed; the original cells only did so
# for age group 50.
# NOTE(review): age group 40 has no 'degrees' plot in the original cells --
# presumably an omission; add (40, False) here if it is wanted.
degrees_plot_groups = [(18, False), (26, False), (33, False), (50, True)]
for age_group, print_summary in degrees_plot_groups:
    in_group = df2.age_group == age_group
    directions = [
        ('consecdist > 0', df2.consecdist > 0),
        ('consecdist < 0', df2.consecdist < 0),
    ]
    for direction_label, direction_mask in directions:
        temp = df2.loc[in_group & direction_mask, 'degrees']
        plt.hist(temp, bins=50)
        # the figures now share one output area, so label each one
        plt.title('degrees, age group {}, {}'.format(age_group, direction_label))
        plt.show()
        if print_summary:
            print(temp.describe())