### Extract Features from non-Tweets

#### Author: Lauren Thomas
#### Created: 16/07/2021
#### Last updated: 18/07/2021

###### File description: This file extracts features for the ML model from the non-Twitter data collected and pre-processed in clean_data


In [1]:
import os
import pickle

import pandas as pd

from os import sep
from patsy import dmatrices
import statsmodels.api as sm

C:\Users\ltswe\anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\ltswe\anaconda3\lib\site-packages\numpy\.libs\libopenblas.QVLO2T66WEPI7JZ63PS3HMOHFEY472BC.gfortran-win_amd64.dll


In [2]:
cwd = f"C:{sep}Users{sep}ltswe{sep}Dropbox{sep}Oxford{sep}Thesis"
data_dir = "D:"

In [3]:
def drop_non_nyc(raw_df):
    """Return only the rows of ``raw_df`` that belong to New York City.

    A row is kept when its ``STATE`` value is ``'New York'`` and its
    ``COUNTY`` value is one of the five counties listed below. The
    returned frame has a fresh 0..n-1 index; ``raw_df`` is not modified.
    """
    nyc_counties = ['Bronx County', 'New York County', 'Kings County',
                    'Queens County', 'Richmond County']
    in_state = raw_df['STATE'] == 'New York'
    in_city_county = raw_df['COUNTY'].isin(nyc_counties)
    return raw_df[in_state & in_city_county].reset_index(drop=True)

In [5]:
# Bring in the socioeconomic features & total pop counts from 2010-2014

# Import 2010-2014 data estimates 
est_1014_raw = pd.read_csv(f"{cwd}{sep}data{sep}2010_2014_estimates.csv", encoding = 'latin-1')

In [6]:
# Get rid of data outside NYC 
est_1014 = drop_non_nyc(est_1014_raw)

# Rename var to total pop
est_1014['total_pop_1014'] = est_1014['ABA1E001']

# Fix fips code
# Make a more understandable fips code (11 digits instead of 13)
def gen_fips_from_gisjoin(x):
    """Build the shortened tract code from a GISJOIN string ``x``.

    The result is the fixed state prefix '36' followed by characters 4-6
    of ``x`` (county code) and characters 8-13 of ``x`` (census tract code).
    """
    county_code = x[4:7]
    tract_code = x[8:14]
    return "".join(("36", county_code, tract_code))

est_1014['LocationCT'] = est_1014['GISJOIN'].apply(lambda x: gen_fips_from_gisjoin(x))

# Keep only relevant var (fips code and total pop)
est_1014 = est_1014[['LocationCT', 'total_pop_1014']]

In [7]:
# Load part 2 of the 2006-2010 estimates, which supplies the columns used for unemployment
est_0610_raw = pd.read_csv(f"{cwd}{sep}data{sep}2006_2010_estimates_p2.csv", encoding='latin-1')

# Restrict to the NYC counties
est_0610 = drop_non_nyc(est_0610_raw)

# Unemployment share = (sum of the numerator columns) / (sum of the denominator columns).
# Columns are named 'J6QE' + a zero-padded three-digit code; the codes are listed in the
# same order in which the columns were originally added together.
# NOTE(review): presumably the numerator codes are the "unemployed" cells and the
# denominator codes the "in labor force" cells of the source table - confirm against the codebook.
unemployed_codes = (8, 15, 22, 29, 36, 43, 50, 57, 64, 76, 71, 81, 86,
                    94, 101, 108, 115, 122, 129, 136, 143, 150, 157, 162, 167, 172)
labor_force_codes = (4, 11, 18, 25, 32, 39, 46, 53, 60, 67, 74, 79, 84,
                     90, 97, 104, 111, 118, 125, 132, 139, 146, 153, 160, 165, 170)

# Built-in sum() adds the Series one by one with "+", so a missing value in any column
# still makes the whole row missing
unemployed_total = sum(est_0610[f'J6QE{code:03d}'] for code in unemployed_codes)
labor_force_total = sum(est_0610[f'J6QE{code:03d}'] for code in labor_force_codes)
est_0610['%_unemployed_0610'] = unemployed_total / labor_force_total

# Tract identifier shared with the other tables
est_0610['LocationCT'] = est_0610['GISJOIN'].apply(gen_fips_from_gisjoin)

# Keep only relevant var (fips code and unemployment share)
est_0610 = est_0610[['LocationCT', '%_unemployed_0610']]

In [8]:
# Load the 2006-2010 population table and keep the NYC rows only
pop_0610_raw = pd.read_csv(f"{cwd}{sep}data{sep}2006_2010_pop.csv", encoding='latin-1')
pop_0610 = drop_non_nyc(pop_0610_raw)

# Add the shared tract identifier and expose column JMAE001 under a readable name
pop_0610['LocationCT'] = pop_0610['GISJOIN'].apply(gen_fips_from_gisjoin)
pop_0610['total_pop_0610'] = pop_0610['JMAE001']

# Keep only the identifier and the population count
pop_0610 = pop_0610[['LocationCT', 'total_pop_0610']]

In [9]:
# Unpickle socioeconomic data
# The context manager closes the file handle as soon as the load has finished
with open(f'{data_dir}{sep}pickle{sep}all_socioeconomic.pickle', 'rb') as pickle_file:
    socio_df = pickle.load(pickle_file)

# Duplicate the tract identifier under the key name used by the other tables
socio_df['LocationCT'] = socio_df['fips_code']

In [10]:
# Combine socioecon df: join the four tables on the tract identifier
# (merge defaults to an inner join, so only tracts present in every table survive)
socio_merged = socio_df.merge(
    est_1014.merge(est_0610.merge(pop_0610, on='LocationCT'), on='LocationCT'),
    on='LocationCT')

# Pickle; the context manager guarantees the file is flushed and closed
with open(f'{data_dir}{sep}pickle{sep}socio_merged.pickle', 'wb') as pickle_file:
    pickle.dump(socio_merged, pickle_file)

#### 311 Data

In [None]:
# # Unpickle 311 data set
# nyc_311 = pickle.load(open(f"{data_dir}{sep}pickle{sep}nyc_311_gdf.pickle", 'rb'))


In [None]:
# # Replace the various noise categories with just Noise
# nyc_311 = nyc_311.replace(to_replace = ['Noise - Street/Sidewalk', 'Noise - Vehicle', 'Noise - Residential',
#                                        'Noise - Park', 'Noise - House of Worship'], 
#                           value=['Noise', 'Noise', 'Noise', 'Noise', 'Noise'])

In [None]:
# # More detailed info about the relevant complaints 
# nyc_311[(nyc_311['complaint_type'] == 'Noise') |
#                        (nyc_311['complaint_type'] == 'For Hire Vehicle Complaint') |
#                        (nyc_311['complaint_type'] == 'Bike Rack Condition')]['descriptor'].unique()

In [None]:
# # 311 data: Collapse # of noise/for hire vehicle/bike rack complaints into year-month-census tract
# nyc_311_count = nyc_311[(nyc_311['complaint_type'] == 'Noise') |
#                        (nyc_311['complaint_type'] == 'For Hire Vehicle Complaint') |
#                        (nyc_311['complaint_type'] == 'Bike Rack Condition')] \
#                         .groupby(['year', 'month', 'LocationCT', 'complaint_type']).count()['unique_key']\
#                         .reset_index().rename(columns={'unique_key': 'count'})
# # Pickle
# pickle.dump(nyc_311_count, open(f'{data_dir}{sep}pickle{sep}nyc_311_count.pickle', 'wb'))

In [19]:
# Load the pre-computed 311 complaint counts; the context manager closes the file handle
with open(f'{data_dir}{sep}pickle{sep}nyc_311_count.pickle', 'rb') as pickle_file:
    nyc_311_count = pickle.load(pickle_file)

In [20]:
# Keep only the complaint counts for 2011, 2012 and 2013 (year is compared as a string)
nyc_311_1113 = nyc_311_count[nyc_311_count['year'].isin(['2011', '2012', '2013'])].copy()


In [21]:
# Merge in population data to calculate per-capita complaints
# Only the 2010-2014 population (total_pop_1014) from socio_merged is needed here
pop = socio_merged[['LocationCT', 'total_pop_1014']]
nyc_311_1113 = nyc_311_1113.merge(pop, on = 'LocationCT')

In [22]:
nyc_311_1113['complaints_pc'] = nyc_311_1113['count']/nyc_311_1113['total_pop_1014']

In [23]:
# Create year-month var
nyc_311_1113['ym'] = nyc_311_1113.year.str.cat(nyc_311_1113.month.apply(lambda x: ("0"+x)[-2:]),sep='-')

# Make year-month into numbers and let it equal number of months since Jan 2006 (starting at 60)
nyc_311_1113['ym_num'] = nyc_311_1113['ym'].apply(lambda x: (int(x[0:4])-2006)*12+int(x[-2:])-1)


In [24]:
# Create list of all ym_num and tracts. Recall ym_num starts at 60 and runs through 95 (36 months = 3 years)
ym_num_list_311 = [i for i in range(60,96)]
tract_list_311 = nyc_311_1113['LocationCT'].unique()

In [25]:
#Any missing ym-tract-levels not in dataframe should be added in with a '0' count
# Create a function to do this with the given df
def replace_missing(df, tract_list, ym_num_list, crime_df = True):
    """Return ``df`` plus a zero-rate row for every absent tract/month/level combination.

    Parameters
    ----------
    df : DataFrame
        Must contain 'LocationCT', 'ym_num' and the level column
        ('level' when ``crime_df`` is true, 'complaint_type' otherwise).
    tract_list : iterable
        Tract identifiers that must be present in the result.
    ym_num_list : iterable
        Month numbers that must be present for every tract.
    crime_df : bool, default True
        True  -> levels FELONY / MISDEMEANOR / VIOLATION, rate column 'crime_pc'.
        False -> levels Noise / For Hire Vehicle Complaint / Bike Rack Condition,
        rate column 'complaints_pc'.

    Returns
    -------
    DataFrame
        A new frame: the rows of ``df`` (index untouched) followed by one filler
        row per missing combination. Filler rows set only 'LocationCT', the level
        column, the rate column (0.0) and 'ym_num'; every other column is NaN.
    """
    # Level = further thing to separate by, e.g. level of crime or type of complaint
    if crime_df:
        level_list = ['FELONY', 'MISDEMEANOR', 'VIOLATION']
        level_str = 'level'
        pc_str = 'crime_pc'
    else:
        level_list = ['Noise', 'For Hire Vehicle Complaint', 'Bike Rack Condition']
        level_str = 'complaint_type'
        pc_str = 'complaints_pc'

    # Every (tract, month, level) combination already present, for O(1) membership tests
    existing = set(zip(df['LocationCT'], df['ym_num'], df[level_str]))

    # Collect the absent combinations in tract -> month -> level order
    missing = []
    for tract in tract_list:
        for ym_num in ym_num_list:
            for level in level_list:
                if (tract, ym_num, level) not in existing:
                    missing.append((tract, level, 0.0, ym_num))

    # Nothing to add: still hand back a new object, never the caller's frame
    if not missing:
        return df.copy()

    new_df = pd.DataFrame(missing, columns=['LocationCT', level_str, pc_str, 'ym_num'])

    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
    return pd.concat([df, new_df])
    

In [26]:
nyc_311_1113_full = replace_missing(nyc_311_1113, tract_list_311, ym_num_list_311, crime_df=False)

In [27]:
# Use nyc_311_1113_full to find the trend line
# Create a dict where key = LocationCT, value = trend (from linear regression) for each complaint type.
# Create a function that when given a df & LocationCT, calculates the line of best fit and puts it in the dict
def calc_trend(df, LocationCT, trend_dict, complaint_type):
    """Fit ``complaints_pc ~ ym_num`` by OLS for one tract and complaint type and store the slope.

    Parameters
    ----------
    df : DataFrame with 'complaint_type', 'LocationCT', 'complaints_pc' and 'ym_num' columns.
    LocationCT : tract identifier to filter on; also used as the key written to ``trend_dict``.
    trend_dict : dict updated in place with ``{LocationCT: slope}``.
    complaint_type : value of the 'complaint_type' column to filter on.

    Returns nothing; the result is delivered through ``trend_dict``.

    NOTE: this notebook defines ``calc_trend`` again further down with different
    signatures, so the cells must be run in order for this version to be the one in effect.
    """
    # Filter based on census tract and complaint type
    subset = df[(df['complaint_type'] == complaint_type) & (df['LocationCT'] == LocationCT)]
    # Calc line of best fit
    y, X = dmatrices('complaints_pc ~ ym_num', data=subset, return_type='dataframe')
    res = sm.OLS(y, X).fit()
    # params is a label-indexed Series (intercept first, then ym_num). Integer keys in
    # plain [] are deprecated for label-indexed Series, so take the slope with .iloc
    trend_dict[LocationCT] = res.params.iloc[1]

In [28]:
# Calc trend for each type of complaint & location
def calc_trend_complaint(complaint_type, trend_dict):
    """Fill ``trend_dict`` with one trend slope per tract for ``complaint_type``.

    Iterates over the module-level ``tract_list_311`` and delegates each tract to
    ``calc_trend`` using the module-level ``nyc_311_1113_full`` frame.
    """
    for location in tract_list_311:
        calc_trend(nyc_311_1113_full, location, trend_dict, complaint_type)
        

In [29]:
noise_trend_dict = dict()
fhv_trend_dict = dict()
bike_trend_dict = dict()

calc_trend_complaint('Noise', noise_trend_dict)
calc_trend_complaint('For Hire Vehicle Complaint', fhv_trend_dict)
calc_trend_complaint('Bike Rack Condition', bike_trend_dict)


In [30]:
# Turn each {tract: slope} dict into a two-column frame (LocationCT, <trend column>) and merge
df1 = pd.DataFrame.from_dict(noise_trend_dict,
                             orient='index').reset_index().rename(columns={'index': 'LocationCT', 0:'noise_1113_trend'})
df2 = pd.DataFrame.from_dict(fhv_trend_dict,
                             orient='index').reset_index().rename(columns={'index': 'LocationCT', 0:'fhv_1113_trend'})
df3 = pd.DataFrame.from_dict(bike_trend_dict,
                             orient='index').reset_index().rename(columns={'index': 'LocationCT', 0:'bike_1113_trend'})

features_311 = df1.merge(df2.merge(df3, on='LocationCT'), on='LocationCT')

# Pickle features; the context manager guarantees the file is flushed and closed
with open(f'{data_dir}{sep}pickle{sep}features_311.pickle', 'wb') as pickle_file:
    pickle.dump(features_311, pickle_file)

#### Crime

In [4]:
# # Unpickle nyc crime
# nyc_crime = pickle.load(open(f'{data_dir}{sep}pickle{sep}nyc_crime_gdf.pickle', 'rb'))

In [5]:
# # Drop the rows that contain no values for pd_desc or class (or 4 that contain no value for 'completed')
# nyc_crime = nyc_crime.dropna()

In [6]:
# # Features
# # Collapse crime into year-month-census tracts-level 
# nyc_crime_count = nyc_crime.groupby(['year', 'month', 'LocationCT', 'level']).count()['cmplnt_num']\
#                     .reset_index().rename(columns={'cmplnt_num': 'count'})

In [7]:
# pickle.dump(nyc_crime_count, open(f'{data_dir}{sep}pickle{sep}nyc_crime_count.pickle', 'wb'))

In [31]:
# Load the pre-computed crime counts; the context manager closes the file handle
with open(f'{data_dir}{sep}pickle{sep}nyc_crime_count.pickle', 'rb') as pickle_file:
    nyc_crime_count = pickle.load(pickle_file)

In [32]:
# Merge in population for 2006-2010 & 10-14 from socio_df
pop = socio_merged[['LocationCT', 'total_pop_1014', 'total_pop_0610']]
nyc_crime_count = nyc_crime_count.merge(pop, on = 'LocationCT')


##### 2006-2010

In [33]:
# Crime counts for 2006-2010 (year is compared as a string), re-indexed from 0
nyc_crime_0610 = nyc_crime_count[
    nyc_crime_count['year'].isin(["2006", "2007", "2008", "2009", "2010"])
].reset_index(drop=True)

# Per-capita rate, using the 2006-2010 population as the denominator
nyc_crime_0610['crime_pc'] = nyc_crime_0610['count'] / nyc_crime_0610['total_pop_0610']

# Year-month label "YYYY-MM": the month string is left-padded with "0" to two characters
nyc_crime_0610['ym'] = nyc_crime_0610['year'].str.cat(
    nyc_crime_0610['month'].apply(lambda m: ("0" + m)[-2:]), sep='-')

# Number of months elapsed since Jan 2006 (so Jan 2006 -> 0)
nyc_crime_0610['ym_num'] = nyc_crime_0610['ym'].apply(
    lambda ym: (int(ym[0:4]) - 2006) * 12 + int(ym[-2:]) - 1)


In [34]:
# Create list of all ym_num and tracts
ym_num_list = [i for i in range(60)]
tract_list = nyc_crime_0610['LocationCT'].unique()

In [35]:
# Use the replace_missing function from the 311 section to add in 0s for the missing tract/month/level combinations
nyc_crime_0610_full = replace_missing(nyc_crime_0610, tract_list, ym_num_list)

In [36]:
# Collapse to get sum of the various crimes & then average the months/years
# crime_pc is selected BEFORE aggregating: summing the whole frame would also try to
# add up the text columns (year, month, level, ym), which recent pandas no longer skips
nyc_crime_0610_tot = nyc_crime_0610_full.groupby(['ym_num', 'LocationCT'])['crime_pc'].sum().reset_index()
nyc_crime_0610_avg = nyc_crime_0610_tot.groupby(['LocationCT']).mean().reset_index()

In [37]:
# Use nyc_crime_tot to find the trend line
# Create a dict where key = LocationCT, value = trend (from linear regression).
# Create a function that when given a df & LocationCT, calculates the line of best fit and puts it in the dict
def calc_trend(df, LocationCT, trend_dict):
    """Fit ``crime_pc ~ ym_num`` by OLS for one tract and store the slope.

    Parameters
    ----------
    df : DataFrame with 'LocationCT', 'crime_pc' and 'ym_num' columns.
    LocationCT : tract identifier to filter on; also used as the key written to ``trend_dict``.
    trend_dict : dict updated in place with ``{LocationCT: slope}``.

    Returns nothing; the result is delivered through ``trend_dict``.

    NOTE: this replaces the earlier four-argument ``calc_trend`` from the 311 section and is
    itself replaced by another definition further down, so the cells must be run in order.
    """
    # Filter based on census tract
    tract_rows = df[df['LocationCT'] == LocationCT]
    # Calc line of best fit
    y, X = dmatrices('crime_pc ~ ym_num', data=tract_rows, return_type='dataframe')
    res = sm.OLS(y, X).fit()
    # params is a label-indexed Series (intercept first, then ym_num). Integer keys in
    # plain [] are deprecated for label-indexed Series, so take the slope with .iloc
    trend_dict[LocationCT] = res.params.iloc[1]

In [38]:
crime_0610_trend_dict = dict()

# Now, run through all the census tracts to calc line of best fit
for tract in tract_list:
    calc_trend(nyc_crime_0610_tot, tract, crime_0610_trend_dict)

In [39]:
# Separate into felonies only from nyc_crime_0610_full.
nyc_fel_0610 = nyc_crime_0610_full[nyc_crime_0610_full['level'] == 'FELONY']
# Get average felony rate per tract. Only crime_pc is averaged: taking the mean of the
# whole frame would include the text columns, which recent pandas refuses to average.
# The result therefore has exactly two columns, LocationCT and crime_pc.
nyc_fel_0610_avg = nyc_fel_0610.groupby(['LocationCT'])['crime_pc'].mean().reset_index()

In [40]:
# Find the trend line using nyc_fel_0610
fel_0610_trend_dict = dict()
for tract in tract_list:
    calc_trend(nyc_fel_0610, tract, fel_0610_trend_dict)

##### 2011-2013

In [41]:
# Average crime pc 2011-2013
nyc_crime_1113 = nyc_crime_count[(nyc_crime_count['year'] == "2011") |
                                 (nyc_crime_count['year'] == "2012") | 
                                 (nyc_crime_count['year'] == "2013")].reset_index(drop=True)

nyc_crime_1113['crime_pc'] = nyc_crime_1113['count']/nyc_crime_1113['total_pop_1014']

# Create year-month var
nyc_crime_1113['ym'] = nyc_crime_1113.year.str.cat(nyc_crime_1113.month.apply(lambda x: ("0"+x)[-2:]),sep='-')

# Make year-month into numbers and let it equal number of months since Jan 2006 (starting at 60 for Jan 2011)
nyc_crime_1113['ym_num'] = nyc_crime_1113['ym'].apply(lambda x: (int(x[0:4])-2006)*12+int(x[-2:])-1)

# Create list of all ym_num and tracts
ym_num_1113_list = [i for i in range(60,96)]
tract_list = list(nyc_crime_1113['LocationCT'].unique())

In [42]:
# Replace missing using above function
nyc_crime_1113_full = replace_missing(nyc_crime_1113, tract_list, ym_num_1113_list)

# Collapse to get sum of the various crimes & then average the months/years
# crime_pc is selected BEFORE aggregating: summing the whole frame would also try to
# add up the text columns (year, month, level, ym), which recent pandas no longer skips
nyc_crime_1113_tot = nyc_crime_1113_full.groupby(['ym_num', 'LocationCT'])['crime_pc'].sum().reset_index()
nyc_crime_1113_avg = nyc_crime_1113_tot.groupby(['LocationCT']).mean().reset_index()[['LocationCT', 'crime_pc']]

# Find trend lines for crimes in 2011-2013
crime_1113_trend_dict = dict()

# Now, run through all the census tracts to calc line of best fit
for tract in tract_list:
    calc_trend(nyc_crime_1113_tot, tract, crime_1113_trend_dict)

In [43]:
# Separate into felonies only from nyc_crime_1113_full.
nyc_fel_1113 = nyc_crime_1113_full[nyc_crime_1113_full['level'] == 'FELONY']
# Get average felony rate per tract. Only crime_pc is averaged: taking the mean of the
# whole frame would include the text columns, which recent pandas refuses to average.
# The result therefore has exactly two columns, LocationCT and crime_pc.
nyc_fel_1113_avg = nyc_fel_1113.groupby(['LocationCT'])['crime_pc'].mean().reset_index()

# Find the trend line using nyc_fel_1113
fel_1113_trend_dict = dict()
for tract in tract_list:
    calc_trend(nyc_fel_1113, tract, fel_1113_trend_dict)

In [44]:
# Build one frame holding LocationCT plus the four crime/felony trend slopes.
# Each {tract: slope} dict becomes a two-column frame: the dict keys form the index, which is
# named LocationCT and moved into a column; the single value column gets the trend name.
df1 = (pd.DataFrame.from_dict(crime_0610_trend_dict, orient='index')
       .rename(columns={0: 'crime_0610_trend'})
       .rename_axis('LocationCT')
       .reset_index())

df2 = (pd.DataFrame.from_dict(crime_1113_trend_dict, orient='index')
       .rename(columns={0: 'crime_1113_trend'})
       .rename_axis('LocationCT')
       .reset_index())

df3 = (pd.DataFrame.from_dict(fel_0610_trend_dict, orient='index')
       .rename(columns={0: 'fel_0610_trend'})
       .rename_axis('LocationCT')
       .reset_index())

df4 = (pd.DataFrame.from_dict(fel_1113_trend_dict, orient='index')
       .rename(columns={0: 'fel_1113_trend'})
       .rename_axis('LocationCT')
       .reset_index())

# Join on the tract id, left to right; every frame has one row per tract, so this gives the
# same rows, row order and column order as nesting the merges
trend_df = (df1.merge(df2, on='LocationCT')
               .merge(df3, on='LocationCT')
               .merge(df4, on='LocationCT'))



In [45]:
# Fix dataframes for the averages
nyc_crime_0610_avg = nyc_crime_0610_avg[['LocationCT', 'crime_pc']].rename(columns={'crime_pc': 'crime_pc_0610'})
nyc_crime_1113_avg = nyc_crime_1113_avg[['LocationCT', 'crime_pc']].rename(columns={'crime_pc': 'crime_pc_1113'})

nyc_fel_0610_avg = nyc_fel_0610_avg[['LocationCT', 'crime_pc']].rename(columns={'crime_pc': 'fel_pc_0610'})
nyc_fel_1113_avg = nyc_fel_1113_avg[['LocationCT', 'crime_pc']].rename(columns={'crime_pc': 'fel_pc_1113'})

In [46]:
# Now, merge the trend slopes with the various per-capita averages on the tract id
crime_features = trend_df.merge(
    nyc_crime_0610_avg.merge(
        nyc_crime_1113_avg.merge(
            nyc_fel_0610_avg.merge(nyc_fel_1113_avg, on='LocationCT'),
            on='LocationCT'),
        on='LocationCT'),
    on='LocationCT')

# Pickle crime features; the context manager guarantees the file is flushed and closed
with open(f'{data_dir}{sep}pickle{sep}crime_features.pickle', 'wb') as pickle_file:
    pickle.dump(crime_features, pickle_file)

#### HUD Features

In [47]:
# Bring in HUD dataset; the context manager closes the file handle after loading
with open(f"{data_dir}{sep}pickle{sep}nyc_hud.pickle", "rb") as pickle_file:
    ny_hud = pickle.load(pickle_file)

In [48]:
# The county code is characters 2-4 of the fips code; keep only the NYC counties
# (061, 005, 081, 085, 047)
ny_hud['county_code'] = ny_hud['fips_code'].apply(lambda fips: fips[2:5])
nyc_hud = ny_hud[ny_hud['county_code'].isin(['061', '005', '081', '085', '047'])].copy()

In [49]:
# Total # of commercial addresses, total # of residential addresses, total # of short-term (ST)
# vacant addresses (res and comm), and total # of long-term (LT) vacant res/comm addresses
nyc_hud['total_res'] = nyc_hud['AMS_RES']
nyc_hud['total_comm'] = nyc_hud['AMS_BUS']
nyc_hud['total_vacant_res'] = nyc_hud['RES_VAC']
nyc_hud['total_vacant_comm'] = nyc_hud['BUS_VAC']
nyc_hud['avg_vac_res'] = nyc_hud['AVG_VAC_R']
nyc_hud['avg_vac_comm'] = nyc_hud['AVG_VAC_B']

nyc_hud['total_vacant_ST'] = nyc_hud['VAC_3_RES'] + nyc_hud['VAC_3_6_R'] + nyc_hud['VAC_3_BUS'] + nyc_hud['VAC_3_6_B']
nyc_hud['total_res_vacant_ST'] = nyc_hud['VAC_3_RES'] + nyc_hud['VAC_3_6_R']
nyc_hud['total_comm_vacant_ST'] =nyc_hud['VAC_3_BUS'] + nyc_hud['VAC_3_6_B']

nyc_hud['total_res_vacant_LT']  = nyc_hud['VAC_6_12R'] +  nyc_hud['VAC_12_24R'] + nyc_hud['VAC_24_36R'] + nyc_hud['VAC_36_RES']
nyc_hud['total_comm_vacant_LT']= nyc_hud['VAC_6_12B'] +  nyc_hud['VAC_12_24B'] + nyc_hud['VAC_24_36B'] + nyc_hud['VAC_36_BUS']
nyc_hud['total_vacant_LT'] = nyc_hud['total_res_vacant_LT'] + nyc_hud['total_comm_vacant_LT']

nyc_hud_full = nyc_hud[['fips_code', 'year', 'month','total_res', 'total_comm', 'total_vacant_res', 'total_vacant_comm',
                       'avg_vac_res', 'avg_vac_comm', 'total_vacant_ST', 'total_res_vacant_ST',
                       'total_comm_vacant_ST', 'total_res_vacant_LT', 'total_comm_vacant_LT', 'total_vacant_LT']].copy()

In [50]:
# Gen ym variables
# Create year-month var
nyc_hud_full['ym'] = nyc_hud_full.year.str.cat(nyc_hud_full.month.apply(lambda x: ("0"+x)[-2:]),sep='-')

# Make year-month into numbers and let it equal number of months since Jan 2006 (starting at 77 - 2012-06)
nyc_hud_full['ym_num'] = nyc_hud_full['ym'].apply(lambda x: (int(x[0:4])-2006)*12+int(x[-2:])-1)

In [51]:
nyc_hud_tract_list = nyc_hud_full['fips_code'].unique()

In [52]:
# Use nyc_hud_full to find the trend line
# Create a dict where key = LocationCT, value = trend (from linear regression).
# Create a function that when given a df & LocationCT, calculates the line of best fit and puts it in the dict
def calc_trend(df, fips_code, trend_dict, var_str):
    """Fit ``<var_str> ~ ym_num`` by OLS for one tract and store the slope.

    Parameters
    ----------
    df : DataFrame with 'fips_code', 'ym_num' and a column named ``var_str``.
    fips_code : tract identifier to filter on.
    trend_dict : dict of dicts updated in place; ``trend_dict[var_str]`` must already
        exist, and receives ``{fips_code: slope}``.
    var_str : name of the column to regress on ``ym_num``.

    Returns nothing; the result is delivered through ``trend_dict``.

    NOTE: this replaces the earlier ``calc_trend`` definitions in this notebook, which
    take different arguments, so the cells must be run in order.
    """
    # Filter based on census tract
    tract_rows = df[df['fips_code'] == fips_code]
    # Calc line of best fit
    y, X = dmatrices(f'{var_str} ~ ym_num', data=tract_rows, return_type='dataframe')
    res = sm.OLS(y, X).fit()
    # params is a label-indexed Series (intercept first, then ym_num). Integer keys in
    # plain [] are deprecated for label-indexed Series, so take the slope with .iloc
    trend_dict[var_str][fips_code] = res.params.iloc[1]

In [53]:
# Find trend lines for each HUD address/vacancy variable, per census tract
nyc_hud_trend_dict = dict()
var_str_list = ['total_comm', 'total_res', 'total_res_vacant_ST', 'total_comm_vacant_ST', 'total_res_vacant_LT', 'total_comm_vacant_LT']

# Now, run through all the census tracts to calc line of best fit
for var_str in var_str_list:
    nyc_hud_trend_dict[var_str] = dict()
    for tract in nyc_hud_tract_list:
        calc_trend(nyc_hud_full, tract, nyc_hud_trend_dict, var_str)

In [54]:
df1 = pd.DataFrame.from_dict(nyc_hud_trend_dict['total_comm'], 
                             orient='index').reset_index().rename(columns={'index': 'LocationCT', 0:'total_comm_trend'})

df2 = pd.DataFrame.from_dict(nyc_hud_trend_dict['total_res'], 
                             orient='index').reset_index().rename(columns={'index': 'LocationCT', 0:'total_res_trend'})

df3 = pd.DataFrame.from_dict(nyc_hud_trend_dict['total_res_vacant_ST'], 
                             orient='index').reset_index().rename(columns={'index': 'LocationCT', 0:'total_res_vacant_ST_trend'})

df4 = pd.DataFrame.from_dict(nyc_hud_trend_dict['total_comm_vacant_ST'], 
                             orient='index').reset_index().rename(columns={'index': 'LocationCT', 0:'total_comm_vacant_ST_trend'})

df5 = pd.DataFrame.from_dict(nyc_hud_trend_dict['total_res_vacant_LT'], 
                             orient='index').reset_index().rename(columns={'index': 'LocationCT', 0:'total_res_vacant_LT_trend'})

df6 = pd.DataFrame.from_dict(nyc_hud_trend_dict['total_comm_vacant_LT'], 
                             orient='index').reset_index().rename(columns={'index': 'LocationCT', 0:'total_comm_vacant_LT_trend'})



In [55]:
# Join the six HUD trend frames on the tract id
hud_features = df1.merge(
    df2.merge(
        df3.merge(
            df4.merge(df5.merge(df6, on='LocationCT'), on='LocationCT'),
            on='LocationCT'),
        on='LocationCT'),
    on='LocationCT')

# Pickle features; the context manager guarantees the file is flushed and closed
with open(f'{data_dir}{sep}pickle{sep}hud_features.pickle', 'wb') as pickle_file:
    pickle.dump(hud_features, pickle_file)

##### Merge all features together

In [16]:
# Unpickle relevant dataframes; each context manager closes its file handle after loading
with open(f'{data_dir}{sep}pickle{sep}socio_merged.pickle', 'rb') as pickle_file:
    socio_merged = pickle.load(pickle_file)
with open(f'{data_dir}{sep}pickle{sep}hud_features.pickle', 'rb') as pickle_file:
    hud_features = pickle.load(pickle_file)
with open(f'{data_dir}{sep}pickle{sep}crime_features.pickle', 'rb') as pickle_file:
    crime_features = pickle.load(pickle_file)
with open(f'{data_dir}{sep}pickle{sep}features_311.pickle', 'rb') as pickle_file:
    features_311 = pickle.load(pickle_file)

In [17]:
socio_merged.columns

Index(['GISJOIN', 'STATE', 'COUNTY', '%_black_0610', '%_white_0610',
       '25plus_pop_0610', '%_bachelors_0610', '%_nonwhite_0610',
       '%_hhrent_0610', '%_li_0610', 'med_hh_income_0610', 'med_rent_0610',
       'med_home_value_0610', '25plus_pop_1418', '%_bachelors_1418',
       'med_hh_income_1418', 'med_rent_1418', 'med_home_value_1418',
       'change_rent', 'change_home_value', 'eligible_gentrify', 'hot_market',
       'change_college', 'change_hh_inc', 'gentrification', 'fips_code',
       'LocationCT', 'total_pop_1014', '%_unemployed_0610', 'total_pop_0610'],
      dtype='object')

In [18]:
non_twitter_features = socio_merged.merge(hud_features.merge(crime_features.merge(features_311, on='LocationCT'), on = 'LocationCT'), on='LocationCT')

In [19]:
# Save the combined feature table; the context manager guarantees the file is flushed and closed
with open(f'{data_dir}{sep}pickle{sep}non_twitter_features.pickle', 'wb') as pickle_file:
    pickle.dump(non_twitter_features, pickle_file)