In [1]:
import pandas as pd
import numpy as np

import Clean_data as cld
import Transform_Data as tfd
import Visualize_Data as vld

In [2]:
# Load the lookup table that the rest of this notebook filters and scores.
df = pd.read_csv('Final_Data/Full_City_Lookup.csv')

In [3]:
# List every available column name in the loaded table.
df.columns

Index(['StateAbbreviation', 'State', 'Region', 'State_Rental_Tier',
       'Mean_Rental_Value', 'debt', 'rent_mean', 'rent_median', 'family_mean',
       'family_median', 'hc_mortgage_mean', 'hc_mortgage_median', 'hc_mean',
       'hc_median', 'hs_degree', 'hs_degree_male', 'hs_degree_female',
       'male_age_mean', 'male_age_median', 'female_age_mean',
       'female_age_median', 'hi_mean', 'hi_median', 'pct_own', 'married',
       'married_snp', 'separated', 'divorced', 'rent_gt_10', 'rent_gt_15',
       'rent_gt_20', 'rent_gt_25', 'rent_gt_30', 'rent_gt_35', 'rent_gt_40',
       'rent_gt_50', 'home_equity', 'male_percent', 'female_percent',
       'rent_mean_to_median', 'family_mean_to_median',
       'hc_mortgage_mean_to_median', 'hc_mean_to_median',
       'Female_Degree_Holders', 'Male_Degree_Holders', 'Gender_Degree_Ratio',
       '25 and older', '25 to 39', '40 to 64', '65 and older', 'young_workers',
       'old_workers', 'Top_10_Mean_Pop', 'Bachelor's Degree Holders',
      

In [4]:
# Preview the three rows with the smallest State_Rental_Tier values
# (sort_values is ascending by default).
df.sort_values(by='State_Rental_Tier').head(3)

Unnamed: 0,StateAbbreviation,State,Region,State_Rental_Tier,Mean_Rental_Value,debt,rent_mean,rent_median,family_mean,family_median,...,Education,"Arts, Humanities and Others",sci_eng_pct,sci_eng_rel_pct,business_pct,education_pct,art_hum_ot_pct,Days,Price,house_market_score
37,AR,Arkansas,Southern,0.0,818.555556,0.548378,713.879645,676.640449,63939.53636,53052.25,...,172512.0,191170.0,0.292412,0.114655,0.210406,0.181451,0.201076,91.0,154700.0,1700.0
46,WY,Wyoming,Western,1.0,828.5,0.573169,874.457417,819.527778,81501.322932,72506.930556,...,45100.0,41906.0,0.365816,0.104291,0.146799,0.198579,0.184515,97.0,239400.0,2468.041237
47,WV,West Virginia,Southern,2.0,845.090909,0.477291,674.666645,641.875,65108.786482,54499.412879,...,111800.0,101842.0,0.293316,0.131284,0.179336,0.207263,0.188802,113.0,145300.0,1285.840708


In [5]:
# Shift every rental tier up by one.
# NOTE: this cell is not idempotent; re-running it increments the tiers again.
df['State_Rental_Tier'] = df['State_Rental_Tier'].add(1)

In [6]:
# Show the first three rows to check the shifted tier values.
df.head(3)

Unnamed: 0,StateAbbreviation,State,Region,State_Rental_Tier,Mean_Rental_Value,debt,rent_mean,rent_median,family_mean,family_median,...,Education,"Arts, Humanities and Others",sci_eng_pct,sci_eng_rel_pct,business_pct,education_pct,art_hum_ot_pct,Days,Price,house_market_score
0,NY,New York,Northeastern,51.0,4105.638298,0.625454,1266.696603,1223.602505,88530.77709,78968.387474,...,1091084.0,2774078.0,0.346303,0.092848,0.186769,0.105598,0.268482,140.0,308800.0,2205.714286
1,CA,California,California,49.0,2436.20412,0.720963,1472.168818,1421.218025,88315.668424,78506.768148,...,1160732.0,4832134.0,0.416775,0.082876,0.182542,0.061555,0.256252,79.0,518700.0,6565.822785
2,TX,Texas,Southern,33.0,1400.476648,0.553786,990.034921,944.24728,76901.574345,66839.803002,...,1373690.0,2323976.0,0.352218,0.09563,0.232093,0.118903,0.201157,92.0,237200.0,2578.26087


Deciding on values to Keep

In [7]:
# Keep only the identifier columns plus the metrics that feed the final score.
df = df.loc[:, [
    'StateAbbreviation',
    'State',
    'Region',
    'debt',
    'Mean_Rental_Value',
    'hs_degree',
    'rent_gt_10',
    'rent_gt_50',
    'Top_10_Mean_Pop',
    "Bachelor's Degree Holders",
    'hi_mean',
    'Science and Engineering',
    'Business',
]]

In [8]:
# Discard every row that has a missing value in any of the kept columns.
df = df.dropna(axis=0, how='any')

I'm now going to create a final score based on multiplying and dividing values based on their benefit or detriment to their citizens. I'll then find a logarithmically appropriate final value to use for analysis.

I settled on:
* debt: a % value for the populace of each state
* Mean_Rental_Value: the cost of rent, calculated from a different dataset
* hs_degree: the % of the populace who at least graduated hs
* rent_gt_10: % of populace with rent greater than 10% of their income
* rent_gt_50: % of populace with rent greater than 50% of their income
* Top_10_Mean_Pop: the mean population of the 10 largest cities in each state
* Bachelor's Degree Holders: the number of bachelor's degrees
* hi_mean: household income

With these, I am aiming for the ideal of: low rent, high population (opportunities), low debt, and a good high school graduation rate (a sign the state cares about basic education for its populace).

In [9]:
def min_max_scaling(column):
    """Linearly rescale ``column`` so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    column : pandas.Series (or any object exposing ``min``/``max`` and
        element-wise arithmetic)
        The values to rescale.

    Returns
    -------
    Same kind of object as ``column``, holding ``(x - min) / (max - min)``.
    If every value is identical the range is zero; in that case zeros are
    returned instead of the NaNs that a 0/0 division would produce.
    """
    lowest = column.min()
    span = column.max() - lowest
    # Guard only the scalar case; a per-column span (DataFrame input) keeps
    # the plain element-wise division below.
    if np.ndim(span) == 0 and span == 0:
        # Multiplying by 0.0 yields float zeros while leaving NaNs as NaN.
        return (column - lowest) * 0.0
    return (column - lowest) / span

In [10]:
# Each entry: (new score column, source column, tie-breaking method, ascending?).
# "min_*" scores rank in descending order, so the smallest raw value receives
# the largest rank; "max_*" scores rank in ascending order, so the largest raw
# value receives the largest rank.
rank_specs = [
    ('min_debt_rank', 'debt', 'min', False),
    ('min_rent_rank', 'Mean_Rental_Value', 'min', False),
    ('max_hsdeg_rank', 'hs_degree', 'max', True),
    ('min_rentgt10_rank', 'rent_gt_10', 'min', False),
    ('min_rentgt50_rank', 'rent_gt_50', 'min', False),
    ('max_opportunity_rank', 'Top_10_Mean_Pop', 'max', True),
    ('max_bachelors_rank', "Bachelor's Degree Holders", 'max', True),
    ('max_house_income_rank', 'hi_mean', 'max', True),
    ('max_sci_eng_degs_rank', 'Science and Engineering', 'max', True),
    ('max_buss_degs_rank', 'Business', 'max', True),
]

for score_name, source_name, tie_method, is_ascending in rank_specs:
    ranked = df[source_name].rank(method=tie_method, ascending=is_ascending)
    # Take the natural log of the rank; rank 1 maps to log(1) == 0.
    df[score_name] = np.log(ranked)


In [11]:
# Score columns produced by the ranking step.
cols = [
    'min_rent_rank',
    'min_debt_rank',
    'max_hsdeg_rank',
    'min_rentgt10_rank',
    'min_rentgt50_rank',
    'max_opportunity_rank',
    'max_bachelors_rank',
    'max_house_income_rank',
    'max_sci_eng_degs_rank',
    'max_buss_degs_rank',
]
# Rescale each score column independently (apply works column by column).
df[cols] = df[cols].apply(min_max_scaling)

This step ensures that the values are normalized against each other, so that no individual item is given more weight than the others.

In [12]:
# Total score per row: the sum of the normalised score columns.
# Reuses `cols` (defined in the normalisation cell) so the set of columns that
# is summed can never drift from the set that was rescaled.
df['tally'] = df[cols].sum(axis=1)

In [13]:
# Persist the full scored table; the DataFrame index is written as the first,
# unnamed CSV column because index=False is not passed.
df.to_csv('Final_Data/Full_Weighted_States.csv')

In [14]:
# Compact lookup: identifier columns plus the score, best score first,
# with a fresh 0..n-1 index.
tally_lookup = (
    df.loc[:, ['StateAbbreviation', 'State', 'Region', 'tally']]
    .sort_values(by='tally', ascending=False)
    .reset_index(drop=True)
)

In [15]:
# Display the ranked lookup table.
tally_lookup

Unnamed: 0,StateAbbreviation,State,Region,tally
0,PA,Pennsylvania,Northeastern,8.969821
1,OH,Ohio,Midwestern,8.749037
2,WI,Wisconsin,Midwestern,8.644318
3,IL,Illinois,Midwestern,8.62631
4,MO,Missouri,Midwestern,8.595172
5,MN,Minnesota,Midwestern,8.57914
6,KS,Kansas,Midwestern,8.513933
7,IA,Iowa,Midwestern,8.481531
8,TX,Texas,Southern,8.379764
9,NC,North Carolina,Southern,8.347259


In [16]:
# Rescale the summed score with min_max_scaling so it is easier to compare,
# then save the lookup table.
tally_lookup = tally_lookup.assign(tally=min_max_scaling(tally_lookup['tally']))
tally_lookup.to_csv('Final_Data/Weighted_States_Lookup.csv')

In [17]:
# Average score per region, best region first.
# `tally` is selected explicitly: tally_lookup also holds string columns
# (StateAbbreviation, State), and on pandas >= 2.0 a bare groupby(...).mean()
# raises TypeError for them instead of silently dropping them.
regional_tally = (
    tally_lookup
    .groupby('Region')[['tally']]
    .mean()
    .sort_values(by='tally', ascending=False)
)

In [18]:
# Display the per-region mean scores.
regional_tally

Unnamed: 0_level_0,tally
Region,Unnamed: 1_level_1
Midwestern,0.803978
Southern,0.621801
Western,0.588629
Northeastern,0.557963
California,0.267738
Special,0.246143


It seems to me like access to opportunities is overshadowed by the lower debt/rent.

DC (Special) and California take the last two spots, which also corresponds with their poor cost-of-living (COL) balance.

In [19]:

# Persist the per-region scores; the Region index is written as the first column.
regional_tally.to_csv('Final_Data/Weighted_Region_Lookup.csv')