In [1]:
import pandas as pd

import Clean_data as cld
import Transform_Data as tfd
import Visualize_Data as vld

In [2]:
# Load the source frame through the project's Clean_data helper.
# NOTE(review): the helper name mentions 'zhvi' while the variable says 'rent' --
# confirm which series this frame actually holds.
merged_rent_df =  cld.get_zhvi_array()

In [3]:
# List the distinct values of the 'State' column to check regional coverage.
print(merged_rent_df['State'].unique())

['NY' 'CA' 'TX' 'IL' 'PA' 'AZ' 'NV' 'FL' 'IN' 'NC' 'OH' 'KY' 'CO' 'WA'
 'TN' 'MA' 'DC' 'OR' 'MD' 'WI' 'NE' 'NM' 'OK' 'GA' 'MO' 'VA' 'MN' 'HI'
 'LA' 'KS' 'SC' 'AK' 'NJ' 'ID' 'IA' 'MI' 'UT' 'AR' 'RI' 'SD' 'MS' 'CT'
 'DE' 'AL' 'WY' 'MT' 'ND' 'WV' 'ME' 'NH' 'VT']


Confirming that all 50 states + DC appear in my dataset

In [4]:
# Build the grouped summary table from the merged frame.
state_year_grouped_df = tfd.group_house_dfs(merged_rent_df) 
# Author's note on the helper: 'value' is aggregated per ['State', 'Year', 'size']
# into 'count', 'mean', 'median', 'std', 'min' and 'max'.

I discovered that over half my values were NA, so I've taken the extra step of cleaning out all NA values

In [5]:
# Show (rows, columns) of the grouped summary table.
state_year_grouped_df.shape

(816, 6)

I have two groups of data to focus on:
10,167 rows of ungrouped data, pertinent to individual locations.
816 rows of grouped data (matching the shape shown above), reshaped with indexes for state, year and size.

I have opted to leave the 'all size' data out, as the granularity of room count adds a measure of cohesion to my findings in regards to cost and location.

# High End Cost of Living

Finding our most expensive locations (each state's count per statistic ranges from 0 to a maximum of years × bedrooms = 12):

In [6]:
# One helper call per summary statistic, unpacked in the same order as the
# statistic names below. The literal 50 is passed straight through to the helper
# (presumably the number of top rows considered -- confirm in Transform_Data).
(
    largest_min,
    largest_max,
    largest_mean,
    largest_med,
    largest_std,
) = (
    tfd.return_nlargest_state_count(state_year_grouped_df, 50, stat)
    for stat in ('min', 'max', 'mean', 'median', 'std')
)

In [7]:
# Collect the five per-statistic results so they can be merged in one call.
# NOTE(review): the order here is min, max, mean, std, median, whereas
# count_cheapest_dfs uses min, max, mean, median, std -- confirm order is irrelevant.
count_expensive_dfs = [largest_min,largest_max,largest_mean,largest_std, largest_med]

In [8]:
# Combine the five per-statistic results into a single frame via the project helper.
e_merged_count_df = tfd.return_merged_count(count_expensive_dfs)


I then merge those 5 columns in order to get a score of 0-60. In doing so I also discovered that only 19 states made the top cut in any way. Because of this I will be performing an entirely separate low-end exploration later.

In [9]:
# Add the counts across columns for each row, then keep at most the 25 largest totals.
e_largest = e_merged_count_df.sum(axis=1).nlargest(25)

In [10]:
# Move the index into a regular column.
# NOTE: this rebinds e_largest to a transform of itself, so re-running only this
# cell is not idempotent (a second reset_index adds another index column).
e_largest = e_largest.reset_index()

In [11]:
# Preview the first ten rows of the high-end totals.
e_largest.head(10)

Unnamed: 0,State,0
0,HI,56.0
1,CA,33.0
2,FL,26.0
3,WA,15.0
4,MA,13.0
5,RI,13.0
6,CO,10.0
7,NJ,9.0
8,NY,8.0
9,NV,6.0


In [12]:
# Number of rows that survived the nlargest(25) cut; fewer than 25 means fewer rows existed.
print(len(e_largest))

19


Hawaii is by far the most expensive place to live. California and Florida are a big drop for 2nd and 3rd place.

Because of how difficult it seems to be to even make the list, I'll be using the top 5 results.

In [13]:
# Ask the project helper for the high-end subset of the merged frame.
# (Which states are selected is decided inside Transform_Data, not in this cell.)
high_end_state_df = tfd.return_high_end(merged_rent_df)

In [14]:

# Persist the high-end subset; the path is relative to the notebook's working directory.
high_end_state_df.to_csv('Datasets/House Price/High_Tier_RENT.csv')

# Lowest Cost of Living

In [15]:
# One helper call per summary statistic, unpacked in the same order as the
# statistic names below. The literal 50 is passed straight through to the helper
# (presumably the number of bottom rows considered -- confirm in Transform_Data).
(
    smallest_min,
    smallest_max,
    smallest_mean,
    smallest_med,
    smallest_std,
) = (
    tfd.return_nsmallest_state_count(state_year_grouped_df, 50, stat)
    for stat in ('min', 'max', 'mean', 'median', 'std')
)

In [16]:
# Collect the five per-statistic results so they can be merged in one call.
count_cheapest_dfs = [smallest_min,smallest_max,smallest_mean,smallest_med, smallest_std]

In [17]:
# Combine the five per-statistic results into a single frame via the project helper.
c_merged_count_df = tfd.return_merged_count(count_cheapest_dfs)


In [18]:
# Add the counts across columns for each row, then keep at most the 25 largest totals.
# "largest" refers to the count total, computed here from the nsmallest-based results.
c_largest = c_merged_count_df.sum(axis=1).nlargest(25)

In [19]:
# Move the index into a regular column.
# NOTE: this rebinds c_largest to a transform of itself, so re-running only this
# cell is not idempotent (a second reset_index adds another index column).
c_largest = c_largest.reset_index()

In [20]:
# Preview the first ten rows of the low-end totals.
c_largest.head(10)

Unnamed: 0,State,0
0,OK,39.0
1,WV,29.0
2,MS,27.0
3,AR,25.0
4,KY,23.0
5,IN,10.0
6,KS,10.0
7,IA,8.0
8,LA,8.0
9,ND,8.0


In [21]:
# Row count of the merged frame before the nlargest(25) cut.
# NOTE(review): the matching high-end cell prints len(e_largest) instead -- confirm
# which of the two is the intended measure.
print(len(c_merged_count_df))

27


Much like before, I've gotten the count of occurrences by state, this time with the smallest values found. The spread is more even, with 27 of the possible 51 regions appearing. I will likewise choose the top 5 values from this spread.

In [22]:
# Ask the project helper for the low-end subset of the merged frame.
# NOTE(review): the result is stored under the name high_end_state_df, overwriting
# the earlier high-end frame; from here on that name holds the return_cheap_end
# output. Consider renaming it together with the cells that read it.
high_end_state_df = tfd.return_cheap_end(merged_rent_df)

In [23]:

# Persist the low-end subset. Despite its name, high_end_state_df holds the
# return_cheap_end result at this point in the notebook.
high_end_state_df.to_csv('Datasets/House Price/Low_Tier_RENT.csv')

# Mid-Tier Cost of Living

In [24]:
# Inner-join the low-end and high-end totals on their shared 'State' column,
# keeping only states that appear in both tables.
merge = c_largest.merge(
    e_largest,
    how='inner',
    on='State',
)

In [25]:
# Give the joined columns readable names: the left (c_largest) total becomes
# 'Cheapest' and the right (e_largest) total becomes 'Highest'.
merge.columns = ['State', 'Cheapest', 'Highest']

In [26]:
# Display the states present in both the low-end and high-end tables.
merge

Unnamed: 0,State,Cheapest,Highest
0,MS,27.0,4.0
1,GA,7.0,1.0
2,WY,4.0,6.0
3,AK,3.0,3.0


I've got 4 states that fall into both tables; I'll treat these as 'mid-tier' states.

In [27]:
# Ask the project helper for the mid-tier subset of the merged frame.
# NOTE(review): the helper receives only merged_rent_df, not the joined table built
# in this section -- confirm how Transform_Data decides which states are mid-tier.
mid_tier_state_df = tfd.return_mid_tier(merged_rent_df)

In [28]:

# Persist the mid-tier subset. This must write mid_tier_state_df: the name
# high_end_state_df holds the return_cheap_end result by this point, so writing it
# here would store low-tier data under the mid-tier filename.
mid_tier_state_df.to_csv('Datasets/House Price/Mid_Tier_RENT.csv')