In [1]:
import pandas as pd

import Clean_data as cld
import Transform_Data as tfd
import Visualize_Data as vld
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Datasets

# Zillow Rent

z_rent = pd.read_csv('Datasets/House Price/State_MedianRentalPrice_1Bedroom.csv')
z_rent.head(1)

This is the 1 bedroom rental price data from the 'Zillow House Price Data' Dataset,

RegionName is State (This is different from the other provided datasets in the same public kaggle repository where regionName was more closely aligned with a city or group of areas in a city).

time-series data is pivoted and grouped by year-month

the first columns not Missing data begin in February of 2017, and continues through December of 2019 this does impact my initially planned data-range. However I still hope to use these two snapshots to establish State-level growth.

#### Cleaned

In [2]:
z_rent = cld.get_rent_df()
z_rent.head(5)

Unnamed: 0,State,size,Year,value
0,California,1br,2017,1900.0
1,Texas,1br,2017,1045.0
2,New York,1br,2017,2150.0
3,Florida,1br,2017,1335.0
4,Illinois,1br,2017,1390.0


#### Zscores
The lower the z-scores, the less variation from the mean my sample (created in vld.run_zscore) has.

Anything under .1 has low statistical variability/uncertainty, and therefore high confidence.

In [3]:
vld.run_zscore(z_rent, ['value'])

full length  398
sample size  26
df Z-Scores
value   -0.030443
dtype: float64


#### Graphs

In [4]:
z_rent_17 = z_rent[z_rent['Year']== '2017']
fig = px.density_heatmap(z_rent_17, x = 'State', y='size',
            title = 'Rental Costs by State and Size(2017)',
            z='value'
            )
fig['layout'].update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'coloraxis': 'coloraxis',
              'histfunc': 'sum',
              'hoverte…

In [5]:
z_rent_20 = z_rent[z_rent['Year']== '2020']
fig = px.density_heatmap(z_rent_20, x = 'State', y='size',
            title = 'Rental Costs by State and Size(2020)',
            z='value'
            )
fig['layout'].update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'coloraxis': 'coloraxis',
              'histfunc': 'sum',
              'hoverte…

As a whole, rent has become more expensive (lighter colored blocks).

# Zillow House Sales

In [6]:
z_house = pd.read_csv('Datasets/House Price/State_Zhvi_1Bedroom.csv')

z_house.head(1)

Unnamed: 0.1,Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,1996-01-31,1996-02-29,1996-03-31,1996-04-30,...,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31
0,0,9,0,California,State,CA,104045.0,103897.0,103745.0,103456.0,...,399503.0,399949.0,400604.0,401758.0,402858.0,404068.0,406004.0,408468.0,411313.0,414067.0


This 1 bedroom house sales data, also from the 'Zillow House Price Data' Dataset, is more complete, with the last missing column being in 2005.

Because of my rental snapshot's limitations I will not be using data prior to 2017, for the sake of impartial analytics.

#### Cleaned

In [7]:
 
z_house = cld.get_zhvi_array()
z_house.head(5)

Unnamed: 0,State,size,Year,value
0,California,1br,2017,347962.0
1,Texas,1br,2017,96795.0
2,New York,1br,2017,329115.0
3,Florida,1br,2017,117818.0
4,Illinois,1br,2017,130350.0


#### Zscores

In [8]:
vld.run_zscore(z_house, ['value'])

full length  408
sample size  27
df Z-Scores
value    0.185467
dtype: float64


#### Graphs

In [9]:
z_zhvi_17 = z_house[z_house['Year']== '2017']
fig = px.density_heatmap(z_zhvi_17, x = 'State', y='size',
            title = 'Sales Costs by State and Size(2017)',
            z='value'
            )
fig['layout'].update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'coloraxis': 'coloraxis',
              'histfunc': 'sum',
              'hoverte…

In [10]:
z_zhvi_20 = z_house[z_house['Year']== '2020']
fig = px.density_heatmap(z_zhvi_20, x = 'State', y='size',
            title = 'Sales Costs by State and Size(2020)',
            z='value'
            )
fig['layout'].update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'coloraxis': 'coloraxis',
              'histfunc': 'sum',
              'hoverte…

Likewise, sales prices seem to have increased. though the market for house sales is much more uneven, many more dark (cheap) spots than the rental graphs.

#### Further Exploration

In [11]:

housing_merged_loc = tfd.consolidate_sale_rent(z_rent, z_house)

Because Rent and House sales are so closely related, I'm consolidating them

#### Rental and Sales prices, by year and size, over time

In [12]:
merged_rent_sale = tfd.get_rent_sale_growth()

merged_rent_sale = merged_rent_sale.set_index('state')
merged_rent = merged_rent_sale.loc[:, merged_rent_sale.columns.str.contains('rent')]
merged_sale = merged_rent_sale.loc[:, merged_rent_sale.columns.str.contains('sell')]

In [13]:
merged_rent.dropna(axis=1).head(3)

Unnamed: 0_level_0,rent_value_17_1,rent_value_20_1,rent_value_20_2,rent_value_20_3
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
California,1900.0,2100.0,2495.0,2895.0
Texas,1045.0,1200.0,1425.0,1550.0
New York,2150.0,2100.0,2300.0,2500.0


In [14]:
merged_sale.dropna(axis=1).head(3)

Unnamed: 0_level_0,sell_value_17_1,sell_value_20_1,sell_value_20_2,sell_value_20_3
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
California,347962.0,408468.0,475167.0,532205.0
Texas,96795.0,116946.0,136649.0,193582.0
New York,329115.0,369933.0,182737.0,195764.0


I've just realized, because of previously unnoticed missing data, that I will have to compare 1 bedroom houses for both the sold and rental market

In [15]:
merged_rent = merged_rent[['rent_value_17_1','rent_value_20_1']]
merged_sale = merged_sale[['sell_value_17_1','sell_value_20_1']]
merged_1br = merged_sale.merge(merged_rent, left_index= True, right_index=True)
merged_1br['sale_growth'] = merged_1br['sell_value_20_1'] / merged_1br['sell_value_17_1']
merged_1br['rent_growth'] = merged_1br['rent_value_20_1'] / merged_1br['rent_value_17_1']

##### New Data

In [16]:
merged_1br.to_csv('Final_Data/Further_Datasets/Rent_Sale_growth.csv')

# Time To Sell

In [17]:

z_time_sell = pd.read_csv('Datasets/House Price/DaysOnZillow_State.csv')

In [18]:
z_time_sell.head(1)

Unnamed: 0.1,Unnamed: 0,SizeRank,RegionID,RegionName,RegionType,2010-01,2010-02,2010-03,2010-04,2010-05,...,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02
0,0,1.0,9,California,State,108.0,115.0,107.0,102.0,89.0,...,56,57.0,60,62.0,65,65.0,64.0,71.0,79,69.0


'Days on Zillow-State' is another dataset that I will be using select data from. Currently it's a bit heavy-handed

#### Cleaned

In [19]:

z_time_sell =cld.get_state_days()
z_time_sell.head(5)

Unnamed: 0,State,Year,value
0,California,2017,84.0
1,Texas,2017,84.0
2,New York,2017,154.0
3,Florida,2017,103.0
4,Illinois,2017,126.0


The resulting datasets are much cleaner, much less visually cluttered.

#### Zscores

In [20]:
vld.run_zscore(z_time_sell, ['value'])

full length  102
sample size  6
df Z-Scores
value    0.565987
dtype: float64


State, Sex, Age Group values had the best zscores of the bunch. They were also the highest row-count. even with the data split across 4 columns

### Graphs

In [21]:

fig = px.bar(z_time_sell, x = 'State', y='value')
fig['layout'].update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'State=%{x}<br>value=%{y…

# City Population

In [22]:

z_pop16 = pd.read_csv('Datasets/Population/2016_Us_Population_By_City.csv')
z_pop20 = pd.read_csv('Datasets/Population/2020_Us_Population_By_City.csv')
z_pop16.head(1)

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016
0,40,1,0,0,0,0,0,A,Alabama,Alabama,4779736,4780131,4785492,4799918,4815960,4829479,4843214,4853875,4863300


In [23]:
z_pop20.head(1)

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,...,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE042020,POPESTIMATE2020
0,40,1,0,0,0,0,0,A,Alabama,Alabama,...,4816632,4831586,4843737,4854803,4866824,4877989,4891628,4907965,4920706,4921532


Because '2020_Us_Population_By_City' actually contains data for both 2017 and 2019, that will be my only dataset for population

#### Cleaned

In [24]:

z_pop17_20 =cld.get_population_data()
z_pop17_20.head(5)

Unnamed: 0,State,Year,value
0,Alabama,2017,17612945
1,Alabama,2020,17774905
2,Alaska,2017,2720405
3,Alaska,2020,2682488
4,Arizona,2017,26743312


#### Zscores

In [25]:
vld.run_zscore(z_pop17_20, ['value'])

full length  102
sample size  6
df Z-Scores
value   -0.469406
dtype: float64


#### Graphs

In [30]:

fig = px.bar(z_pop17_20, x = 'State', y='value',
            title = 'Sales Costs by State and Size(2020)',
            color='Year',
             barmode = 'group'
            )
fig['layout'].update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'Year=2017<br>State=%{x}…

There is not a massively difference between 2017 and 2020, percentage-wise.

#### Further Exploration

In [None]:
merged_loc =tfd.consolidate_sell_pop(z_time_sell, z_pop17_20)
m_17 = merged_loc[merged_loc['year'] == '2017']
m_20 = merged_loc[merged_loc['year'] == '2020']

growth_rate1 = m_17.merge(m_20, left_on=['state'], right_on=['state'])

growth_rate1['sell_time_change'] = growth_rate1['days_to_sell_y'] /  growth_rate1['days_to_sell_x']
growth_rate1['pop_growth'] = growth_rate1['population_y'] /  growth_rate1['population_x']
growth_rate1 = growth_rate1[['state', 'sell_time_change', 'pop_growth']]
growth_rate1.head(3)

from 2017 to 2020, these are the days to sell (under 1 means selling quicker, over 1 means market has slowed down) as well as population growth.

In [None]:
growth_rate1.std()

There is a much tighter range for deviation in our population growth column (10% sell_time vs almost 2% pop change)

In [None]:
growth_rate1 =tfd.append_zscores(growth_rate1, 'sell_time_change', 'sell_z')
growth_rate1 =tfd.append_zscores(growth_rate1, 'pop_growth', 'pop_growth_z')
fig =px.scatter(growth_rate1, x = 'sell_z', y='pop_growth_z',
            title =  'Deviation of saletime vs pop growth',
            hover_data =['state']
            )
fig['layout'].update(height=800, width = 1600)
fig_widget = go.FigureWidget(fig)
fig_widget

##### New Data

In [None]:
growth_rate1.to_csv('Final_Data/Further_Datasets/State_Growth_Rate.csv', index=False)

# State, Sex, Age and Degree Data

In [None]:

s_s_a_deg = pd.read_csv('Datasets/Regional Education/Bachelor_Degree_Majors.csv')
s_s_a_deg.head(1)

Data isn't totally  out of control, I'll mostly be cleaning

#### Cleaned

In [None]:

st_ag_deg = cld.state_agegroup_degree_majors()
st_ag_deg.head(5)

#### Zscores

In [None]:
vld.run_zscore(st_ag_deg, ['value'])

#### Graphs

In [None]:
fig =px.bar(st_ag_deg, x = 'State', y='value',
            title =  'Degree Counts By state and age',
            color= 'Field',
            range_color=[0,1000000]
            )
fig['layout'].update(height=800, width = 1600)
fig_widget = go.FigureWidget(fig)
fig_widget

as age increases, there are very clear decreases in degree counts. Texas was a suprisingly strong contender for the 40 to 64 crowd.

#### Further Exploration

In [None]:
loc_field_focus = tfd.group_state_degree_data(st_ag_deg)
loc_field_focus = tfd.add_state_region(loc_field_focus, 'State')
fig = px.pie(loc_field_focus, values='Total',color = 'Region',hover_data =[loc_field_focus['State']], title='Total Bachelors by State',
            height = 700, width = 700, hole = .1)
fig.update_layout(margin=dict(t=0, b=0, l=0, r=0))
fig.update_traces(textinfo='none')
fig.show()

Above is a pie chart, with bachelors degrees (total) by state

California, Texas and New York alone make up a quarter of all bachelors degrees.

These are also colorcoded by Region

In [None]:
loc_field_focus_reg = loc_field_focus.groupby('Region').sum().reset_index()

fig = px.pie(loc_field_focus_reg, values='Total', title='Total Bachelors by Region',
            height = 700, width = 700, hole = .1, color='Region')
fig.update_layout(margin=dict(t=0, b=0, l=0, r=0))
fig.show()

In [None]:
loc_field_focus.groupby('Region')['State'].count()

The south has nearly double the states of the Northeast.

In [None]:
loc_field_focus.groupby('Region')['Total'].mean()

The average amount of degrees held per state in a region is highest in the Northeastern Region.

In [None]:
loc_age_deg = tfd.group_age_degree_data(st_ag_deg)

In [None]:
loc_age_deg_bar = loc_age_deg[['Arts, Humanities and Others', 'Business', 'Education',
       'Sci_Eng_Related', 'Science and Engineering']]
fig = px.bar(loc_age_deg_bar, x = loc_age_deg_bar.index, y=loc_age_deg_bar.columns,
            title = 'Degrees by age group')

fig['layout'].update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

Observations:

* Science and engineering is always in demand
* the older crowd has the most prominent Education presence
* 40 to 64 see's the highest number of Bachelors degree holders(according to the original dataset)

In [None]:
loc_field_focus.to_csv('Final_Data/Further_Datasets/State_Bachelor_Counts.csv', index=False)
loc_age_deg.to_csv('Final_Data/Further_Datasets/Age_Bachelor_Counts.csv', index=False)