In [1]:
import pandas as pd

import Clean_data as cld
import Transform_Data as tfd
import Visualize_Data as vld
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [2]:
### Location
# ETL outputs (contents inferred from the file names only): Zillow rental prices,
# house prices and days-to-sell, plus state population and degree counts.
location_rental_prices = pd.read_csv('Final_Data/ETL/zillow_rental_prices.csv') 
location_house_prices = pd.read_csv('Final_Data/ETL/zillow_house_prices.csv') 
location_house_sell_time = pd.read_csv('Final_Data/ETL/zillow_days_to_sell.csv')
location_state_pop = pd.read_csv('Final_Data/ETL/state_population_counts.csv')
location_state_age_deg = pd.read_csv('Final_Data/ETL/state_agegroup_degree_majors.csv') 

# Pre-aggregated tables stored under Further_Datasets.
location_growth = pd.read_csv('Final_Data/Further_Datasets/State_Growth_Rate.csv')
age_bachelor = pd.read_csv('Final_Data/Further_Datasets/Age_Bachelor_Counts.csv')
rent_sale_Growth = pd.read_csv('Final_Data/Further_Datasets/Rent_Sale_growth.csv')

### Education
education_industry_counts = pd.read_csv('Final_Data/Manually Altered/Industry_counts_global.csv') 
education_deg_to_job = pd.read_csv('Final_Data/Manually Altered/degree_to_job_title_count.csv')  
education_deg_payback = pd.read_csv('Final_Data/ETL/deg_payback.csv')


# Bachelor-degree counts per state; indexed by 'State' in a later cell.
bachelor_counts = pd.read_csv('Final_Data/Further_Datasets/State_Bachelor_Counts.csv')
### Salary
salary_glassdoor_df = pd.read_csv('Final_Data/Manually Altered/glassdoor_best_jobs.csv') 
salary_demographics = pd.read_csv('Final_Data/ETL/state_demographics.csv') 

reg_salaries = pd.read_csv('Final_Data/Further_Datasets/Regional_Salaries.csv')

# variables
# Hourly wage, reported later in the notebook as the federal minimum.
min_wage = 7.25
# 2080 is presumably 52 weeks x 40 hours.
annual_min = min_wage * 2080
# NOTE(review): 160 hours is a 4-week month, so 12 * monthly_wage != annual_min — confirm this is intended.
monthly_wage = min_wage * 160

### Hypothesis
##### Can I categorize states based on several parameters, such as population, employment opportunities, available schooling, degree focus, and income? Using these categorizations, can I weight certain states based on their likelihood to favor a combination of degrees, salaries, and housing situations?

#### Question 1:
Which degrees are the most in demand *and* pay well

In [3]:
field = tfd.combine_demand(education_industry_counts, education_deg_to_job, education_deg_payback,location_state_age_deg)

In [4]:
field = field.set_index(keys='Field')

In [5]:
field

Unnamed: 0_level_0,Demand_Count,Degree_Fill_Count,start_salary,mid_salary,bachelor_count
Field,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Arts, Humanities and Others",800980,6837695.0,35840.0,58440.0,16986990.0
Business,3876780,8143799.0,43785.714286,79157.142857,15080708.0
Education,303751,3457434.0,36566.666667,63233.333333,8749977.0
Sci_Eng_Related,1577977,5446758.0,44678.571429,71871.428571,7311745.0
Science and Engineering,2402102,11943074.0,50962.5,87806.25,26372231.0


In [6]:
# Rank each numeric column across the fields (default ascending rank, so the
# largest value gets the largest rank), then total every field's ranks.
ranked_fields = field.rank(numeric_only=True)
ranked_fields.sum(axis='columns')

Field
Arts, Humanities and Others    11.0
Business                       19.0
Education                       8.0
Sci_Eng_Related                13.0
Science and Engineering        24.0
dtype: float64

First pass, Science/Engineering and Business are the top 2 positions.

In [7]:
# Natural log of every column, to compress the large count columns so they
# do not dominate the row sums.
field_log = field.apply(np.log)

In [8]:
field_log.sum(axis=1)

Field
Arts, Humanities and Others    67.442087
Business                       69.578463
Education                      65.226042
Sci_Eng_Related                67.477061
Science and Engineering        70.297072
dtype: float64

Even with the different factors more equally balanced, Science/Engineering and Business are still the two most prominent contenders.

They have the two highest demand counts, the two highest degree-to-job fill counts, and the two highest mid-career salaries (Business is narrowly third on starting salary and on total bachelor counts).

#### Question 2: 
Which areas are the most focused on high-paying degrees, pay the best salaries in those fields and are offset by having the lowest Cost of Living

###### 2.1
Focused on high paying fields?


In [9]:
bachelor_counts = bachelor_counts.set_index(keys='State')

In [10]:
# Per-state degree ratios from the project helper.
bachelor_ratio = tfd.get_bachelor_ratios(bachelor_counts)

# Five states with the largest 'Science and Engineering' ratio.
sci_eng_sorted = bachelor_ratio.sort_values(by='Science and Engineering', ascending=False)
top_5_sci_eng_states = sci_eng_sorted.head(5)
top_5_sci_eng_states

Unnamed: 0_level_0,"Arts, Humanities and Others",Business,Education,Sci_Eng_Related,Science and Engineering,Total,Total_z,Region
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
District of Columbia,0.299384,0.129762,0.034008,0.051793,0.485053,301429.0,-0.67534,Southern
Washington,0.23723,0.160159,0.096178,0.085141,0.421292,1955632.0,0.288228,Western
California,0.256252,0.182542,0.061555,0.082876,0.416775,9428484.0,4.64114,Western
Maryland,0.22597,0.18558,0.08884,0.090832,0.408777,1710230.0,0.145282,Southern
Massachusetts,0.242111,0.177689,0.084413,0.088397,0.407389,2181743.0,0.419937,Northeastern


In [11]:
# Five states with the largest 'Business' ratio.
business_sorted = bachelor_ratio.sort_values(by='Business', ascending=False)
top_5_bus_states = business_sorted.head(5)
top_5_bus_states

Unnamed: 0_level_0,"Arts, Humanities and Others",Business,Education,Sci_Eng_Related,Science and Engineering,Total,Total_z,Region
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Florida,0.20058,0.243118,0.129059,0.107355,0.319888,4753637.0,1.918057,Southern
Alabama,0.194429,0.237358,0.159338,0.111192,0.297682,885357.0,-0.335203,Southern
Georgia,0.212615,0.236904,0.12948,0.093616,0.327385,2301568.0,0.489735,Southern
Texas,0.201157,0.232093,0.118903,0.09563,0.352218,5776533.0,2.513891,Southern
South Carolina,0.206144,0.231953,0.150337,0.100857,0.310709,1054559.0,-0.236644,Southern


The top 5 states, sorted by **sci/eng** and **business**, are listed above. Remarkably, there is no overlap between them. To find the ideal for both/either, I'll need to create a combined column.

In [12]:
# Combined share of the two target degree categories per state.
bachelor_ratio['ideal_degree'] = bachelor_ratio['Science and Engineering'].add(bachelor_ratio['Business'])

In [13]:
ideal_zscores= tfd.append_zscores(bachelor_ratio, 'ideal_degree',  'ideal_degree_z')

In [14]:
ideal_zscores.sort_values(by='ideal_degree',ascending = False).head(10)

Unnamed: 0_level_0,"Arts, Humanities and Others",Business,Education,Sci_Eng_Related,Science and Engineering,Total,Total_z,Region,ideal_degree,ideal_degree_z
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
District of Columbia,0.299384,0.129762,0.034008,0.051793,0.485053,301429.0,-0.67534,Southern,0.614815,2.12983
California,0.256252,0.182542,0.061555,0.082876,0.416775,9428484.0,4.64114,Western,0.599317,1.712451
Colorado,0.232056,0.204843,0.087637,0.084549,0.390915,1695602.0,0.136762,Western,0.595758,1.616595
New Jersey,0.212099,0.216115,0.105105,0.088412,0.378269,2551765.0,0.635474,Northeastern,0.594384,1.579578
Maryland,0.22597,0.18558,0.08884,0.090832,0.408777,1710230.0,0.145282,Southern,0.594357,1.578867
Virginia,0.239727,0.183816,0.090535,0.079575,0.406347,2325070.0,0.503425,Southern,0.590163,1.465908
Massachusetts,0.242111,0.177689,0.084413,0.088397,0.407389,2181743.0,0.419937,Northeastern,0.585078,1.328951
Texas,0.201157,0.232093,0.118903,0.09563,0.352218,5776533.0,2.513891,Southern,0.58431,1.308274
Washington,0.23723,0.160159,0.096178,0.085141,0.421292,1955632.0,0.288228,Western,0.581451,1.231262
Hawaii,0.212802,0.201513,0.114296,0.103479,0.367911,335209.0,-0.655663,Western,0.569424,0.907345


I've applied deviations to the ideal degrees (Sci/eng and Business)

In [15]:
# Hold onto this for a little later: keep only the target-degree columns,
# the totals, the region and the combined score with its z-score.
ideal_keep_cols = ['Business', 'Science and Engineering', 'Total',
                   'Region', 'ideal_degree', 'ideal_degree_z']
ideal_zscores = ideal_zscores[ideal_keep_cols]

##### Results
Your best location, currently, is Washington, DC, and by a pretty large deviation from the norm, too (z ≈ 2.13).

9 of the top 10 states have an ideal-degree share more than one standard deviation above the mean (`ideal_degree_z` > 1), i.e. a more pronounced focus on the ideal degrees.

###### 2.2 Pay well, regionally?

In [16]:

reg_salaries = reg_salaries[['Region', 'Starting Median Salary', 'Mid-Career Median Salary']]

In [17]:
reg_salaries

Unnamed: 0,Region,Starting Median Salary,Mid-Career Median Salary
0,Midwestern,44225.352113,78180.28169
1,Northeastern,48496.0,91352.0
2,Southern,44521.518987,79505.063291
3,Western,47061.428571,84172.857143


In [18]:
print('Annual wage at Federal Minimum',annual_min)

Annual wage at Federal Minimum 15080.0


With a college degree, the median starting salary is on average about three times what you would earn at the federal minimum wage.

In [19]:
# Express each regional median salary as a multiple of the annual minimum wage.
# .assign builds a new frame instead of writing into the column-selected frame
# from the previous cell, which avoids SettingWithCopyWarning and keeps this
# cell safe to re-run.
reg_salaries = reg_salaries.assign(
    start_over_min=reg_salaries['Starting Median Salary'] / annual_min,
    mid_over_min=reg_salaries['Mid-Career Median Salary'] / annual_min,
)

In [20]:
reg_salaries.sort_values('mid_over_min', ascending = False)

Unnamed: 0,Region,Starting Median Salary,Mid-Career Median Salary,start_over_min,mid_over_min
1,Northeastern,48496.0,91352.0,3.215915,6.057825
3,Western,47061.428571,84172.857143,3.120784,5.581754
2,Southern,44521.518987,79505.063291,2.952355,5.272219
0,Midwestern,44225.352113,78180.28169,2.932716,5.184369


##### Results
Mid-Career though, you will earn between 5 and 6 times the minimum wage, depending on your region, with the Northeastern Region at the top with a mid-career salary averaging at 91k

###### 2.3 Low cost of living? Hot Selling Market?

In [21]:
# Keep the price and growth columns, renaming them to the names used downstream.
zillow_cols = ['state', 'sell_value_20_1', 'rent_value_20_1', 'sale_growth', 'rent_growth']
readable_names = {'state': 'State',
                  'sell_value_20_1': 'sell_value_2020',
                  'rent_value_20_1': 'rent_value_2020'}
rent_sale_Growth = rent_sale_Growth[zillow_cols].rename(columns=readable_names)
rent_sale_Growth = tfd.add_state_region(rent_sale_Growth, 'State').set_index('State')

## Rank
# rank(numeric_only=True) leaves out the string 'Region' column, so the region
# is attached again after 'State' is restored as a regular column.
state_ranks = rent_sale_Growth.rank(numeric_only=True).reset_index()
rent_sale_Growth_rank = tfd.add_state_region(state_ranks, 'State')

In [22]:
rent_sale_Growth.head(3)

Unnamed: 0_level_0,sell_value_2020,rent_value_2020,sale_growth,rent_growth,Region
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
California,408468.0,2100.0,1.173887,1.105263,Western
Texas,116946.0,1200.0,1.208182,1.148325,Southern
New York,369933.0,2100.0,1.124024,0.976744,Northeastern


In [23]:
# Pairwise correlations of the numeric columns only. The string 'Region' column
# is excluded explicitly, so this also works on pandas versions where corr()
# no longer drops non-numeric columns silently.
rent_sale_Growth.select_dtypes(include='number').corr()

Unnamed: 0,sell_value_2020,rent_value_2020,sale_growth,rent_growth
sell_value_2020,1.0,0.841705,-0.067872,-0.155105
rent_value_2020,0.841705,1.0,-0.103592,-0.114423
sale_growth,-0.067872,-0.103592,1.0,0.579879
rent_growth,-0.155105,-0.114423,0.579879,1.0


These aren't the strongest signals, but I do find it valuable to note that as sell value increases, sale growth barely drops (r ≈ -0.07; the market stays relatively stable regardless of price level), while rent growth falls off more noticeably: its correlation with sell value (r ≈ -0.16) is more than twice as negative as that of sale growth.

The signals are steadier for rent value, whose correlation with both sale growth and rent growth sits at roughly -0.10 to -0.11.

In [24]:
# Scatter the per-state ranks of 2020 sale value against sale growth, coloured
# by region; the hover box carries the unranked values and the state name.
ranked_prices = rent_sale_Growth.rank()
raw_hover = {
    'sell value raw': rent_sale_Growth['sell_value_2020'],
    'sell growth raw': rent_sale_Growth['sale_growth'],
    'state': rent_sale_Growth.index,
}
fig = px.scatter(
    ranked_prices,
    x='sell_value_2020',
    y='sale_growth',
    color=rent_sale_Growth['Region'],
    title='Ranked Growth and Sell Values',
    trendline='ols',
    hover_data=raw_hover,
)
fig.update_layout(height=600)
fig.update_traces(marker={'size': 30})
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'customdata': array([[408468.0, 1.1738868037314418, 'California'],
              …

This graph helps highlight some of the macro-trends at play here.
* Southern States snag both the cheapest and most expensive slots for sales, while also seeing some of the largest and smallest growth numbers.
* Western States were all on the higher end of sale value, with their lowest ranked (Nevada) also being the number one for growth.
* Northeastern States also trended towards the top of sales, though their sales growth never got quite as high in the rankings.
* Midwestern States were predominantly the lower-valued sector, though growth was still well dispersed.

In [25]:
# Average every price/growth measure within each region.
regional_growth = rent_sale_Growth.groupby(by='Region').agg('mean')
regional_growth

Unnamed: 0_level_0,sell_value_2020,rent_value_2020,sale_growth,rent_growth
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Midwestern,103056.083333,884.541667,1.17208,1.111737
Northeastern,205742.222222,1493.222222,1.148229,1.164021
Southern,144647.0,1148.705882,1.154743,1.129414
Western,235744.923077,1213.615385,1.222027,1.168555


Looking at the raw data, I was able to see some regional trends:
* The cheapest homes are located in the Midwest, and by a large margin.
* Despite Western states having the highest sell value, their rent is still roughly 280 dollars lower than in Northeastern states.
* Sale growth was most pronounced in Western states.

In [26]:
# Mean state rank per region. numeric_only=True keeps the string 'State'
# column out of the mean explicitly instead of relying on pandas dropping it.
rent_sale_Growth_rank.groupby('Region').mean(numeric_only=True)

Unnamed: 0_level_0,sell_value_2020,rent_value_2020,sale_growth,rent_growth
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Midwestern,12.916667,15.791667,26.916667,23.583333
Northeastern,33.666667,37.666667,21.333333,30.222222
Southern,21.823529,25.529412,23.647059,24.941176
Western,38.230769,27.961538,31.461538,26.692308


To verify what I was seeing in the above chart and my ranked graph, I used a state-ranked, and then region-grouped chart to see the mean rank in each of the 4 regions. 
* Sales: Top Sell value was Western, Cheapest was Midwestern.
* Rent:  Top Rent was Northeastern, Cheapest was Midwestern.
* Sale Growth: Top growth was Western, Lowest Growth was Northeastern
* Rent Growth: Top growth was Northeastern, Lowest was Midwestern.


In [27]:
# Positional rename: assumes the CSV has exactly these five columns in this order — TODO confirm against State_Growth_Rate.csv.
location_growth.columns = ['State', 'sell_time_change', 'pop_growth', 'sell_z', 'pop_growth_z']
# Attach each state's region via the project helper (later cells read a 'Region' column from its result).
location_growth = tfd.add_state_region(location_growth, 'State')

Location growth, to be clear, tracks the population change in an area, as well as the rapidity of the housing market. Lower time on market should indicate more interest in an area 

In [28]:
# Correlate the numeric columns only. 'State' and 'Region' are strings and are
# excluded explicitly rather than relying on corr() dropping them silently.
location_growth.select_dtypes(include='number').corr()

Unnamed: 0,sell_time_change,pop_growth,sell_z,pop_growth_z
sell_time_change,1.0,0.101559,1.0,0.101559
pop_growth,0.101559,1.0,0.101559,1.0
sell_z,1.0,0.101559,1.0,0.101559
pop_growth_z,0.101559,1.0,0.101559,1.0


Similar to sell/rent values, there is not much correlation (r ≈ 0.10) between sell-time change and population growth.

In [29]:
# Scatter the per-state ranks of sell-time change against population growth.
# The colour comes from location_growth's own 'Region' column so that it is
# guaranteed to line up row-for-row with the plotted frame. The earlier version
# borrowed it from rent_sale_Growth, a different frame indexed by State, which
# was only correct if both frames happened to list the states in the same order.
ranked_location_growth = location_growth.rank()
raw_hover = {
    'sell time raw': location_growth['sell_time_change'],
    'pop growth raw': location_growth['pop_growth'],
    'state': location_growth['State'],
}
fig = px.scatter(
    ranked_location_growth,
    x='sell_time_change',
    y='pop_growth',
    color=location_growth['Region'],
    title='Ranked Pop Growth and Sell Times',
    trendline='ols',
    hover_data=raw_hover,
)
fig.update_layout(height=600)
fig.update_traces(marker={'size': 30})
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'customdata': array([[0.9404761904761904, 1.0008489347047242, 'California'],
    …

In [30]:
# Index by state, rank the numeric columns, then restore 'State' as a column
# so the region (left out of the numeric-only rank) can be attached again.
location_growth = location_growth.set_index('State')
numeric_ranks = location_growth.rank(numeric_only=True)
location_growth_rank = tfd.add_state_region(numeric_ranks.reset_index(), 'State')

In [31]:
# Mean state rank per region. numeric_only=True keeps the string 'State'
# column out of the mean explicitly instead of relying on pandas dropping it.
location_growth_rank.groupby('Region').mean(numeric_only=True)

Unnamed: 0_level_0,sell_time_change,pop_growth,sell_z,pop_growth_z
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Midwestern,30.041667,22.5,30.041667,22.5
Northeastern,17.444444,14.888889,17.444444,14.888889
Southern,21.882353,29.411765,21.882353,29.411765
Western,33.576923,32.461538,33.576923,32.461538


Compared to rent and house sales :
* Southern States are once again all over the board.
* Western States saw the highest ranked sell time uptick, and population growth
* Northeastern States saw the lowest population growth, and a wide spread of sell_time changes
* MidWestern States were relatively middle of the pack for both population growth and sell time

In [32]:
# Trim both rank tables to the columns of interest and join them on state.
growth_rank_cols = ['State', 'sell_time_change', 'pop_growth', 'Region']
price_rank_cols = ['State', 'sell_value_2020', 'rent_value_2020', 'sale_growth', 'rent_growth']

location_growth_rank = location_growth_rank[growth_rank_cols]
rent_sale_Growth_rank = rent_sale_Growth_rank[price_rank_cols]
merg_rank_df = pd.merge(location_growth_rank, rent_sale_Growth_rank, on='State')

In [33]:
# Correlate the rank columns only. 'State' and 'Region' are strings and are
# excluded explicitly rather than relying on corr() dropping them silently.
merg_rank_df.select_dtypes(include='number').corr()

Unnamed: 0,sell_time_change,pop_growth,sell_value_2020,rent_value_2020,sale_growth,rent_growth
sell_time_change,1.0,0.045251,0.209693,0.084614,-0.190416,-0.338975
pop_growth,0.045251,1.0,0.130588,0.028784,0.556742,0.308145
sell_value_2020,0.209693,0.130588,1.0,0.737209,-0.047602,-0.127964
rent_value_2020,0.084614,0.028784,0.737209,1.0,-0.05268,-0.030775
sale_growth,-0.190416,0.556742,-0.047602,-0.05268,1.0,0.538552
rent_growth,-0.338975,0.308145,-0.127964,-0.030775,0.538552,1.0


In [34]:
# Scatter the per-state ranks of sale growth against population growth.
# Fixes relative to the earlier version of this cell:
#   * colour now comes from merg_rank_df's own 'Region' column, so it always
#     lines up with the plotted rows (it used to come from rent_sale_Growth);
#   * the title names the plotted x variable (sale growth, not sell times);
#   * hover labels say "rank", because merg_rank_df holds ranks, not raw values.
ranked_merged = merg_rank_df.rank()
rank_hover = {
    'sell time rank': merg_rank_df['sell_time_change'],
    'pop growth rank': merg_rank_df['pop_growth'],
    'state': merg_rank_df['State'],
}
fig = px.scatter(
    ranked_merged,
    x='sale_growth',
    y='pop_growth',
    color=merg_rank_df['Region'],
    title='Ranked Pop Growth and Sale Growth',
    trendline='ols',
    hover_data=rank_hover,
)
fig.update_layout(height=600)
fig.update_traces(marker={'size': 30})
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'customdata': array([[41.0, 14.0, 'California'],
                                …

Pronounced population growth was correlated with:
* a minimal positive impact on sale time (r ≈ 0.05)
* a 13% correlation with sell value
* a minimal impact on rent value (r ≈ 0.03)
* a 56% correlation with sale growth (price of houses jumped drastically)
* a 31% correlation with rent growth

To answer my hot selling market question (2.3) (Does the price of homes/rent severely impact peoples willingness to move there):
* Sell value was not a deciding factor on whether population increased in an area, however the areas that did see a population boom also saw a measured increase in rent and house costs; Houses did not sell remarkably faster, though, because of the population growth. 

In conclusion, based on 2017 vs 2020, the time to sell hardly changed, but the faster an area's population grew, the more sellers were able to charge the incoming population.

This lines up with **Supply and Demand**. In areas of rapid growth, as population began to see an uptick, there was more competition (demand) for houses (the limited supply).

##### Results

In conclusion
* Areas of lower population with high potential growth make for good investments. Areas that are already well populated/plateaued will likely not see a drastic increase in sales value (13% correlation), but the houses there will still sell at roughly the same speed (10% correlation).
* The cheapest place to live regionally is the Midwest, hands down; however, on a state level the data is much less clear-cut.

#### Question 3: 
On a regional level, does the data provided by aggregating states indicate the best starting point for choosing a long-term living location? Furthermore, do any states offer a respectable trinity of affordability, opportunity, and education?

For this section, I am going to create a table, merged from many other tables, to help visually pair information about each degree category, job category, and state (and region). I will then divide that one table into 3 more informative ones, whose z-scores I will sum programmatically to address my hypothesis under 3 overarching themes:
* Education: How much weight does any particular degree lend to securing a good salary, regardless of location?
* Salary: Whether or not the worker is able to get a remote position, does a degree have a statistically significant likelihood of a good average salary?
* Affordable Location: Which areas are affordable, or, said another way, offer salaries to offset the rent? Additionally, which areas are likely to appreciate in value significantly over time?


#### Education

Datasets and topical metrics
* education_deg_to_job [category, count]
* salary_glassdoor_df [category, satisfaction]
* education_industry_counts [category, count]
* location_state_age_deg [ field, value]


In [35]:
# Total degree-to-job count per category, then its z-score as 'd2j_score'.
degree_job_totals = education_deg_to_job.groupby('category')['count'].sum()
edu_cat_ct = tfd.append_zscores(degree_job_totals.reset_index(), 'count', 'd2j_score')
    

In [36]:
edu_cat_ct

Unnamed: 0,category,count,d2j_score
0,"Arts, Humanities and Others",6837695.0,-0.102979
1,Business,8143799.0,0.307016
2,Education,3457434.0,-1.164068
3,Sci_Eng_Related,5446758.0,-0.539605
4,Science and Engineering,11943074.0,1.499636


In [37]:
# Mean Glassdoor satisfaction per category, then its z-score as 'sat_score'.
mean_satisfaction = salary_glassdoor_df.groupby('category')['satisfaction'].mean()
edu_gd_sat = tfd.append_zscores(mean_satisfaction.reset_index(), 'satisfaction', 'sat_score')

In [38]:
edu_gd_sat

Unnamed: 0,category,satisfaction,sat_score
0,"Arts, Humanities and Others",3.933333,1.296194
1,Business,3.889655,0.720981
2,Education,3.8,-0.459719
3,Sci_Eng_Related,3.742857,-1.212254
4,Science and Engineering,3.808696,-0.345203


In [39]:
# Total industry count per category, then its z-score as 'indus_score'.
# Only 'Count' is summed: summing the whole frame relied on pandas silently
# dropping any non-numeric columns, which newer versions no longer do.
edu_ind_ct = education_industry_counts.groupby('category')['Count'].sum().reset_index()
edu_ind_ct = tfd.append_zscores(edu_ind_ct, 'Count', 'indus_score')

In [40]:
edu_ind_ct

Unnamed: 0,category,Count,indus_score
0,"Arts, Humanities and Others",800980,-0.702682
1,Business,3876780,1.477511
2,Education,303751,-1.055128
3,Sci_Eng_Related,1577977,-0.151929
4,Science and Engineering,2402102,0.432228


In [41]:
# Total degree count ('value') per field, then its z-score as 'field_score'.
# Only 'value' is summed: the frame also carries string columns such as 'State',
# and summing the whole frame relied on pandas dropping them silently.
edu_field_ct = location_state_age_deg.groupby('Field')['value'].sum().reset_index()
edu_field_ct = tfd.append_zscores(edu_field_ct, 'value', 'field_score')

In [42]:
edu_field_ct

Unnamed: 0,Field,value,field_score
0,"Arts, Humanities and Others",16986990.0,0.274344
1,Business,15080708.0,0.023715
2,Education,8749977.0,-0.808619
3,Sci_Eng_Related,7311745.0,-0.997711
4,Science and Engineering,26372231.0,1.508271


In [43]:
# Outer-join the four per-category score tables. The last table names its
# key 'Field', so it is matched against 'category' explicitly.
grp_df = (
    edu_cat_ct
    .merge(edu_gd_sat, on='category', how='outer')
    .merge(edu_ind_ct, on='category', how='outer')
    .merge(edu_field_ct, left_on='category', right_on='Field', how='outer')
)

In [44]:
grp_df = grp_df[['category', 'd2j_score', 'sat_score',  'indus_score', 'field_score']]

In [45]:
grp_df = grp_df.set_index('category')
grp_df

Unnamed: 0_level_0,d2j_score,sat_score,indus_score,field_score
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Arts, Humanities and Others",-0.102979,1.296194,-0.702682,0.274344
Business,0.307016,0.720981,1.477511,0.023715
Education,-1.164068,-0.459719,-1.055128,-0.808619
Sci_Eng_Related,-0.539605,-1.212254,-0.151929,-0.997711
Science and Engineering,1.499636,-0.345203,0.432228,1.508271


In [46]:
# Grouped bars: one bar per z-score column for every degree category.
education_score_cols = ['d2j_score', 'sat_score', 'indus_score', 'field_score']
fig = px.bar(
    grp_df,
    x=grp_df.index,
    y=education_score_cols,
    title='Education Scores (z-Score sum)',
    barmode='group',
)
fig.update_layout(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'variable=d2j_score<br>c…

Now, with z-scores, I should be able to sum ratios, and have the result reflect a fair score for each Degree category

In [47]:
Education_Scores = grp_df.sum(axis=1)

In [48]:
Education_Scores.sort_values(ascending = False)

category
Science and Engineering        3.094932
Business                       2.529224
Arts, Humanities and Others    0.764877
Sci_Eng_Related               -2.901499
Education                     -3.487534
dtype: float64

Science and Engineering degrees (top score, 3.09) sit roughly 6.6 z-score points above Education degrees (bottom score, -3.49) based on
* Degrees To Job Counts
* Satisfaction (Glassdoor)
* Categorical Industry Counts
* Degrees (count) being offered in the field


#### Salary

In [49]:
# Mean Glassdoor salary per category, then its z-score as 'salary_score'.
mean_salary = salary_glassdoor_df.groupby('category')['salary'].mean()
salary_gd_df = tfd.append_zscores(mean_salary.reset_index(), 'salary', 'salary_score')
salary_gd_df

Unnamed: 0,category,salary,salary_score
0,"Arts, Humanities and Others",87659.333333,0.402144
1,Business,89505.206897,0.623775
2,Education,70000.0,-1.718179
3,Sci_Eng_Related,84204.761905,-0.01264
4,Science and Engineering,90180.869565,0.7049


In [50]:
# Mean starting and mid-career median salary per degree category, each with a
# z-score column. The two salary columns are selected explicitly: averaging
# the whole frame relied on pandas silently dropping non-numeric columns,
# which newer versions no longer do.
payback_salary_cols = ['Starting Median Salary', 'Mid-Career Median Salary']
edu_deg_pb = education_deg_payback.groupby('Category')[payback_salary_cols].mean().reset_index()
edu_deg_pb = tfd.append_zscores(edu_deg_pb, 'Starting Median Salary', 'start_salary')
edu_deg_pb = tfd.append_zscores(edu_deg_pb, 'Mid-Career Median Salary', 'mid_salary')
edu_deg_pb

Unnamed: 0,Category,Starting Median Salary,Mid-Career Median Salary,start_salary,mid_salary
0,"Arts, Humanities and Others",35840.0,58440.0,-1.04017,-1.152813
1,Business,43785.714286,79157.142857,0.226152,0.595367
2,Education,36566.666667,63233.333333,-0.92436,-0.748336
3,Sci_Eng_Related,44678.571429,71871.428571,0.368448,-0.019425
4,Science and Engineering,50962.5,87806.25,1.369929,1.325207


In [51]:
grp_df = salary_gd_df.merge(edu_deg_pb, left_on='category', right_on='Category')

In [52]:
# One combined salary score per category: the row-wise sum of the three z-scores.
salary_score_cols = ['salary_score', 'start_salary', 'mid_salary']
Salary_Scores = grp_df.set_index('category')[salary_score_cols].sum(axis=1)


#### Location

Datasets and topical metrics
* location_growth [sell_time_change, pop_growth]
* rent_sale_Growth [state, sell_value_2020, rent_value_2020, sale_growth, rent_growth]
* salary_demographics [state, hs_degree, debt, rent_gt_25, rent_gt_50, home_equity]
* location_state_age_deg [state, value(groupby state)]
* location_state_pop [State, year(2020), value]
* bachelor_ratio [ state, degree_ratios]

In [53]:
# Keep the two growth measures and append a z-score column for each.
growth_score_names = {'sell_time_change': 'sell_time_score',
                      'pop_growth': 'pop_gr_score'}
location_growth = location_growth[list(growth_score_names)]
for raw_col, score_col in growth_score_names.items():
    location_growth = tfd.append_zscores(location_growth, raw_col, score_col)
# 'State' was set as the index in an earlier cell; restore it as a column for the later merges.
location_growth = location_growth.reset_index()

In [54]:
# Keep the four price/growth measures and append a z-score column for each.
price_score_names = {'sell_value_2020': 'sell_v_score',
                     'rent_value_2020': 'rent_v_score',
                     'sale_growth': 'sale_g_score',
                     'rent_growth': 'rent_g_score'}
rent_sale_Growth = rent_sale_Growth[list(price_score_names)]
for raw_col, score_col in price_score_names.items():
    rent_sale_Growth = tfd.append_zscores(rent_sale_Growth, raw_col, score_col)
# 'State' was set as the index in an earlier cell; restore it as a column for the later merges.
rent_sale_Growth = rent_sale_Growth.reset_index()

In [55]:
# Keep the demographic columns of interest. .copy() makes the result an
# independent frame, so the assignments below do not write into a slice of
# the loaded table (which triggers SettingWithCopyWarning).
demographic_cols = ['state', 'hs_degree', 'debt', 'rent_gt_25', 'rent_gt_50', 'home_equity']
salary_demographics = salary_demographics[demographic_cols].copy()

# Reverse polarity (1 - x) so that a low debt / rent-burden value yields a high
# z-score. This assumes the three columns are fractions in [0, 1] — TODO confirm.
# NOTE(review): 'debt' is reversed here but no debt z-score is appended below —
# confirm whether that omission is intended.
for burden_col in ['debt', 'rent_gt_25', 'rent_gt_50']:
    salary_demographics[burden_col] = 1 - salary_demographics[burden_col]

salary_demographics = tfd.append_zscores(salary_demographics, 'hs_degree', 'highs_score')
salary_demographics = tfd.append_zscores(salary_demographics, 'rent_gt_25', 'rent_u25_score')
salary_demographics = tfd.append_zscores(salary_demographics, 'rent_gt_50', 'rent_u50_score')
salary_demographics = tfd.append_zscores(salary_demographics, 'home_equity', 'h_eq_score')

In [56]:
# Mean 'value' per state, then its z-score stored as 'avg_salary_score'.
state_mean_value = location_state_age_deg.groupby('State')['value'].mean()
location_state_age_deg = tfd.append_zscores(state_mean_value.reset_index(), 'value', 'avg_salary_score')

In [57]:
# Keep the 2020 rows and the two columns needed. .copy() makes the result an
# independent frame, so adding 'rev_value' does not write into a filtered slice.
is_2020 = location_state_pop['Year'] == 2020
location_state_pop = location_state_pop.loc[is_2020, ['State', 'value']].copy()

# reversing polarity so smaller populations are the dominant factor (based on my Location exploration findings)
# Series.max() skips missing values, whereas the builtin max() gives an
# order-dependent result when NaN is present.
location_state_pop['rev_value'] = location_state_pop['value'].max() - location_state_pop['value']
location_state_pop = tfd.append_zscores(location_state_pop, 'rev_value', 'overpop_score')

In [58]:
bachelor_ratio= bachelor_ratio[['Arts, Humanities and Others', 'Business', 'Education',
       'Sci_Eng_Related', 'Science and Engineering']]

In [59]:
# Append one z-score column per degree category.
degree_score_names = [
    ('Arts, Humanities and Others', 'AHO_score'),
    ('Business', 'BUS_score'),
    ('Education', 'EDU_score'),
    ('Sci_Eng_Related', 'SER_score'),
    ('Science and Engineering', 'SCE_score'),
]
for degree_col, score_col in degree_score_names:
    bachelor_ratio = tfd.append_zscores(bachelor_ratio, degree_col, score_col)


In [60]:
# Scale the summed education z-scores by their Euclidean (L2) norm, then shift by 1.
norm = np.linalg.norm(Education_Scores)
Education_Scores_n = 1 + (Education_Scores/norm)
Education_Scores_n
# Each score/norm term lies in [-1, 1], so the shifted weights are non-negative
# and the spread between categories is softened.


category
Arts, Humanities and Others    1.125504
Business                       1.415005
Education                      0.427752
Sci_Eng_Related                0.523911
Science and Engineering        1.507829
dtype: float64

In [61]:
# Scale the summed salary z-scores by their Euclidean (L2) norm, then shift by 1.
norm = np.linalg.norm(Salary_Scores)
Salary_Scores_n = 1 + (Salary_Scores/norm)
Salary_Scores_n
# Same treatment as the education scores: each score/norm term lies in [-1, 1],
# so the shifted weights are non-negative.

category
Arts, Humanities and Others    0.664353
Business                       1.270883
Education                      0.364467
Sci_Eng_Related                1.063046
Science and Engineering        1.637250
dtype: float64

In [62]:
def weight_degrees(df, bachelor_col, edu_score, sal_score):
    """Weight one degree-score column by its education and salary weights.

    The column is divided by its own Euclidean (L2) norm and the two scalar
    weights are added to every row:

        new = sal_score + edu_score + df[bachelor_col] / ||df[bachelor_col]||

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``bachelor_col``. It is NOT modified; a copy is
        returned so the caller's frame stays intact.
    bachelor_col : str
        Name of the numeric column to re-weight.
    edu_score : float
        Normalized education weight for this degree category.
    sal_score : float
        Normalized salary weight for this degree category.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``bachelor_col`` replaced by the weighted values;
        all other columns, the column order and the index are unchanged.
    """
    norm = np.linalg.norm(df[bachelor_col])
    # Adding together two normalized weights plus the normalized column.
    weighted = sal_score + edu_score + (df[bachelor_col] / norm)

    # Work on a copy: overwriting the column on the caller's frame in place
    # makes re-running the calling cell compound the weights silently.
    result = df.copy()
    result[bachelor_col] = weighted
    return result

In [63]:

# Score column -> degree category whose education/salary weights apply to it.
degree_category_by_score_col = {
    'AHO_score': 'Arts, Humanities and Others',
    'BUS_score': 'Business',
    'EDU_score': 'Education',
    'SER_score': 'Sci_Eng_Related',
    'SCE_score': 'Science and Engineering',
}

# Look the weights up by category label instead of by integer position.
# Positional `series[0]` on a string-indexed Series is deprecated in recent
# pandas, and it silently pairs the wrong weights with a column if the score
# Series are ever in a different order (e.g. after they are re-sorted for the
# bar charts later in the notebook and these cells are re-run).
for score_col, degree_category in degree_category_by_score_col.items():
    bachelor_ratio = weight_degrees(
        bachelor_ratio,
        score_col,
        Education_Scores_n[degree_category],
        Salary_Scores_n[degree_category],
    )

# Move the index back into a regular column for plotting and merging.
bachelor_ratio = bachelor_ratio.reset_index()

In [64]:
# Grouped bar chart: one cluster per State, one bar per degree-score column.
fig = px.bar(
    bachelor_ratio,
    x='State',
    y=['AHO_score', 'BUS_score', 'EDU_score', 'SER_score', 'SCE_score'],
    barmode='group',
    title='Degree-Focus Scores (z-Score)',
)
fig.layout.update(height=600)
# Wrap in a FigureWidget so the chart renders as an interactive widget.
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'variable=AHO_score<br>S…

In [65]:
# Build one wide per-state frame by inner-joining every scored dataset on the
# state name. salary_demographics keys on lowercase 'state'; the rest use
# 'State'. Inner joins mean a state missing from any one input is dropped.
grp_df = (
    location_growth
    .merge(rent_sale_Growth, how='inner', on='State')
    .merge(salary_demographics, how='inner', left_on='State', right_on='state')
    .merge(location_state_age_deg, how='inner', on='State')
    .merge(location_state_pop, how='inner', on='State')
    .merge(bachelor_ratio, how='inner', on='State')
)

In [66]:
# Index by State and keep only the score columns that feed the final
# per-state location score; every other merged column is discarded.
grp_df = grp_df.set_index('State').loc[
    :,
    [
        'sell_time_score',
        'pop_gr_score',
        'sell_v_score',
        'rent_v_score',
        'sale_g_score',
        'rent_g_score',
        'highs_score',
        'rent_u25_score',
        'rent_u50_score',
        'h_eq_score',
        'avg_salary_score',
        'overpop_score',
        'AHO_score',
        'BUS_score',
        'EDU_score',
        'SER_score',
        'SCE_score',
    ],
]
grp_df

Unnamed: 0_level_0,sell_time_score,pop_gr_score,sell_v_score,rent_v_score,sale_g_score,rent_g_score,highs_score,rent_u25_score,rent_u50_score,h_eq_score,avg_salary_score,overpop_score,AHO_score,BUS_score,EDU_score,SER_score,SCE_score
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
California,0.547836,-0.630217,2.617332,2.225856,-0.010723,-0.3371,-1.570775,-1.638142,-1.482109,1.006875,4.64114,-4.154968,1.959596,2.632341,0.50613,1.408413,3.376081
Texas,2.085824,1.240328,-0.567123,0.085856,0.381749,0.065284,-1.767097,0.285358,0.364697,-1.714298,2.513891,-2.77304,1.665336,2.865215,0.725641,1.518837,3.17718
New York,0.235937,-1.304683,2.196393,2.225856,-0.581351,-1.538014,-0.450317,-0.913737,-1.57652,0.150102,2.158384,-2.119508,2.024919,2.652205,0.674715,1.494744,3.158958
Florida,0.078055,1.132966,-0.355359,0.559034,-0.202363,-0.872742,-0.211771,-1.657454,-1.461272,0.004593,1.918057,-1.648871,1.662257,2.917033,0.764518,1.620343,3.077573
Illinois,0.350658,-1.441229,-0.270286,0.549523,-0.791744,-1.287093,-0.166482,-0.153863,-0.496319,0.329944,0.960045,-1.550732,1.820129,2.775345,0.733548,1.554301,3.127817
Pennsylvania,-0.255404,-0.772056,-0.467194,0.085856,-0.421622,0.83566,0.408666,-0.166077,-0.257531,0.471364,0.848456,-1.080888,1.798058,2.672195,0.79919,1.655846,3.119199
Ohio,-0.500844,-0.554095,-1.022438,-0.867632,0.713391,0.016097,0.189765,0.134771,-0.155745,0.709359,0.521782,-1.250324,1.719963,2.778127,0.84096,1.70933,3.042149
Michigan,0.098271,-0.803399,-0.612183,-0.641744,1.306613,0.700704,0.297244,-0.759119,-1.325327,0.250224,0.35531,-0.798994,1.706607,2.769693,0.777251,1.658835,3.124632
Georgia,-0.240875,0.763958,0.191889,0.47819,0.983843,0.378249,-0.784366,-0.533468,-0.68039,0.062113,0.489735,-0.357894,1.726535,2.887829,0.766128,1.501398,3.10067
North Carolina,-0.723961,0.99885,-0.434314,0.085856,0.534552,1.015358,-0.500923,-0.084656,-0.066645,0.527738,0.501162,-0.367519,1.820697,2.729494,0.733834,1.56056,3.155091


In [67]:
# Overall location score per state: the row-wise sum of all score columns.
Location_Scores = grp_df.sum(axis='columns')

In [68]:
# Display the states ranked from highest to lowest total score
# (display only -- Location_Scores itself is not reordered here).
Location_Scores.sort_values(ascending=False)

State
Idaho                   20.528911
Utah                    19.091127
District of Columbia    17.338292
Colorado                16.782932
Nevada                  15.853514
Washington              14.998831
Massachusetts           14.981476
Montana                 13.635328
Minnesota               13.377026
Arizona                 13.359010
New Hampshire           13.085507
North Dakota            12.909043
Wyoming                 12.671249
South Dakota            12.099619
Oregon                  11.807379
Virginia                11.781018
North Carolina          11.485173
Hawaii                  11.181527
Nebraska                11.161013
California              11.097566
Georgia                 10.733544
Texas                   10.153638
Wisconsin               10.142796
Maine                    9.978517
Iowa                     9.807456
Tennessee                9.711642
Maryland                 9.635004
Pennsylvania             9.273718
Alaska                   9.216940
Rhode Is

Only 4 states actually had a negative score.
I'm glad to see that, across a range of about 20 points, the programmatically generated weights stayed relatively tight, with no outliers.

## Final Assessments

In [69]:
# Rank the salary scores high-to-low (this rebinds Salary_Scores to the sorted
# Series), then draw one bar per degree category, coloured by category.
Salary_Scores = Salary_Scores.sort_values(ascending=False)
fig = px.bar(
    Salary_Scores,
    color=Salary_Scores.index,
    title='Salary Scores',
)
fig.layout.update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'category=%{x}<br>value=…

In [70]:
# Rank the education scores high-to-low (this rebinds Education_Scores to the
# sorted Series), then draw one bar per degree category, coloured by category.
Education_Scores = Education_Scores.sort_values(ascending=False)
fig = px.bar(
    Education_Scores,
    color=Education_Scores.index,
    title='Education Scores',
)
fig.layout.update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'category=%{x}<br>value=…

In [71]:
# Rank the per-state location scores high-to-low (this rebinds Location_Scores
# to the sorted Series), then draw one bar per state, coloured by state.
Location_Scores = Location_Scores.sort_values(ascending=False)
fig = px.bar(
    Location_Scores,
    color=Location_Scores.index,
    title='Location Scores',
)
fig.layout.update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'State=%{x}<br>value=%{y…

In [72]:
# Turn the score Series into a frame with a State column, let
# tfd.add_state_region tag each row, and index by State again.
# NOTE(review): the positional rename below assumes add_state_region leaves
# exactly two non-State columns (the score, then the region) -- confirm.
Location_Scores = tfd.add_state_region(
    Location_Scores.reset_index(), 'State'
).set_index('State')
Location_Scores.columns = ['score', 'region']
Location_Scores = (
    Location_Scores
    .loc[:, ['score', 'region']]
    .sort_values(by='score', ascending=False)
)

In [73]:
# Same ranking as the previous chart, but bars are coloured by region so
# regional clusters are visible at a glance.
fig = px.bar(
    Location_Scores,
    x=Location_Scores.index,
    y=Location_Scores['score'],
    color='region',
    title='Location Scores',
)
fig.layout.update(height=600)
fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'region=Western<br>State…

In [74]:
# Average location score within each region.
Location_Scores.groupby(by='region').mean()

Unnamed: 0_level_0,score
region,Unnamed: 1_level_1
Midwestern,9.7909
Northeastern,10.118727
Southern,8.040186
Western,13.49519


Regionally, you're most likely to find your happy trifecta in a Western state: the Western states had the highest average score, almost five and a half points above the Southern states.

The top 10 locations (states plus the District of Columbia), regardless of region, are (scores rounded to one decimal place):
* Idaho                   20.5
* Utah                    19.1
* District of Columbia    17.3
* Colorado                16.8
* Nevada                  15.9
* Washington              15.0
* Massachusetts           15.0
* Montana                 13.6
* Minnesota               13.4
* Arizona                 13.4