In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import Clean_data as cld
import Transform_Data as tfd
import Visualize_Data as vld


# Exploring an Ideal Job, based on Location

## The Story So far
I have gone through my datasets, starting with 30 sets, which resulted in around 40 extrapolated discoveries.

From those 40 sets I have pruned back to less than 10 sets with a much narrower focus on:

Region: How enticing a particular area is for a prospective worker
this is based on:
* Affordability
* Household Income
* Opportunities
* Average Debt
* Rent/Mortgage to Income Ratio

Field: A combined look at degrees and industries that are likely to help secure good financial standing
This is currently based on the demand for the field as well as the salary.


## Exploration of datasets

In [2]:
field_starting_salaries = pd.read_csv('Final_Data/field_starting_salaries.csv')
region_starting_salaries = pd.read_csv('Final_Data/Regional_starting_salaries.csv')
state_field_scores = pd.read_csv('Final_Data/state_field_scores.csv')
state_rank_grp= pd.read_csv('Final_Data/Grouped_State_Ranks.csv')
df =  pd.read_csv('Final_Data/Full_Weighted_States.csv')

In [3]:
def run_zscore(df, cols):
    print(df.columns)
    df = df[cols]
    length = len(df)
    samp = int(length/15)
    print('full length ',length)
    print('sample size ',samp)
    print('zscore ',stats.zscore(df).sample(samp).mean())

In [4]:
field_starting_salaries.head(3)
field_starting_salaries.columns = ['field', 'field_Starting Median Salary', 'field_Mid-Career Median Salary',
                                   'Percent change from Starting to Mid-Career Salary','field_start_over_min_w', 'field_mid_over_min_w']
field_starting_salaries = field_starting_salaries[['field', 'field_Starting Median Salary', 'field_Mid-Career Median Salary','field_start_over_min_w', 'field_mid_over_min_w']]

In [5]:
region_starting_salaries.columns = ['Region', 'region_Starting Median Salary', 'region_Mid-Career Median Salary','region_start_over_min_w', 'region_mid_over_min_w']

In [6]:
state_field_scores.head(3)

Unnamed: 0,state,field,ratio,score
0,District of Columbia,"Science, Technology, Engineering",0.485053,24.075448
1,Washington,"Science, Technology, Engineering",0.421292,23.934516
2,California,"Science, Technology, Engineering",0.416775,23.923738


In [7]:
run_zscore(state_field_scores, ['ratio', 'score'])

Index(['state', 'field', 'ratio', 'score'], dtype='object')


KeyError: "['scores'] not in index"

In [None]:
state_rank_grp.head(3)

In [None]:
df = df[['StateAbbreviation', 'State']]

In [None]:
df.head(3)

With my top-level dataframe in hand, it's time to pair some data

In [None]:
df = df.merge(state_field_scores, left_on='State', right_on='state')
df = df.merge(field_starting_salaries, left_on='field', right_on='field')
df = df.merge(state_rank_grp, left_on='State', right_on='State')
df = df.merge(region_starting_salaries, left_on='Region', right_on='Region')

In [None]:

df = df.dropna()

In [None]:
df['score'] =(df['score'].rank(method='dense')) ** 1/3
df['field_start_over_min_w'] = df['field_start_over_min_w'].rank(method='dense')
df['field_mid_over_min_w'] = df['field_start_over_min_w'].rank(method='dense')
df['summed'] = df['summed'].rank(method='dense')
df['region_start_over_min_w'] = df['region_start_over_min_w'].rank(method='dense')
df['region_mid_over_min_w'] = df['region_mid_over_min_w'].rank(method='dense')

df = df[['State', 'field', 'Region','field_start_over_min_w','field_mid_over_min_w', 'summed', 'region_start_over_min_w','region_mid_over_min_w', 'score']]
df['final_score'] = df['field_start_over_min_w']*df['field_mid_over_min_w'] *df['summed'] * df['region_start_over_min_w'] * df['region_mid_over_min_w']*df['score']
df['final_score']  = np.log(df['final_score'])

In [None]:
df.head(3)

In [None]:
cols = ['score', 'field_start_over_w', 'field_mid_over_min_w','region_start_over_w', 'region_mid_over_min_w','final_score']
run_zscore(df, cols)

In [None]:
df_scored =df[['State', 'field', 'Region','final_score']]

In [None]:
df_scored.sort_values(by='final_score', ascending= False).head(20)

After totaling, combining and scoring, it still appears that pennsylvania takes the lead

In [None]:
#Create
fig, ax = plt.subplots()
#Data
df_h = df_scored[df_scored['final_score'] > 12]
x1 = df_h['State']
y1 =df_h['final_score'] 
#Appearance

ax.bar(x1, y1,label='Regions')
ax.set_title("State Score")  # Add a title to the axes.
fig.set_size_inches(18.5, 10.5)
plt.xticks(rotation=45)
plt.show()

In [None]:
df_scored.groupby('Region').mean()