In [1]:
import pandas as pd

import Clean_data as cld
import Transform_Data as tfd
import Visualize_Data as vld
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go


### Datasets

# Job Titles By Major

In [2]:
title_maj = pd.read_csv('Datasets/PeopleDataLabs/pdl_job_titles_by_major.csv')

In [3]:
title_maj.head(1)

Unnamed: 0,major,category,count,job_title_1,job_title_1_count,job_title_2,job_title_2_count,job_title_3,job_title_3_count,job_title_4,...,job_title_6,job_title_6_count,job_title_7,job_title_7_count,job_title_8,job_title_8_count,job_title_9,job_title_9_count,job_title_10,job_title_10_count
0,3d modelling,Sci_Eng_Related,299,3d artist,18.0,graphic designer,18.0,sales associate,12.0,creative director,...,intern,7.0,owner,7.0,designer,5.0,project engineer,5.0,3d modeler,4.0


very oddly formatted, with ranked degree into job counts

first fully manual step I'm needing to take is assigning higher-level buckets to these degrees

#### Cleaned

In [4]:
title_maj =  cld.get_degree_counts()
title_maj.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totals['title'] = 'total'


Unnamed: 0,major,category,title,count
0,3d modelling,Sci_Eng_Related,3d artist,18.0
1,3d modelling,Sci_Eng_Related,3d modeler,4.0
2,3d modelling,Sci_Eng_Related,architect,9.0
3,3d modelling,Sci_Eng_Related,creative director,10.0
4,3d modelling,Sci_Eng_Related,designer,5.0


#### Zscores

The lower the z-scores, the less variation from the mean my sample (created in vld.run_zscore) has.

Anything under .1 has low statistical variability/uncertainty, and therefore high confidence.

In [5]:
vld.run_zscore(title_maj, ['count'])

full length  11496
sample size  766
df Z-Scores
count   -0.033913
dtype: float64


#### Graphs

In [6]:
title_maj_grp = title_maj.groupby('category').sum()

fig = px.bar(title_maj_grp, x = title_maj_grp.index, y='count',
            title = 'Count(Millions) of Jobs obtained by degree Category'
            )

fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'alignmentgroup': 'True',
              'hovertemplate': 'category=%{x}<br>count=…

# Degrees that Pay Back

In [7]:

payback_df = pd.read_csv('Datasets/Education Salaries/degrees-that-pay-back_edited.csv' )
payback_df.head(1)

Unnamed: 0,Undergraduate Major,Category,Starting Median Salary,Mid-Career Median Salary,Percent change from Starting to Mid-Career Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,Accounting,Business,46000,77100,0.676,42200,56100,108000,152000


#### Cleaned

In [8]:
payback_df = cld.get_degrees_pay_back()
payback_df.head(5)

Unnamed: 0,Undergraduate Major,Category,Starting Median Salary,Mid-Career Median Salary
0,Accounting,Business,46000,77100
1,Aerospace Engineering,Science and Engineering,57700,101000
2,Agriculture,Sci_Eng_Related,42600,71900
3,Anthropology,Sci_Eng_Related,36800,61500
4,Architecture,Sci_Eng_Related,41600,76800


#### Zscores

In [9]:
vld.run_zscore(payback_df, ['Starting Median Salary', 'Mid-Career Median Salary'])

full length  50
sample size  3
df Z-Scores
Starting Median Salary      0.718336
Mid-Career Median Salary    0.572246
dtype: float64


the payback dataset has much higher variability, which makes absolute sense since on top of higher min and max values, it's coming from a dataset of 50 rows.

#### Graphs

In [10]:

fig = px.scatter(payback_df, x = 'Starting Median Salary', y='Mid-Career Median Salary',
            title = 'Starting Wage (x) vs Mid-Career Wage (y)',
                 color = payback_df['Category'],
                 trendline = 'ols'
            )

fig_widget = go.FigureWidget(fig)
fig_widget

FigureWidget({
    'data': [{'hovertemplate': ('Category=Business<br>Starting ' ... 'ian Salary=%{y}<extra></e…

Education and The Arts/Humanities etc have the lowest room for growth  and low mid-career caps

The other three though all have decently high room for growth. The zscores I calculated earlier do make sense, given some of the scattershot I'm seeing in this graph.

# Salaries by Region

In [11]:

reg_salaries = pd.read_csv('Datasets/Education Salaries/salaries-by-region.csv')
reg_salaries.head(3)

Unnamed: 0,School Name,Region,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,Stanford University,California,70400.0,129000.0,68400.0,93100.0,184000.0,257000.0
1,California Institute of Technology (CIT),California,75500.0,123000.0,,104000.0,161000.0,
2,Harvey Mudd College,California,71800.0,122000.0,,96000.0,180000.0,


The Regions, while useful, need a little cleaning.

#### Cleaned

In [12]:
u_reg_salaries = cld.get_ungrouped_regional_salaries()
u_reg_salaries.head(5)

Unnamed: 0,School Name,Region,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,Stanford University,Western,70400.0,129000.0,68400.0,93100.0,184000.0,257000.0
1,California Institute of Technology (CIT),Western,75500.0,123000.0,,104000.0,161000.0,
2,Harvey Mudd College,Western,71800.0,122000.0,,96000.0,180000.0,
3,"University of California, Berkeley",Western,59900.0,112000.0,59500.0,81000.0,149000.0,201000.0
4,Occidental College,Western,51900.0,105000.0,,54800.0,157000.0,


In [13]:
reg_salaries = tfd.get_regional_salaries(u_reg_salaries)
reg_salaries.head(5)

Unnamed: 0_level_0,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Midwestern,44225.352113,78180.28169,43076.5625,57026.760563,107594.366197,147689.0625
Northeastern,48496.0,91352.0,49101.219512,65479.0,129576.0,181926.829268
Southern,44521.518987,79505.063291,43074.647887,57506.329114,109662.025316,152769.014085
Western,47061.428571,84172.857143,44867.857143,60810.0,114555.714286,153285.714286


#### Zscores

In [14]:
u_reg_salaries = u_reg_salaries.dropna(axis = 0)
vld.run_zscore(u_reg_salaries, ['Starting Median Salary',
       'Mid-Career Median Salary', 'Mid-Career 10th Percentile Salary',
       'Mid-Career 25th Percentile Salary',
       'Mid-Career 75th Percentile Salary',
       'Mid-Career 90th Percentile Salary'])

full length  273
sample size  18
df Z-Scores
Starting Median Salary              -0.090604
Mid-Career Median Salary            -0.124353
Mid-Career 10th Percentile Salary   -0.053663
Mid-Career 25th Percentile Salary    0.007419
Mid-Career 75th Percentile Salary   -0.267477
Mid-Career 90th Percentile Salary   -0.277273
dtype: float64


there does seem to be a good amount of variation in this dataset, though the size of the set does lend itself to a good amount of variability.

#### Graphs

In [15]:

fig = px.bar(reg_salaries,x = reg_salaries.columns,x = reg_salaries.index,
            title = 'State Starting Salaries', 
             barmode = 'group'
            )

fig_widget = go.FigureWidget(fig)
fig_widget

SyntaxError: keyword argument repeated: x (Temp/ipykernel_5560/3061731404.py, line 1)

Southern and Midwestern Regions paid the lowest, up until the 90th percentiles, where Southern regions paid  an average of 5,000 dollars more annually.

To capture the truest reflection of a typical worker looking for a job, I will not be using the last 4 percentile-based statistics.

Also, based on the way this is now grouped, it's a better fit for salary going forward

#### New Data

In [None]:
reg_salaries.to_csv('Final_Data/Further_Datasets/Regional_Salaries.csv')

# Salaries by College Type

In [None]:

col_type_salaries = pd.read_csv('Datasets/Education Salaries/salaries-by-college-type.csv')
col_type_salaries.head(3)

In [None]:
col_type_salaries['School Type'].value_counts()

This unfortunately isn't the *type* of college I was anticipating. I would cherry pick liberal arts and engineering but... that *would* be cherry picking

#### Cleaned

Can't, Won't

#### Zscores

#### Graphs