In [1]:
import pandas as pd
import numpy as np

# Render floats with a thousands separator and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Load the salary table from the working directory and preview the first rows.
df = pd.read_csv('salaries_by_college_major.csv')
df.head(5)

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
0,Accounting,46000.0,77100.0,42200.0,152000.0,Business
1,Aerospace Engineering,57700.0,101000.0,64300.0,161000.0,STEM
2,Agriculture,42600.0,71900.0,36300.0,150000.0,Business
3,Anthropology,36800.0,61500.0,33800.0,138000.0,HASS
4,Architecture,41600.0,76800.0,50600.0,136000.0,Business


In [2]:
# Count the missing values in each column of the raw frame.
missing_mask = df.isna()
missing_mask.sum()

Undergraduate Major                  0
Starting Median Salary               1
Mid-Career Median Salary             1
Mid-Career 10th Percentile Salary    1
Mid-Career 90th Percentile Salary    1
Group                                1
dtype: int64

In [3]:
# Build a cleaned copy that discards every row containing at least one missing
# value (the raw `df` is left as-is), then preview its first five rows.
clean_df = df.dropna(axis=0, how='any')
clean_df.head(5)

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
0,Accounting,46000.0,77100.0,42200.0,152000.0,Business
1,Aerospace Engineering,57700.0,101000.0,64300.0,161000.0,STEM
2,Agriculture,42600.0,71900.0,36300.0,150000.0,Business
3,Anthropology,36800.0,61500.0,33800.0,138000.0,HASS
4,Architecture,41600.0,76800.0,50600.0,136000.0,Business


In [4]:
# Which college major has the highest starting salary?
# `idxmax` returns the index label of the largest value. Look that label up in
# `clean_df` (the frame it was computed from) rather than `df`, which a later
# cell extends with a 'Spread' column; that keeps this cell's output the same
# no matter when it is re-run.
loc = clean_df['Starting Median Salary'].idxmax()
clean_df.loc[loc]

Undergraduate Major                  Physician Assistant
Starting Median Salary                         74,300.00
Mid-Career Median Salary                       91,700.00
Mid-Career 10th Percentile Salary              66,400.00
Mid-Career 90th Percentile Salary             124,000.00
Group                                               STEM
Name: 43, dtype: object

In [5]:
# What college major has the highest mid-career salary? How much do graduates
# with this major earn? (Mid-career is defined as having 10+ years of experience.)

# Take the index label of the largest mid-career median and display that row.
# The lookup uses `clean_df`, the same frame the label was computed on, rather
# than `df`, which a later cell extends with a 'Spread' column.
loc_mid_high = clean_df['Mid-Career Median Salary'].idxmax()
clean_df.loc[loc_mid_high]

Undergraduate Major                  Chemical Engineering
Starting Median Salary                          63,200.00
Mid-Career Median Salary                       107,000.00
Mid-Career 10th Percentile Salary               71,900.00
Mid-Career 90th Percentile Salary              194,000.00
Group                                                STEM
Name: 8, dtype: object

In [6]:
# Which college major has the lowest starting salary, and how much do its
# graduates earn after university?

# Work on `clean_df` (rows with missing values removed), consistent with the
# highest-salary cells, instead of the raw `df`, which still contains a row of
# missing salaries and is extended with a 'Spread' column by a later cell.
loc_starting_low = clean_df['Starting Median Salary'].idxmin()
clean_df.loc[loc_starting_low]


Undergraduate Major                   Spanish
Starting Median Salary              34,000.00
Mid-Career Median Salary            53,100.00
Mid-Career 10th Percentile Salary   31,000.00
Mid-Career 90th Percentile Salary   96,400.00
Group                                    HASS
Name: 49, dtype: object

In [7]:
# Which college major has the lowest mid-career salary, and how much can people
# expect to earn with this degree?

# Work on `clean_df` (rows with missing values removed), consistent with the
# highest-salary cells, instead of the raw `df`, which still contains a row of
# missing salaries and is extended with a 'Spread' column by a later cell.
loc_mid_low = clean_df['Mid-Career Median Salary'].idxmin()
clean_df.loc[loc_mid_low]

Undergraduate Major                  Education
Starting Median Salary               34,900.00
Mid-Career Median Salary             52,000.00
Mid-Career 10th Percentile Salary    29,300.00
Mid-Career 90th Percentile Salary   102,000.00
Group                                     HASS
Name: 18, dtype: object

In [8]:
# Sorting Values & Adding Columns: Majors with the Most Potential vs Lowest Risk

# Spread = 90th-percentile minus 10th-percentile mid-career salary for each
# major. The subtraction is element-wise, so `spread` is a Series sharing
# `df`'s index; rows with missing salaries produce NaN.
spread = df['Mid-Career 90th Percentile Salary'] - df['Mid-Career 10th Percentile Salary']

In [9]:
# Add the spread to `df` at column position 1 (directly after the first column).
# `DataFrame.insert` raises ValueError when the column already exists, so drop
# any 'Spread' column left over from an earlier run first; this makes the cell
# safe to execute more than once.
df = df.drop(columns='Spread', errors='ignore')
df.insert(1, 'Spread', spread)

In [10]:
# Five majors with the smallest spread (ascending order is the default).
# In the output below Nursing has the narrowest gap between the 10th and 90th
# percentile salaries, which makes it the lowest-risk major by this measure.
lowest_spread = df.sort_values(by='Spread')
lowest_spread.head(5)


Unnamed: 0,Undergraduate Major,Spread,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
40,Nursing,50700.0,54200.0,67000.0,47600.0,98300.0,Business
43,Physician Assistant,57600.0,74300.0,91700.0,66400.0,124000.0,STEM
41,Nutrition,65300.0,39900.0,55300.0,33900.0,99200.0,HASS
49,Spanish,65400.0,34000.0,53100.0,31000.0,96400.0,HASS
27,Health Care Administration,66400.0,38800.0,60600.0,34600.0,101000.0,Business


In [11]:
# Five majors with the largest spread.
# In the output below Economics has the widest gap between the 10th and 90th
# percentile salaries, which makes it the highest-risk major by this measure.
highest_spread = df.sort_values(by='Spread', ascending=False)
highest_spread.head(5)



Unnamed: 0,Undergraduate Major,Spread,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
17,Economics,159400.0,50100.0,98600.0,50600.0,210000.0,Business
22,Finance,147800.0,47900.0,88300.0,47200.0,195000.0,Business
37,Math,137800.0,45400.0,92400.0,45200.0,183000.0,STEM
36,Marketing,132900.0,40800.0,79600.0,42100.0,175000.0,Business
42,Philosophy,132500.0,39900.0,81200.0,35500.0,168000.0,HASS


In [12]:
# Pivoting data

# Average every salary column within each major group.
# numeric_only=True restricts the mean to the numeric columns. Without it,
# pandas >= 2.0 raises TypeError on the text column 'Undergraduate Major'
# instead of silently dropping it as older versions did.
clean_df.groupby('Group').mean(numeric_only=True)

Unnamed: 0_level_0,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business,44633.33,75083.33,43566.67,147525.0
HASS,37186.36,62968.18,34145.45,129363.64
STEM,53862.5,90812.5,56025.0,157625.0
