In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression

In [5]:
# Read the college-majors dataset from a CSV file in the working directory.
MAJORS_CSV = 'college_majors.csv'
majors = pd.read_csv(MAJORS_CSV)

In [6]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV --
# TODO confirm). Reassigning instead of using inplace=True, together with
# errors='ignore', keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
majors = majors.drop(columns=['Unnamed: 0'], errors='ignore')
majors.head()

Unnamed: 0,Major,Major_category,Grad_total,Grad_sample_size,Grad_employed,Grad_full_time_year_round,Grad_unemployed,Grad_unemployment_rate,Grad_median,Grad_P25,...,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,CONSTRUCTION SERVICES,Industrial Arts & Consumer Services,9173,200,7098,6511,681,0.087543,75000.0,53000,...,1751,12313,1042,0.060023,50000,36000,60000,3275,5351,703
1,COMMERCIAL ART AND GRAPHIC DESIGN,Arts,53864,882,40492,29553,2482,0.057756,60000.0,40000,...,24387,52243,8947,0.096798,35000,25000,45000,37389,38119,14839
2,HOSPITALITY MANAGEMENT,Business,24417,437,18368,14784,1465,0.073867,65000.0,45000,...,7494,23106,2393,0.061169,33000,25000,42000,2325,23341,9063
3,COSMETOLOGY SERVICES AND CULINARY ARTS,Industrial Arts & Consumer Services,5411,72,3590,2701,316,0.080901,47000.0,24500,...,2064,5949,510,0.055677,29000,20000,36000,563,7384,3163
4,COMMUNICATION TECHNOLOGIES,Computers & Mathematics,9109,171,7512,5622,466,0.058411,57000.0,40600,...,4690,9085,2006,0.119511,35000,25000,45000,4545,8794,2495


In [7]:
# Print each column's dtype and non-null count.
majors.info()  # 2 non-numeric (object) columns: Major and Major_category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 38 columns):
Major                           172 non-null object
Major_category                  172 non-null object
Grad_total                      172 non-null int64
Grad_sample_size                172 non-null int64
Grad_employed                   172 non-null int64
Grad_full_time_year_round       172 non-null int64
Grad_unemployed                 172 non-null int64
Grad_unemployment_rate          172 non-null float64
Grad_median                     172 non-null float64
Grad_P25                        172 non-null int64
Grad_P75                        172 non-null float64
Nongrad_total                   172 non-null int64
Nongrad_employed                172 non-null int64
Nongrad_full_time_year_round    172 non-null int64
Nongrad_unemployed              172 non-null int64
Nongrad_unemployment_rate       172 non-null float64
Nongrad_median                  172 non-null float64
Nongrad_P25     

In [8]:
# Convert categorical data to numeric.
# Only Major_category is one-hot encoded. Major looks like a per-row label
# (one row per major -- TODO confirm uniqueness); encoding it would add roughly
# one indicator column per row, giving more features than samples and letting a
# linear model memorise the training rows. It is therefore dropped here.
n_majors = pd.get_dummies(
    majors.drop(columns=['Major']),
    columns=['Major_category'],
)
n_majors.head()

Unnamed: 0,Grad_total,Grad_sample_size,Grad_employed,Grad_full_time_year_round,Grad_unemployed,Grad_unemployment_rate,Grad_median,Grad_P25,Grad_P75,Nongrad_total,...,Major_category_Education,Major_category_Engineering,Major_category_Health,Major_category_Humanities & Liberal Arts,Major_category_Industrial Arts & Consumer Services,Major_category_Interdisciplinary,Major_category_Law & Public Policy,Major_category_Physical Sciences,Major_category_Psychology & Social Work,Major_category_Social Science
0,9173,200,7098,6511,681,0.087543,75000.0,53000,110000.0,86062,...,0,0,0,0,1,0,0,0,0,0
1,53864,882,40492,29553,2482,0.057756,60000.0,40000,89000.0,461977,...,0,0,0,0,0,0,0,0,0,0
2,24417,437,18368,14784,1465,0.073867,65000.0,45000,100000.0,179335,...,0,0,0,0,0,0,0,0,0,0
3,5411,72,3590,2701,316,0.080901,47000.0,24500,85000.0,37575,...,0,0,0,0,1,0,0,0,0,0
4,9109,171,7512,5622,466,0.058411,57000.0,40600,83700.0,53819,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Standardize every column of the encoded frame with StandardScaler, then
# wrap the resulting array back into a DataFrame carrying the same column labels.
scaler = StandardScaler()
scaler.fit(n_majors)
scaled_values = scaler.transform(n_majors)
s_majors = pd.DataFrame(data=scaled_values, columns=n_majors.columns)

In [13]:
# Preview the first rows of the standardized frame to sanity-check the scaling.
s_majors.head()

Unnamed: 0,Grad_total,Grad_sample_size,Grad_employed,Grad_full_time_year_round,Grad_unemployed,Grad_unemployment_rate,Grad_median,Grad_P25,Grad_P75,Nongrad_total,...,Major_category_Education,Major_category_Engineering,Major_category_Health,Major_category_Humanities & Liberal Arts,Major_category_Industrial Arts & Consumer Services,Major_category_Interdisciplinary,Major_category_Law & Public Policy,Major_category_Physical Sciences,Major_category_Psychology & Social Work,Major_category_Social Science
0,-0.543151,-0.542327,-0.547771,-0.542257,-0.481568,2.525611,-0.105477,0.035649,-0.069367,-0.324913,...,-0.320256,-0.45033,-0.273861,-0.309098,4.855042,-0.076472,-0.173032,-0.248452,-0.234978,-0.234978
1,-0.339437,-0.36299,-0.338531,-0.355006,-0.176573,0.963592,-0.992621,-1.157555,-0.763212,0.616261,...,-0.320256,-0.45033,-0.273861,-0.309098,-0.205971,-0.076472,-0.173032,-0.248452,-0.234978,-0.234978
2,-0.473665,-0.480006,-0.477155,-0.475026,-0.3488,1.808428,-0.696907,-0.698631,-0.39977,-0.091387,...,-0.320256,-0.45033,-0.273861,-0.309098,-0.205971,-0.076472,-0.173032,-0.248452,-0.234978,-0.234978
3,-0.5603,-0.575986,-0.569751,-0.573219,-0.54338,2.177302,-1.761479,-2.580222,-0.895373,-0.44631,...,-0.320256,-0.45033,-0.273861,-0.309098,4.855042,-0.076472,-0.173032,-0.248452,-0.234978,-0.234978
4,-0.543443,-0.549953,-0.545177,-0.549482,-0.517978,0.997928,-1.17005,-1.102484,-0.938325,-0.40564,...,-0.320256,-0.45033,-0.273861,-0.309098,-0.205971,-0.076472,-0.173032,-0.248452,-0.234978,-0.234978


In [16]:
# List the column names of the loaded (pre-encoding) frame.
majors.columns

Index(['Major', 'Major_category', 'Grad_total', 'Grad_sample_size',
       'Grad_employed', 'Grad_full_time_year_round', 'Grad_unemployed',
       'Grad_unemployment_rate', 'Grad_median', 'Grad_P25', 'Grad_P75',
       'Nongrad_total', 'Nongrad_employed', 'Nongrad_full_time_year_round',
       'Nongrad_unemployed', 'Nongrad_unemployment_rate', 'Nongrad_median',
       'Nongrad_P25', 'Nongrad_P75', 'Grad_share', 'Grad_premium', 'Total',
       'Men', 'Women', 'ShareWomen', 'Sample_size', 'Employed', 'Full_time',
       'Part_time', 'Full_time_year_round', 'Unemployed', 'Unemployment_rate',
       'Median', 'P25th', 'P75th', 'College_jobs', 'Non_college_jobs',
       'Low_wage_jobs'],
      dtype='object')

In [17]:
# Train/test dataset
# Split the *unscaled* encoded frame first and fit the scalers on the training
# rows only. Building X/y from a frame that was standardized on all rows would
# leak test-set means/variances into the training features and target.
X_raw = n_majors.drop(columns=['Grad_median'])
y_raw = n_majors['Grad_median']

# Same test_size and random_state as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.3, random_state=42
)

# Separate scalers for features and target, both fitted on training rows only.
x_scaler = StandardScaler().fit(X_train_raw)
y_scaler = StandardScaler().fit(y_train_raw.to_frame())

# Apply the train-fitted scalers to every row, keeping column labels and the
# original row index so the train/test subsets can be selected by label.
X = pd.DataFrame(
    x_scaler.transform(X_raw), columns=X_raw.columns, index=X_raw.index
)
y = pd.Series(
    y_scaler.transform(y_raw.to_frame()).ravel(),
    index=y_raw.index,
    name=y_raw.name,
)

X_train, X_test = X.loc[X_train_raw.index], X.loc[X_test_raw.index]
y_train, y_test = y.loc[y_train_raw.index], y.loc[y_test_raw.index]