In [177]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [178]:
# Load the college-majors table from a CSV located relative to the working directory
majors = pd.read_csv(filepath_or_buffer='college_majors1.csv')

In [179]:
# Remove the unnamed first CSV column, which pandas labels 'Unnamed: 0'.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# idempotent: re-running it after the column is already gone no longer raises KeyError.
majors = majors.drop(columns=['Unnamed: 0'], errors='ignore')
majors.head()

Unnamed: 0,Major,Major_category,Grad_total,Grad_unemployment_rate,Grad_median,Nongrad_unemployment_rate,Grad_share,Grad_premium,ShareWomen,Unemployment_rate,Median,College_jobs
0,CONSTRUCTION SERVICES,Industrial Arts & Consumer Services,9173,0.087543,75000.0,0.050661,0.09632,0.153846,0.090713,0.060023,50000,3275
1,COMMERCIAL ART AND GRAPHIC DESIGN,Arts,53864,0.057756,60000.0,0.068386,0.10442,0.25,0.690365,0.096798,35000,37389
2,HOSPITALITY MANAGEMENT,Business,24417,0.073867,65000.0,0.048423,0.119837,0.3,0.65166,0.061169,33000,2325
3,COSMETOLOGY SERVICES AND CULINARY ARTS,Industrial Arts & Consumer Services,5411,0.080901,47000.0,0.0529,0.125878,0.129808,0.584776,0.055677,29000,563
4,COMMUNICATION TECHNOLOGIES,Computers & Mathematics,9109,0.058411,57000.0,0.0728,0.144753,0.096154,0.366177,0.119511,35000,4545


In [180]:
# Summarize dtype and non-null count for every column.
majors.info() # 2 non-numeric (object) columns: Major and Major_category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Major                      172 non-null    object 
 1   Major_category             172 non-null    object 
 2   Grad_total                 172 non-null    int64  
 3   Grad_unemployment_rate     172 non-null    float64
 4   Grad_median                172 non-null    float64
 5   Nongrad_unemployment_rate  172 non-null    float64
 6   Grad_share                 172 non-null    float64
 7   Grad_premium               172 non-null    float64
 8   ShareWomen                 172 non-null    float64
 9   Unemployment_rate          172 non-null    float64
 10  Median                     172 non-null    int64  
 11  College_jobs               172 non-null    int64  
dtypes: float64(7), int64(3), object(2)
memory usage: 16.2+ KB


In [181]:
# Drop the free-text 'Major' label so it is not one-hot encoded downstream.
# Rebinding with errors='ignore' keeps the cell safe to re-run: a second
# execution finds the column already gone instead of raising KeyError.
majors = majors.drop(columns=['Major'], errors='ignore')

In [182]:
# Convert the remaining categorical column (Major_category) to numeric
# indicator columns; numeric columns pass through unchanged.
# dtype is pinned so the indicators are 0/1 integers regardless of pandas
# version (pandas >= 2.0 would otherwise emit booleans).
n_majors = pd.get_dummies(majors, dtype=np.uint8)
n_majors.head()

Unnamed: 0,Grad_total,Grad_unemployment_rate,Grad_median,Nongrad_unemployment_rate,Grad_share,Grad_premium,ShareWomen,Unemployment_rate,Median,College_jobs,...,Major_category_Education,Major_category_Engineering,Major_category_Health,Major_category_Humanities & Liberal Arts,Major_category_Industrial Arts & Consumer Services,Major_category_Interdisciplinary,Major_category_Law & Public Policy,Major_category_Physical Sciences,Major_category_Psychology & Social Work,Major_category_Social Science
0,9173,0.087543,75000.0,0.050661,0.09632,0.153846,0.090713,0.060023,50000,3275,...,0,0,0,0,1,0,0,0,0,0
1,53864,0.057756,60000.0,0.068386,0.10442,0.25,0.690365,0.096798,35000,37389,...,0,0,0,0,0,0,0,0,0,0
2,24417,0.073867,65000.0,0.048423,0.119837,0.3,0.65166,0.061169,33000,2325,...,0,0,0,0,0,0,0,0,0,0
3,5411,0.080901,47000.0,0.0529,0.125878,0.129808,0.584776,0.055677,29000,563,...,0,0,0,0,1,0,0,0,0,0
4,9109,0.058411,57000.0,0.0728,0.144753,0.096154,0.366177,0.119511,35000,4545,...,0,0,0,0,0,0,0,0,0,0


In [183]:
# Scaling
# NOTE(review): the scaler is fit on all rows and all columns of n_majors,
# which includes 'Grad_median' (used as the target further down) and happens
# before any train/test split. Revisit if a held-out split is introduced.
scaler = MinMaxScaler() # [0,1] range
# Build the scaled DataFrame in one step so `s_majors` is never bound to a bare
# ndarray, and carry over n_majors' column labels and row index.
s_majors = pd.DataFrame(
    scaler.fit_transform(n_majors),
    columns=n_majors.columns,
    index=n_majors.index,
)

In [184]:
# Preview the first rows of the min-max scaled frame.
s_majors.head()

Unnamed: 0,Grad_total,Grad_unemployment_rate,Grad_median,Nongrad_unemployment_rate,Grad_share,Grad_premium,ShareWomen,Unemployment_rate,Median,College_jobs,...,Major_category_Education,Major_category_Engineering,Major_category_Health,Major_category_Humanities & Liberal Arts,Major_category_Industrial Arts & Consumer Services,Major_category_Interdisciplinary,Major_category_Law & Public Policy,Major_category_Physical Sciences,Major_category_Psychology & Social Work,Major_category_Social Science
0,0.006453,0.632014,0.318182,0.314847,0.0,0.106962,0.093619,0.33868,0.318182,0.021597,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.044243,0.416965,0.147727,0.425004,0.009702,0.164468,0.712485,0.54618,0.147727,0.246559,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.019343,0.533277,0.204545,0.300938,0.028169,0.194371,0.67254,0.345147,0.125,0.015332,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.003272,0.584061,0.0,0.328761,0.035406,0.092585,0.603513,0.314157,0.079545,0.003713,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.006399,0.421692,0.113636,0.452438,0.058014,0.072458,0.37791,0.674343,0.147727,0.029972,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [185]:
# Show the column labels currently present in `majors` (the unscaled frame).
majors.columns

Index(['Major_category', 'Grad_total', 'Grad_unemployment_rate', 'Grad_median',
       'Nongrad_unemployment_rate', 'Grad_share', 'Grad_premium', 'ShareWomen',
       'Unemployment_rate', 'Median', 'College_jobs'],
      dtype='object')

In [186]:
# Separate the target column from the predictor columns
y = s_majors['Grad_median']
X = s_majors.drop('Grad_median', axis=1)

# The train/test split is currently disabled in this cell:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [187]:
# Persist predictors and target to CSV. index=True is the default and is spelled
# out here because it writes the row index as the first column of each file.
X.to_csv('X_majors.csv', index=True)
y.to_csv('y_majors.csv', index=True)