In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [13]:
# Load the college-majors table from the CSV sitting next to the notebook.
majors = pd.read_csv(filepath_or_buffer='college_majors1.csv')

In [14]:
# Discard the stray 'Unnamed: 0' column that came in with the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError once the
# column has already been removed.
majors = majors.drop(columns=['Unnamed: 0'], errors='ignore')
majors.head()

Unnamed: 0,Major,Major_category,Grad_total,Grad_unemployment_rate,Grad_median,Nongrad_unemployment_rate,Grad_share,Grad_premium,ShareWomen,Unemployment_rate
0,CONSTRUCTION SERVICES,Industrial Arts & Consumer Services,9173,0.087543,75000.0,0.050661,0.09632,0.153846,0.090713,0.060023
1,COMMERCIAL ART AND GRAPHIC DESIGN,Arts,53864,0.057756,60000.0,0.068386,0.10442,0.25,0.690365,0.096798
2,HOSPITALITY MANAGEMENT,Business,24417,0.073867,65000.0,0.048423,0.119837,0.3,0.65166,0.061169
3,COSMETOLOGY SERVICES AND CULINARY ARTS,Industrial Arts & Consumer Services,5411,0.080901,47000.0,0.0529,0.125878,0.129808,0.584776,0.055677
4,COMMUNICATION TECHNOLOGIES,Computers & Mathematics,9109,0.058411,57000.0,0.0728,0.144753,0.096154,0.366177,0.119511


In [15]:
# Summarize column dtypes and non-null counts to see which columns need encoding.
majors.info() # 2 non-numeric (object) columns: Major and Major_category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 10 columns):
Major                        172 non-null object
Major_category               172 non-null object
Grad_total                   172 non-null int64
Grad_unemployment_rate       172 non-null float64
Grad_median                  172 non-null float64
Nongrad_unemployment_rate    172 non-null float64
Grad_share                   172 non-null float64
Grad_premium                 172 non-null float64
ShareWomen                   172 non-null float64
Unemployment_rate            172 non-null float64
dtypes: float64(7), int64(1), object(2)
memory usage: 13.6+ KB


In [16]:
# One-hot encode the two text columns so that every feature is numeric.
# The columns are listed explicitly and the dtype is pinned: get_dummies emits
# uint8 on pandas < 2.0 but bool on pandas >= 2.0, and the 0/1 integers are
# what the rest of this notebook displays.
# NOTE(review): 'Major' looks like a per-row label, so encoding it may add
# roughly one indicator column per row -- confirm it is meant to be a feature.
n_majors = pd.get_dummies(
    majors,
    columns=['Major', 'Major_category'],
    dtype=np.uint8,
)
n_majors.head()

Unnamed: 0,Grad_total,Grad_unemployment_rate,Grad_median,Nongrad_unemployment_rate,Grad_share,Grad_premium,ShareWomen,Unemployment_rate,Major_ACCOUNTING,Major_ACTUARIAL SCIENCE,...,Major_category_Education,Major_category_Engineering,Major_category_Health,Major_category_Humanities & Liberal Arts,Major_category_Industrial Arts & Consumer Services,Major_category_Interdisciplinary,Major_category_Law & Public Policy,Major_category_Physical Sciences,Major_category_Psychology & Social Work,Major_category_Social Science
0,9173,0.087543,75000.0,0.050661,0.09632,0.153846,0.090713,0.060023,0,0,...,0,0,0,0,1,0,0,0,0,0
1,53864,0.057756,60000.0,0.068386,0.10442,0.25,0.690365,0.096798,0,0,...,0,0,0,0,0,0,0,0,0,0
2,24417,0.073867,65000.0,0.048423,0.119837,0.3,0.65166,0.061169,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5411,0.080901,47000.0,0.0529,0.125878,0.129808,0.584776,0.055677,0,0,...,0,0,0,0,1,0,0,0,0,0
4,9109,0.058411,57000.0,0.0728,0.144753,0.096154,0.366177,0.119511,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Min-max scale every column of n_majors into the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows and on every column, including
# 'Grad_median', which is later used as the target. If a train/test split is
# introduced, fit on the training rows only to avoid leaking test statistics.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(n_majors)  # ndarray: row/column labels are lost here

# Re-attach the column names and the original row index so the scaled rows stay
# aligned with `majors` / `n_majors`. A separate name is used for the raw array
# so `s_majors` is only ever a DataFrame.
s_majors = pd.DataFrame(
    scaled_values,
    columns=n_majors.columns,
    index=n_majors.index,
)

In [18]:
# Preview the first rows of the scaled frame.
s_majors.head()

Unnamed: 0,Grad_total,Grad_unemployment_rate,Grad_median,Nongrad_unemployment_rate,Grad_share,Grad_premium,ShareWomen,Unemployment_rate,Major_ACCOUNTING,Major_ACTUARIAL SCIENCE,...,Major_category_Education,Major_category_Engineering,Major_category_Health,Major_category_Humanities & Liberal Arts,Major_category_Industrial Arts & Consumer Services,Major_category_Interdisciplinary,Major_category_Law & Public Policy,Major_category_Physical Sciences,Major_category_Psychology & Social Work,Major_category_Social Science
0,0.006453,0.632014,0.318182,0.314847,0.0,0.106962,0.093619,0.33868,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.044243,0.416965,0.147727,0.425004,0.009702,0.164468,0.712485,0.54618,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.019343,0.533277,0.204545,0.300938,0.028169,0.194371,0.67254,0.345147,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.003272,0.584061,0.0,0.328761,0.035406,0.092585,0.603513,0.314157,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.006399,0.421692,0.113636,0.452438,0.058014,0.072458,0.37791,0.674343,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# List the column names of the original (un-encoded, unscaled) frame.
majors.columns

Index(['Major', 'Major_category', 'Grad_total', 'Grad_unemployment_rate',
       'Grad_median', 'Nongrad_unemployment_rate', 'Grad_share',
       'Grad_premium', 'ShareWomen', 'Unemployment_rate'],
      dtype='object')

In [20]:
# Separate the target column ('Grad_median') from the remaining scaled features.
y = s_majors.loc[:, 'Grad_median']
X = s_majors.drop('Grad_median', axis=1)

# No train/test split is performed in this cell; the call below is left disabled.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
# Persist the feature matrix and the target for later use.
# header=True is spelled out because older pandas releases wrote a Series
# without a header row, so a plain read_csv would consume the first data row
# as column names.
# The row index is still written (the default); a reader gets it back as an
# 'Unnamed: 0' column unless it passes index_col=0.
X.to_csv('X_majors.csv', header=True)
y.to_csv('y_majors.csv', header=True)

  
