In [1]:
import pandas as pd
import numpy as np

In [2]:
# this data is from last year's DS 4400 class
gtky = pd.read_csv('ds4400_clean_gtky.csv')
gtky.head()

Unnamed: 0,class,cs_ds,co_op,credit_hours,work_hours
0,Senior,DS,Yes,17,12
1,Sophomore,CS,No,17,0
2,Senior,DS,No,17,15
3,Senior,CS,Yes,16,3
4,Junior,CS,No,16,0


In [3]:
# One-hot encoding the categorical features
# note that it will make the "baseline" category the first alphabetical value
onehot_class = pd.get_dummies(gtky['class'], drop_first=True, dtype = int)
onehot_class.head()

Unnamed: 0,Senior,Sophomore
0,1,0
1,0,1
2,1,0
3,1,0
4,0,0


In [4]:
# do it for the full data set
onehot_gtky = pd.get_dummies(gtky, drop_first=True, dtype = int)
onehot_gtky.head()

Unnamed: 0,credit_hours,work_hours,class_Senior,class_Sophomore,cs_ds_CY,cs_ds_DS,co_op_Yes
0,17,12,1,0,0,1,1
1,17,0,0,1,0,0,0
2,17,15,1,0,0,1,0
3,16,3,1,0,0,0,1
4,16,0,0,0,0,0,0


In [5]:
# turn it into a numpy array for ML purposes
onehotnp_gtky = onehot_gtky.to_numpy()
onehotnp_gtky

array([[17, 12,  1,  0,  0,  1,  1],
       [17,  0,  0,  1,  0,  0,  0],
       [17, 15,  1,  0,  0,  1,  0],
       [16,  3,  1,  0,  0,  0,  1],
       [16,  0,  0,  0,  0,  0,  0],
       [17,  0,  0,  0,  1,  0,  0],
       [12,  5,  0,  0,  0,  0,  1],
       [16, 25,  1,  0,  0,  1,  1],
       [12,  8,  1,  0,  0,  0,  1],
       [17,  0,  0,  0,  0,  0,  1],
       [18,  0,  0,  0,  0,  1,  0],
       [17,  0,  1,  0,  0,  1,  0],
       [16,  5,  0,  0,  0,  0,  1],
       [18,  0,  1,  0,  0,  0,  1],
       [16, 12,  1,  0,  0,  0,  1],
       [17, 15,  0,  0,  0,  0,  1],
       [16,  0,  1,  0,  0,  0,  0],
       [16,  0,  0,  0,  0,  1,  0],
       [18, 10,  1,  0,  0,  0,  1],
       [16, 10,  1,  0,  0,  0,  1],
       [17, 10,  0,  0,  0,  0,  0],
       [16,  0,  1,  0,  0,  0,  1],
       [16, 20,  0,  0,  0,  1,  0],
       [17,  0,  0,  0,  0,  0,  0],
       [17, 20,  0,  0,  0,  1,  1],
       [16,  0,  1,  0,  0,  1,  1],
       [18,  0,  0,  1,  0,  0,  0]], 

In [6]:
# Standardization (data centering and scaling)
from sklearn import preprocessing

X = np.array([[1, 1, 1000],
              [2, 2, 850],
              [2, 3, 1400],
              [1, 1, 800],
              [4, 2, 1050]])
X

array([[   1,    1, 1000],
       [   2,    2,  850],
       [   2,    3, 1400],
       [   1,    1,  800],
       [   4,    2, 1050]])

In [7]:
X = preprocessing.scale(X)
X

array([[-0.91287093, -1.06904497, -0.09470274],
       [ 0.        ,  0.26726124, -0.80497333],
       [ 0.        ,  1.60356745,  1.79935215],
       [-0.91287093, -1.06904497, -1.04173019],
       [ 1.82574186,  0.26726124,  0.14205412]])

In [8]:
# check that our data now have mean 0 and sd 1 for all columns
np.mean(X, axis=0)

array([ 0.00000000e+00, -8.88178420e-17,  3.33066907e-17])

In [9]:
np.std(X, axis=0)

array([1., 1., 1.])

In [10]:
# min-max scaling
from sklearn.preprocessing import MinMaxScaler

X = onehotnp_gtky
scaler = MinMaxScaler()
X_scale = scaler.fit_transform(X).round(2)
X_scale

array([[0.83, 0.48, 1.  , 0.  , 0.  , 1.  , 1.  ],
       [0.83, 0.  , 0.  , 1.  , 0.  , 0.  , 0.  ],
       [0.83, 0.6 , 1.  , 0.  , 0.  , 1.  , 0.  ],
       [0.67, 0.12, 1.  , 0.  , 0.  , 0.  , 1.  ],
       [0.67, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.83, 0.  , 0.  , 0.  , 1.  , 0.  , 0.  ],
       [0.  , 0.2 , 0.  , 0.  , 0.  , 0.  , 1.  ],
       [0.67, 1.  , 1.  , 0.  , 0.  , 1.  , 1.  ],
       [0.  , 0.32, 1.  , 0.  , 0.  , 0.  , 1.  ],
       [0.83, 0.  , 0.  , 0.  , 0.  , 0.  , 1.  ],
       [1.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  ],
       [0.83, 0.  , 1.  , 0.  , 0.  , 1.  , 0.  ],
       [0.67, 0.2 , 0.  , 0.  , 0.  , 0.  , 1.  ],
       [1.  , 0.  , 1.  , 0.  , 0.  , 0.  , 1.  ],
       [0.67, 0.48, 1.  , 0.  , 0.  , 0.  , 1.  ],
       [0.83, 0.6 , 0.  , 0.  , 0.  , 0.  , 1.  ],
       [0.67, 0.  , 1.  , 0.  , 0.  , 0.  , 0.  ],
       [0.67, 0.  , 0.  , 0.  , 0.  , 1.  , 0.  ],
       [1.  , 0.4 , 1.  , 0.  , 0.  , 0.  , 1.  ],
       [0.67, 0.4 , 1.  , 0.  ,

In [11]:
# Separating data into training and test sets for cross validation
# Imagine we want to predict if a student has done their co-op with all the other features
from sklearn.model_selection import train_test_split

Phi = X_scale[:,:-1]
y = X_scale[:,-1]
[Phi_train, Phi_test, y_train, y_test] = train_test_split(Phi, y, test_size = .3)

# we would then train the model on the training set
Phi_train, y_train

(array([[1.  , 0.  , 0.  , 1.  , 0.  , 0.  ],
        [1.  , 0.  , 0.  , 0.  , 0.  , 1.  ],
        [1.  , 0.4 , 1.  , 0.  , 0.  , 0.  ],
        [0.  , 0.2 , 0.  , 0.  , 0.  , 0.  ],
        [0.83, 0.4 , 0.  , 0.  , 0.  , 0.  ],
        [0.67, 0.8 , 0.  , 0.  , 0.  , 1.  ],
        [0.67, 0.  , 0.  , 0.  , 0.  , 1.  ],
        [0.83, 0.6 , 1.  , 0.  , 0.  , 1.  ],
        [0.83, 0.  , 0.  , 1.  , 0.  , 0.  ],
        [0.83, 0.  , 1.  , 0.  , 0.  , 1.  ],
        [0.67, 0.2 , 0.  , 0.  , 0.  , 0.  ],
        [0.83, 0.  , 0.  , 0.  , 0.  , 0.  ],
        [0.67, 1.  , 1.  , 0.  , 0.  , 1.  ],
        [0.83, 0.  , 0.  , 0.  , 0.  , 0.  ],
        [0.  , 0.32, 1.  , 0.  , 0.  , 0.  ],
        [0.67, 0.  , 1.  , 0.  , 0.  , 0.  ],
        [0.67, 0.  , 1.  , 0.  , 0.  , 1.  ],
        [0.83, 0.  , 0.  , 0.  , 1.  , 0.  ]]),
 array([0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1.,
        0.]))

In [12]:
# and predict with the test set
Phi_test, y_test

(array([[0.67, 0.48, 1.  , 0.  , 0.  , 0.  ],
        [0.83, 0.48, 1.  , 0.  , 0.  , 1.  ],
        [0.67, 0.  , 1.  , 0.  , 0.  , 0.  ],
        [0.67, 0.4 , 1.  , 0.  , 0.  , 0.  ],
        [1.  , 0.  , 1.  , 0.  , 0.  , 0.  ],
        [0.83, 0.8 , 0.  , 0.  , 0.  , 1.  ],
        [0.67, 0.  , 0.  , 0.  , 0.  , 0.  ],
        [0.83, 0.6 , 0.  , 0.  , 0.  , 0.  ],
        [0.67, 0.12, 1.  , 0.  , 0.  , 0.  ]]),
 array([1., 1., 0., 1., 1., 1., 0., 1., 1.]))