# Classification

We will now try this problem as a classification problem instead of regression. We will transform the target into categories, and then try classification models to see how well the target class can be predicted.

# Preliminary Steps

In [103]:
# import libraries

import warnings
warnings.filterwarnings('ignore')

# general
import numpy as np
import pandas as pd

# preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

# modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, \
GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# evaluation
from sklearn.metrics import classification_report, accuracy_score, \
precision_score, recall_score, ConfusionMatrixDisplay

# tuning
from sklearn.model_selection import GridSearchCV

# feature engineering
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import set_config
set_config(display="text")
from kneed import KneeLocator



In [104]:
# load data
path = 'Data/all_grades_data_cleaned.csv'
df = pd.read_csv(path, index_col = 0)

# inspect
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49012 entries, 12 to 64185
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   section_type       49012 non-null  object 
 1   instructor_id      49012 non-null  object 
 2   facility_code      45521 non-null  object 
 3   start_time         49012 non-null  float64
 4   mon                49012 non-null  bool   
 5   tues               49012 non-null  bool   
 6   wed                49012 non-null  bool   
 7   thurs              49012 non-null  bool   
 8   fri                49012 non-null  bool   
 9   subject_name       49012 non-null  object 
 10  course_name        49012 non-null  object 
 11  a_proportion       49012 non-null  float64
 12  f_proportion       49012 non-null  float64
 13  avg_grade          49012 non-null  float64
 14  year               49012 non-null  int64  
 15  term               49012 non-null  object 
 16  class_length       49

In [105]:
# create copy to work on
df1 = df.copy()

In [106]:
# check for duplicates
df.duplicated().sum()

0

In [107]:
# check for missing values
df.isna().sum()

# 'facility_code' is the only column with missing values; will need to impute

section_type            0
instructor_id           0
facility_code        3491
start_time              0
mon                     0
tues                    0
wed                     0
thurs                   0
fri                     0
subject_name            0
course_name             0
a_proportion            0
f_proportion            0
avg_grade               0
year                    0
term                    0
class_length            0
total_time              0
weekend                 0
course_difficulty       0
dtype: int64

# Feature Engineering (Before Pre-Processing)

The schedule variables take up 6 columns for days of the week (Saturday and Sunday are combined in 'weekend'). However this doesn't capture the schedules of classes; there is a high probability of a class that is held on Monday also being held on Wednesday, for example. 

So, to reduce this collinearity, we will make new schedule-related columns related to the actual schedules of the courses.

## Schedule

In [108]:
# binary-encode days of week columns
binary_cols = ['mon', 'tues', 'wed', 'thurs', 'fri', 'weekend']

for col in binary_cols:
    df1[col].replace({True: 1, False: 0}, inplace = True)
    df1[col].replace({True: 1, False: 0}, inplace = True)

# check
for col in binary_cols:
    print(df1[col].value_counts())
    print(df1[col].value_counts())

0    26559
1    22453
Name: mon, dtype: int64
0    26559
1    22453
Name: mon, dtype: int64
0    27148
1    21864
Name: tues, dtype: int64
0    27148
1    21864
Name: tues, dtype: int64
0    27180
1    21832
Name: wed, dtype: int64
0    27180
1    21832
Name: wed, dtype: int64
0    28022
1    20990
Name: thurs, dtype: int64
0    28022
1    20990
Name: thurs, dtype: int64
0    36491
1    12521
Name: fri, dtype: int64
0    36491
1    12521
Name: fri, dtype: int64
0    48747
1      265
Name: weekend, dtype: int64
0    48747
1      265
Name: weekend, dtype: int64


In [109]:
# new column: how many days per week the course is held (numeric, to be 
# one-hot encoded); weekend counts as one day (can go back to original
# data and change if needed)

df1['days_per_week'] = df1['mon'] + df1['tues'] + df1['wed'] + \
                         df1['thurs'] + df1['fri'] + df1['weekend']

# check
df1[['mon', 'tues', 'wed', 'thurs', 'fri', 'weekend', 'days_per_week']].sample(10)

Unnamed: 0,mon,tues,wed,thurs,fri,weekend,days_per_week
25830,0,1,0,1,0,0,2
31219,0,1,0,1,0,0,2
33334,0,1,0,1,0,0,2
17041,1,0,1,0,0,0,2
5219,0,0,0,0,0,0,0
33942,1,0,1,0,1,0,3
30710,1,0,1,0,1,0,3
32779,1,0,1,0,1,0,3
30015,1,0,1,0,0,0,2
3303,0,1,0,1,0,0,2


In [110]:
# change 1s and 0s in days of week columns to abbreviations 
# (M, T, W, R, F, E)

df1['mon'].replace({1: 'M', 0: ''}, inplace = True)
df1['tues'].replace({1: 'T', 0: ''}, inplace = True)
df1['wed'].replace({1: 'W', 0: ''}, inplace = True)
df1['thurs'].replace({1: 'R', 0: ''}, inplace = True)
df1['fri'].replace({1: 'F', 0: ''}, inplace = True)
df1['weekend'].replace({1: 'E', 0: ''}, inplace = True)

# check
df1[['mon', 'tues', 'wed', 'thurs', 'fri', 'weekend']].sample(10)

Unnamed: 0,mon,tues,wed,thurs,fri,weekend
5901,,,,,,
32563,M,,W,,F,
13323,,,,,F,
57224,,T,,,,
47793,M,T,,R,F,
5671,,,,,,
12477,,,,,,
55051,,T,,R,,
56838,,T,,,,
51025,M,,,,,


In [111]:
# concatenate days of week columns into one
df1['schedule_days'] = df1['mon'] + df1['tues'] + df1['wed'] + df1['thurs'] + df1['fri'] + df1['weekend']

# check
df1['schedule_days'].value_counts()

TR        15434
MWF        9089
MW         7137
           3576
T          2414
W          2268
M          1948
MTWR       1813
R          1492
MTWRF      1151
MTRF        870
F           869
E           245
MF          205
WF          162
MWR          88
MTWF         69
MTR          31
TRF          31
TWRF         26
MR           21
FE           18
WR           10
MWRF          9
MTF           7
MT            5
MRF           5
TF            4
RF            4
MTW           3
TWR           3
TW            2
ME            1
MTWRFE        1
WRF           1
Name: schedule_days, dtype: int64

In [112]:
# replace '' with 'none'
df1['schedule_days'] = df1['schedule_days'].apply(lambda x: 'none' if x == '' else x)

# check
df1['schedule_days'].value_counts()

TR        15434
MWF        9089
MW         7137
none       3576
T          2414
W          2268
M          1948
MTWR       1813
R          1492
MTWRF      1151
MTRF        870
F           869
E           245
MF          205
WF          162
MWR          88
MTWF         69
MTR          31
TRF          31
TWRF         26
MR           21
FE           18
WR           10
MWRF          9
MTF           7
MT            5
MRF           5
TF            4
RF            4
MTW           3
TWR           3
TW            2
ME            1
MTWRFE        1
WRF           1
Name: schedule_days, dtype: int64

In [113]:
# bin all schedules with fewer than 100 values into 'other'
keep_schedules = ['TR', 'MWF', 'MW', 'none', 'T', 'W', 'M', 'MTWR', 
                 'R', 'MTWRF', 'MTRF', 'F', 'E', 'MF', 'WF']

df1['schedule_days'] = df1['schedule_days'].apply(lambda x: x if x in keep_schedules else 'other')

# check
df1['schedule_days'].value_counts()

TR       15434
MWF       9089
MW        7137
none      3576
T         2414
W         2268
M         1948
MTWR      1813
R         1492
MTWRF     1151
MTRF       870
F          869
other      339
E          245
MF         205
WF         162
Name: schedule_days, dtype: int64

In [114]:
# drop original columns
df1.drop(columns = ['mon', 'tues', 'wed', 'thurs', 'fri', 'weekend'], 
        inplace = True)

# check
df1.columns

Index(['section_type', 'instructor_id', 'facility_code', 'start_time',
       'subject_name', 'course_name', 'a_proportion', 'f_proportion',
       'avg_grade', 'year', 'term', 'class_length', 'total_time',
       'course_difficulty', 'days_per_week', 'schedule_days'],
      dtype='object')

## Total Time

We will drop the 'total_time' column now, since it represents information that is independently coded into the 'days_per_week' and 'class_length' columns, and represents their product.

In [115]:
df1.drop(columns = 'total_time', inplace = True)

# check
df1.columns

Index(['section_type', 'instructor_id', 'facility_code', 'start_time',
       'subject_name', 'course_name', 'a_proportion', 'f_proportion',
       'avg_grade', 'year', 'term', 'class_length', 'course_difficulty',
       'days_per_week', 'schedule_days'],
      dtype='object')

## Course Name

We will drop the 'course_name' column, since it has a high number of unique values. It is binned, but it is also related to 'subject_name', which also has many unique values (although binned), so we believe it contributes too much division of the data without a large amount of information to contribute to the target.

In [116]:
df1.drop(columns = 'course_name', inplace = True)

# check
df1.columns

Index(['section_type', 'instructor_id', 'facility_code', 'start_time',
       'subject_name', 'a_proportion', 'f_proportion', 'avg_grade', 'year',
       'term', 'class_length', 'course_difficulty', 'days_per_week',
       'schedule_days'],
      dtype='object')

## Start Time

'start_time' is given in minutes, so it is a numerical variable. Courses without an assigned start_time are coded as -1. 'start_time' shouldn't be treated as a numeric variable since there is not a natural size order to times; for example, it's possible that courses in the afternoon give out higher grades than both courses in the mornings and evenings. So, we want to treat 'start_time' as a categorical variable.

Instead of just one-hot encoding the column as-is, we will bin courses by start_time to morning, afternoon, evening, and none, to reduce the number of dummy variables made with one-hot encoding. Then we will be able to one-hot encode this variable.

Let's check out the distribution of 'start_time's before binning.

In [117]:
# check start times without -1 (no start time)
df1[df1['start_time'] != -1].describe()

Unnamed: 0,start_time,a_proportion,f_proportion,avg_grade,year,class_length,days_per_week
count,45414.0,45414.0,45414.0,45414.0,45414.0,45414.0,45414.0
mean,730.335249,0.490533,0.008711,3.489751,12.188818,79.89948,2.19952
std,153.560322,0.282317,0.024151,0.3697,3.396662,45.183048,0.917262
min,390.0,0.0,0.0,0.0,7.0,40.0,0.0
25%,595.0,0.250253,0.0,3.244444,9.0,50.0,2.0
50%,725.0,0.4375,0.0,3.529412,12.0,75.0,2.0
75%,865.0,0.714286,0.0,3.790323,15.0,75.0,3.0
max,1260.0,1.0,1.0,4.0,18.0,600.0,6.0


Earliest start time is 6:30am (390) and latest start time is 9pm (1260).

Time bins:
- -1 = 'none'
- 390 - 479 = early morning (6:30-7:59am) (1.5 hours)
- 480 - 599 = mid-morning (8-9:59am) (2 hours)
- 600 - 719 = late morning (10-11:59am) (2 hours)
- 720 - 839 = early afternoon (noon-1:59pm) (2 hours)
- 840 - 959 = mid-afternoon (2-3:59pm) (2 hours)
- 960 - 1079 = late afternoon (4-5:59pm) (2 hours)
- 1080 - 1260 = evening (6-9pm) (3 hours)

In [118]:
for index in df1['start_time'].index:
    if df1.at[index, 'start_time'] == -1:
        df1.at[index, 'start_time'] = 'none'
    elif df1.at[index, 'start_time'] < 480:
        df1.at[index, 'start_time'] = 'early morning'
    elif df1.at[index, 'start_time'] < 600:
        df1.at[index, 'start_time'] = 'mid-morning'
    elif df1.at[index, 'start_time'] < 720:
        df1.at[index, 'start_time'] = 'late morning'
    elif df1.at[index, 'start_time'] < 840:
        df1.at[index, 'start_time'] = 'early afternoon'
    elif df1.at[index, 'start_time'] < 960:
        df1.at[index, 'start_time'] = 'mid-afternoon'
    elif df1.at[index, 'start_time'] < 1080:
        df1.at[index, 'start_time'] = 'late afternoon'
    else:
        df1.at[index, 'start_time'] = 'evening'
        
# check
df1['start_time'].value_counts()

mid-morning        12654
early afternoon    11369
late morning        8020
mid-afternoon       8006
late afternoon      3914
none                3598
early morning        755
evening              696
Name: start_time, dtype: int64

## Class Length

'class_length' is also a numeric variable that should not be numeric since it doesn't have a reasonable expectation that longer or shorter classes will always outperform the other. It may be the case that courses with a middling length may outperform both longer and shorter classes, for example.

So, we also want to one-hot encode this column, but again have too many values to one-hot encode as-is. We will look at the distribution of times in this column and bin them accordingly.

In [119]:
df1['class_length'].value_counts()

50.0     19284
75.0     15419
0.0       3598
150.0     3371
120.0     1991
         ...  
340.0        1
325.0        1
113.0        1
40.0         1
520.0        1
Name: class_length, Length: 61, dtype: int64

In [120]:
class_length_no_zero = df1[df1['class_length'] != 0]
class_length_no_zero['class_length'].describe()

count    45414.000000
mean        79.899480
std         45.183048
min         40.000000
25%         50.000000
50%         75.000000
75%         75.000000
max        600.000000
Name: class_length, dtype: float64

In [121]:
# bin into less than 75 and 75 or more
df1['class_length'] = df1['class_length'].apply(lambda x: 'shorter' 
                                               if x < 75 else 'longer')

# check
df1['class_length'].value_counts()

longer     25590
shorter    23422
Name: class_length, dtype: int64

## Target Columns

The target columns 'avg_grade', 'a_proportion', and 'f_proportion' are all numeric in this dataset, so we treated this as a regression problem (see separate notebook in this repository). In the interest of exploring classification models, for this notebook we will convert the target column 'avg_grade' into categories and try to predict its values with classification models. We will drop 'a_proportion' and 'f_proportion' for now.

In [122]:
# drop 'a_proportion' and 'f_proportion'
df1.drop(columns = ['a_proportion', 'f_proportion'], inplace = True)

# check
df1.columns

Index(['section_type', 'instructor_id', 'facility_code', 'start_time',
       'subject_name', 'avg_grade', 'year', 'term', 'class_length',
       'course_difficulty', 'days_per_week', 'schedule_days'],
      dtype='object')

We will break down the target column ('avg_grade') into grade categories as follows:     
- A: x == 4.0
- AB: 3.5 <= x < 4.0
- B: 3.0 <= x < 3.5
- BC: 2.5 <= x < 3.0
- C: 2.0 <= x < 2.5
- D: 1.0 <= x < 2.0
- F: x < 1.0

Source for categories: https://registrar.wisc.edu/valid-grades/

In [125]:
# categorize 'avg_grade' column
for index in df1['avg_grade'].index:
    if df1.at[index, 'avg_grade'] == 4.0:
        df1.at[index, 'avg_grade'] = 'A'
    elif df1.at[index, 'avg_grade'] >= 3.5:
        df1.at[index, 'avg_grade'] = 'AB'
    elif df1.at[index, 'avg_grade'] >= 3.0:
        df1.at[index, 'avg_grade'] = 'B'
    elif df1.at[index, 'avg_grade'] >= 2.5:
        df1.at[index, 'avg_grade'] = 'BC'
    elif df1.at[index, 'avg_grade'] >= 2.0:
        df1.at[index, 'avg_grade'] = 'C'
    elif df1.at[index, 'avg_grade'] >= 1.0:
        df1.at[index, 'avg_grade'] = 'D'
    else:
        df1.at[index, 'avg_grade'] = 'F'

# check
df1['avg_grade'].value_counts()

AB    22165
B     16384
A      5578
BC     4532
C       325
D        27
F         1
Name: avg_grade, dtype: int64

## Final Check

In [126]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49012 entries, 12 to 64185
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   section_type       49012 non-null  object
 1   instructor_id      49012 non-null  object
 2   facility_code      45521 non-null  object
 3   start_time         49012 non-null  object
 4   subject_name       49012 non-null  object
 5   avg_grade          49012 non-null  object
 6   year               49012 non-null  int64 
 7   term               49012 non-null  object
 8   class_length       49012 non-null  object
 9   course_difficulty  49012 non-null  object
 10  days_per_week      49012 non-null  int64 
 11  schedule_days      49012 non-null  object
dtypes: int64(2), object(10)
memory usage: 5.9+ MB


In [127]:
for col in df1.columns:
    print(col)
    print(df1[col].nunique())
    print()

section_type
6

instructor_id
522

facility_code
119

start_time
8

subject_name
180

avg_grade
7

year
12

term
2

class_length
2

course_difficulty
3

days_per_week
7

schedule_days
16



We have hopefully improved our data's interpretability for the model by making these changes to our features. They are all now able to be appropriately one-hot encoded, and the largest number of unique values for any one column is 522 for 'instructor_id'. 

# Preprocessing

In [128]:
# assign X and y
target = 'avg_grade'
y = df1[target]
X = df1.drop(columns = target)

# check
print(f"y: \n{y}")
print(f"X: \n{X}")

y: 
12       AB
13        B
14       AB
15       AB
16        B
         ..
64181     B
64182     B
64183     B
64184     B
64185    AB
Name: avg_grade, Length: 49012, dtype: object
X: 
      section_type instructor_id facility_code       start_time  \
12             lec         other           NaN             none   
13             lec         other          0545     late morning   
14             lec         other          0545     late morning   
15             lec         other          0545      mid-morning   
16             lec         other          0545      mid-morning   
...            ...           ...           ...              ...   
64181          lec         other          0093  early afternoon   
64182          lec         other          0093  early afternoon   
64183          lec         other          0093  early afternoon   
64184          lec         other          0093  early afternoon   
64185          lab         other          0021     late morning   

         

In [129]:
# validate model with train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# check
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (36759, 11)
X_test shape: (12253, 11)
y_train shape: (36759,)
y_test shape: (12253,)


## One-Hot Encoded Columns

In [131]:
# make list
ohe_cols = ['section_type', 'instructor_id', 'start_time', 'subject_name',
           'year', 'term', 'class_length', 'course_difficulty', 
           'days_per_week', 'schedule_days']

# instantiate ohe
ohe1 = OneHotEncoder(handle_unknown = 'ignore', 
                     sparse = False)

# make tuple
ohe_tuple = (ohe1, ohe_cols)

## Imputed and One-Hot Encoded Columns

In [134]:
# make list
imp_ohe_cols = ['facility_code']

# instantiate imputer
missing_imputer = SimpleImputer(strategy = 'constant', 
                               fill_value = 'missing',
                               add_indicator = True)

# instantiate ohe
ohe2 = OneHotEncoder(handle_unknown = 'ignore', 
                     sparse = False)

# make tuple
imp_ohe_tuple = (ohe2, imp_ohe_cols)

## Create Preprocessor Object

In [136]:
# create preprocessor
preprocessor = make_column_transformer(ohe_tuple,
                                      imp_ohe_tuple,
                                      remainder = 'drop')

# check
preprocessor

ColumnTransformer(transformers=[('onehotencoder-1',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['section_type', 'instructor_id', 'start_time',
                                  'subject_name', 'year', 'term',
                                  'class_length', 'course_difficulty',
                                  'days_per_week', 'schedule_days']),
                                ('onehotencoder-2',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['facility_code'])])

# Evaluation Functions

In [137]:
# create dataframe to save evaluated model scores
scores_df = pd.DataFrame()

# check
scores_df.shape

(0, 0)

In [None]:
# define function that will print classification metrics for current model
# and will store classification metrics into scores_df with all other scores
# to be called by 'scores_df'

def pipe_fit_and_evaluate_model(model, X_train, X_test, y_train, y_test, 
                                model_name):
    
    # make pipeline with preprocessor
    model_pipe = make_pipeline(preprocessor, model)
    
    # fit on training data
    model_pipe.fit(X_train, y_train)
    
    # create test data predictions
    test_pred = model_pipe.predict(X_test)
    
    # print model name
    print(f"{model_name} classification metrics on test data:")
    
    # print classification report
    print(classification_report(y_test, test_pred))
    
    # print roc auc
    print("roc auc score here")
    
    # display confusion matrix
    display(ConfusionMatrixDisply.from_predictions(y_test, 
                                                   test_pred, 
                                                   normalize = 'all'))
    
    # store in scores_df
    scores_df.at[model_name, 'Test Accuracy'] = 
    scores_df.at[model_name, 'Test Recall (Sensitivity)'] = 
    scores_df.at[model_name, 'Test Precision'] = 
    scores_df.at[model_name, 'Test Specificity'] = 
    scores_df.at[model_name, 'Test ROC AUC'] = 
    
    # classification metrics: 
    # accuracy
    # recall (sensitivity)
    # precision
    # specificity (recall on negative class)
    # classification report
    # confusion matrix 
    # roc auc 