# Miscellaneous Work

# Ideas to add

1. Further cleaning:
    - make days of the week variables into one categorical column with values 
    - like MW, TR, MWF, etc.
    - get back number of students (= grades) per class
    - combine course and subject names into departments


2. Plot ideas
    - are course difficulties split evenly between fall and spring semesters? #amro's idea
    - good way to visualize 'instructor_id', 'course_name', 'subject_name', 'facility_code'?
        - 'facility_code' = map of campus with key for facility_code numbers?
        - subject_name = would probably have to group into departments, then could visualize with circle sizes; same with course_name

# Parallel Work

In [24]:
# import libraries

import warnings
warnings.filterwarnings('ignore')

# general
import numpy as np
import pandas as pd

# preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

# modeling
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, \
GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# evaluation
from sklearn.metrics import r2_score, mean_absolute_error, \
mean_squared_error, mean_absolute_percentage_error

# tuning
from sklearn.model_selection import GridSearchCV

# feature engineering
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import set_config
set_config(display="text")
from kneed import KneeLocator

In [25]:
# load data
path = 'Data/all_grades_data_cleaned.csv'
df = pd.read_csv(path, index_col = 0)

In [26]:
df_fe = df.copy()

## 'num_all_grades'

In [28]:
# see description of column
df_fe['num_all_grades'].describe()

count    49975.000000
mean        37.047684
std         52.705481
min          1.000000
25%         13.000000
50%         20.000000
75%         35.000000
max        712.000000
Name: num_all_grades, dtype: float64

In [29]:
# small = under 15
# medium = 16-35
# large = over 35

# bin 
for index in df_fe['num_all_grades'].index:
    if df_fe.at[index, 'num_all_grades'] < 16:
        df_fe.at[index, 'num_all_grades'] = 'small'
    elif df_fe.at[index, 'num_all_grades'] < 36:
        df_fe.at[index, 'num_all_grades'] = 'medium'
    else:
        df_fe.at[index, 'num_all_grades'] = 'large'
      
# check
df_fe['num_all_grades'].value_counts()

medium    20701
small     17024
large     12250
Name: num_all_grades, dtype: int64

# Correlations

In [None]:
## Correlation heatmap

# to visualize correlations in a heatmap with categorical variables, they need
# to be converted to numbers; will use get_dummies to convert categorical
# columns to dummy columns that are binary encoded

# make a copy of df
df_corr = df.copy()

In [None]:
df.columns

In [None]:
# can binary encode days of week and weekend without using get_dummies
days_cols = ['mon', 'tues', 'wed', 'thurs', 'fri', 'weekend']

for col in days_cols:
  df_corr[col].replace({True: 1, False: 0, 'True': 1, 'False': 0}, 
                       inplace = True)
  # check
  print(df_corr[col].value_counts())

In [None]:
cat_cols = ['section_type', 'instructor_id', 'facility_code','subject_name', 
            'course_name', 'year', 'term', 'course_difficulty']

for col in cat_cols:

  # use get_dummies to create dummy variables for columns
  # example: dummy_cols = pd.get_dummies(df['col'], prefix = 'category')
  dummy_cols = pd.get_dummies(df_corr[col], prefix = str(col))

  # concatenate the dummy variables with the original df
  # example: df = pd.concat([df, dummy_cols], axis = 1)
  df_corr = pd.concat([df_corr, dummy_cols], axis = 1)

  # drop original categorical variable
  # example: df.drop(columns = 'col', inplace = True)
  df_corr.drop(columns = col, inplace = True)

# check
df_corr.info()

In [199]:
# make the correlations
corr = df_corr.corr()

In [200]:
# check
corr

Unnamed: 0,start_time,mon,tues,wed,thurs,fri,a_proportion,f_proportion,avg_grade,class_length,...,year_14,year_15,year_16,year_17,year_18,term_fall,term_spring,course_difficulty_advanced,course_difficulty_elementary,course_difficulty_intermediate
start_time,1.000000,0.158021,0.159183,0.162163,0.147013,0.001553,-0.140318,0.022375,-0.101099,0.426577,...,0.003184,0.000212,-0.001043,-0.000297,-0.002680,0.020453,-0.020453,-0.134619,0.058485,0.056868
mon,0.158021,1.000000,-0.499735,0.771070,-0.465654,0.532388,-0.217519,0.059189,-0.207958,-0.115468,...,-0.007911,-0.012212,-0.012997,-0.017070,-0.009197,0.028300,-0.028300,-0.187294,0.231860,-0.072834
tues,0.159183,-0.499735,1.000000,-0.550931,0.829200,-0.322488,-0.123339,0.011615,-0.098378,0.092673,...,-0.000573,-0.002351,-0.009695,-0.012293,-0.010329,0.012340,-0.012340,-0.079132,0.050282,0.017370
wed,0.162163,0.771070,-0.550931,1.000000,-0.518327,0.464109,-0.168055,0.054870,-0.165456,-0.089760,...,-0.006301,-0.012136,-0.007551,-0.006927,-0.002904,0.019367,-0.019367,-0.160753,0.182533,-0.045882
thurs,0.147013,-0.465654,0.829200,-0.518327,1.000000,-0.308618,-0.148295,0.019003,-0.123148,0.070654,...,-0.000807,-0.000278,-0.010577,-0.014155,-0.016916,0.007909,-0.007909,-0.091602,0.066410,0.011824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
term_fall,0.020453,0.028300,0.012340,0.019367,0.007909,0.028962,-0.038409,0.001864,-0.022418,-0.002830,...,-0.026459,-0.024630,-0.026089,-0.030456,0.191775,1.000000,-1.000000,-0.034414,0.048725,-0.019564
term_spring,-0.020453,-0.028300,-0.012340,-0.019367,-0.007909,-0.028962,0.038409,-0.001864,0.022418,0.002830,...,0.026459,0.024630,0.026089,0.030456,-0.191775,-1.000000,1.000000,0.034414,-0.048725,0.019564
course_difficulty_advanced,-0.134619,-0.187294,-0.079132,-0.160753,-0.091602,-0.133932,0.204297,-0.069988,0.202528,0.071621,...,0.002118,-0.000280,0.003900,0.002816,-0.006320,-0.034414,0.034414,1.000000,-0.437640,-0.419209
course_difficulty_elementary,0.058485,0.231860,0.050282,0.182533,0.066410,0.211775,-0.145207,0.083230,-0.153810,-0.156272,...,-0.008151,-0.010224,-0.019835,-0.015036,0.008248,0.048725,-0.048725,-0.437640,1.000000,-0.632867


In [201]:
# too many columns to visualize in heatmap; will look at coefficients in tables instead
corr_coefficients = corr.unstack().sort_values().drop_duplicates()

print(type(corr_coefficients))

# most strongly correlated variables (negative)
display(corr_coefficients.head())

# most strongly correlated variables (positive)
display(corr_coefficients.tail())

<class 'pandas.core.series.Series'>


term_spring                     term_fall                      -1.000000
section_type_lab                section_type_lec               -0.650262
course_difficulty_intermediate  course_difficulty_elementary   -0.632867
section_type_ind                start_time                     -0.554149
wed                             tues                           -0.550931
dtype: float64

instructor_id_309711.0                                              course_name_nutrition today                                0.984941
course_name_workshop in educational leadership and policy analysis  subject_name_educational leadership and policy analysis    0.985174
instructor_id_464620.0                                              course_name_business law                                   0.986384
facility_code_0129                                                  subject_name_naval science                                 0.995814
course_difficulty_elementary                                        course_difficulty_elementary                               1.000000
dtype: float64

In [202]:
corr_df = corr_coefficients.to_frame()
corr_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 527506 entries, ('term_spring', 'term_fall') to ('course_difficulty_elementary', 'course_difficulty_elementary')
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       527506 non-null  float64
dtypes: float64(1)
memory usage: 6.1+ MB


In [203]:
corr_df.head()

Unnamed: 0,Unnamed: 1,0
term_spring,term_fall,-1.0
section_type_lab,section_type_lec,-0.650262
course_difficulty_intermediate,course_difficulty_elementary,-0.632867
section_type_ind,start_time,-0.554149
wed,tues,-0.550931


In [204]:
corr_df.rename(columns = {0: 'coeff'}, inplace = True)

In [205]:
# positive strong correlations
strong_pos = corr_df[corr_df['coeff'] >= .7]

with pd.option_context('display.max_rows', None):
    print(strong_pos)

                                                                                                          coeff
course_name_nuclear reactor laboratory             instructor_id_599888.0                              0.700496
course_name_elementary latin                       subject_name_latin (classics)                       0.700620
subject_name_community and environmental sociology instructor_id_2601467.0                             0.701736
course_name_human resources development: educat... subject_name_counseling psychology                  0.704337
subject_name_biological systems engineering        facility_code_0080                                  0.705713
course_name_bank management                        instructor_id_747510.0                              0.707842
instructor_id_190420.0                             course_name_issues in psychology                    0.709151
course_name_introduction to scientific communic... instructor_id_280437.0                              0

In [206]:
# positive moderate correlations
mod_pos = corr_df[ (corr_df['coeff'] >= .5) & (corr_df['coeff'] < .7) ]

with pd.option_context('display.max_rows', None):
    print(mod_pos)

                                                                                                          coeff
instructor_id_4197612.0                            course_name_transitions: practice, professional...  0.500293
course_name_introduction to statistical methods    facility_code_0451a                                 0.501300
instructor_id_1600979.0                            course_name_survey of early hispanic literature     0.502305
facility_code_0557                                 course_name_crc first-year seminar: foundations...  0.502439
subject_name_physical therapy                      course_name_physical therapy educcation and lif...  0.502532
instructor_id_590681.0                             course_name_human resources development: career...  0.502685
course_name_second semester hebrew                 instructor_id_1062990.0                             0.503713
instructor_id_3370746.0                            subject_name_occupational therapy (department o...  0

In [207]:
# positive low correlations
low_pos = corr_df[ (corr_df['coeff'] >= .3) & (corr_df['coeff'] < .5) ]

with pd.option_context('display.max_rows', None):
    print(low_pos)

                                                                                                          coeff
subject_name_mechanical engineering                course_name_engineering design projects             0.300193
facility_code_0028                                 instructor_id_716313.0                              0.300207
instructor_id_2601788.0                            facility_code_0118                                  0.301437
facility_code_0450                                 course_name_physical therapy educcation and lif...  0.301460
instructor_id_4942976.0                            course_name_managing organizations                  0.301866
subject_name_curriculum and instruction            course_name_general seminar                         0.301946
course_name_special topics in mass communication   instructor_id_2601747.0                             0.302277
facility_code_1480                                 instructor_id_585086.0                              0

In [208]:
# negative low to strong correlations
neg = corr_df[corr_df['coeff'] <= -.3]

with pd.option_context('display.max_rows', None):
    print(neg)

                                                                coeff
term_spring                    term_fall                    -1.000000
section_type_lab               section_type_lec             -0.650262
course_difficulty_intermediate course_difficulty_elementary -0.632867
section_type_ind               start_time                   -0.554149
wed                            tues                         -0.550931
                               thurs                        -0.518327
tues                           mon                          -0.499735
section_type_sem               section_type_lec             -0.466160
thurs                          mon                          -0.465654
avg_grade                      f_proportion                 -0.459244
course_difficulty_elementary   course_difficulty_advanced   -0.437640
course_difficulty_intermediate course_difficulty_advanced   -0.419209
section_type_ind               section_type_lec             -0.392251
section_type_lec    

In [209]:
## Multivariate exploratory visualizations

# not histograms, boxplots, or correlation plots

df.columns

Index(['section_type', 'instructor_id', 'facility_code', 'start_time', 'mon',
       'tues', 'wed', 'thurs', 'fri', 'subject_name', 'course_name',
       'a_proportion', 'f_proportion', 'avg_grade', 'year', 'term',
       'class_length', 'total_time', 'weekend', 'course_difficulty'],
      dtype='object')