In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, KBinsDiscretizer, StandardScaler
from sklearn.tree import DecisionTreeRegressor
pd.options.mode.chained_assignment = None  # default='warn'


def print_scores(score_array):
    print("Scores: ", score_array)
    print("Score mean: ", np.mean(score_array))
    print("Score variances: ", np.var(score_array))

# easy way of accessing A_6, A_7, ... A_N columns
def column_list(letter, start, end):
    return ["%s%d" % (letter, i) for i in range(start, end)]


def remove_outliers(column, target, first, second):
    non_outliers = target.between(target.quantile(first), target.quantile(second))
    count = 0

    for index in range(0, len(column)):
        if ~non_outliers[index]:
            count += 1
            column.drop(index, inplace=True)

    print("%i outliers were removed" % count)

# convert strings to int type even if it's a float
# replace by median or mean?
def convert_stat(x, new_value=0):
    if not isinstance(x, int):

        if not isinstance(x, float) and '.' not in x:
            return new_value if x == "TRANSFER" else int(x)
        else:
            return new_value if x == "TRANSFER" else int(float(x))

    else:
        return x

def transform_value(x, imputer=None, bins=1, new_value=0):
    value = convert_stat(x, new_value=new_value)

    if imputer is None:
        return np.floor(value / float(bins))
    else:
        return value


class SumTransformer(BaseEstimator):

    # set new_value to None if Pipeline contains SimpleImputer
    # this is for absences and tardies since somet students are
    # transfer students. The placeholder in the CSV is the string "TRANSFER"
    def __init__(self, new_value=0, bins=1, transformation="fixed", imputer=None):
        self.new_value = new_value
        self.transformation = transformation
        self.bins = bins
        # SimpleImputer object
        self.imputer = imputer

        if self.new_value is None and self.imputer is None:
            raise ValueError("New value has been set to None but imputer argument is also None.")

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        # change to for i in ["A", "T"] to include tardies if need be
        for i in ["A"]:

            # corrects the values in the data frame that will be used in the training models
            for j in column_list(i, 6, 9):
                # if no function is provided, the stats will be converted regularly
                # where it can be divided into fixed-width bins
                # though unnecessary, this is to make testing new things easier
                if self.transformation == "fixed":
                    df[j] = df[j].apply(transform_value, args=(self.imputer, self.bins, self.new_value))

                elif self.transformation == "log":
                    df[j] = df[j].apply(lambda x: np.log((1 + convert_stat(x, new_value=self.new_value))))

                else:
                    raise Exception("Transformation argument was not correctly assigned.")

        if not isinstance(df, pd.DataFrame):
            raise ValueError("Hold up a minute.")

        return df


def run_test(data, target, pipeline, features_=["A6", "A7", "A8"]):

    scores_ = -1 * cross_val_score(pipeline,
                              data[features_],
                              data[target],
                              cv=5,
                              scoring='neg_mean_absolute_error')

    return scores_

print("pass")

pass


After setting up the functions needed for pre-processing, I set up the data. There are necessary pre-processing
*before* the actual pre-processing. Because some numerical columns contain string types, it is necessary
to convert them. Also, outliers are determined before the split (or pipeline), and removed.

In [2]:
student_data = pd.read_csv("../data/High School East Student Data - Sheet1.csv")
features = ["A6", "A7", "A8"]
student_data["AbsencesSum_HS"] = 0

# Pipeline doesn't allow transformations on the target label
# so I have to do transformations outside of Pipeline in order
# to sum all absences in High School for each student.
for j in column_list("A", 9, 13):
    student_data[j] = student_data[j].apply(convert_stat)


student_data["AbsencesSum_HS"] = student_data[column_list('A', 9, 13)].sum(axis=1)

# because we've created the total absences in high school column
# we are now able to eliminate outliers in the dataset.
remove_outliers(student_data, student_data["AbsencesSum_HS"], 0, 0.95)

pre_process = ColumnTransformer(remainder='passthrough',
                                transformers=[('categories', OneHotEncoder(), ["Gender", "IEP/Specialized"])])

model_pipeline = Pipeline(steps=[('number_fix', SumTransformer()),
                                 ('model', DecisionTreeRegressor(random_state=1))
                                 ])

#######################################
# Sorta like unit testing but in jupyter
test = run_test(student_data, "AbsencesSum_HS", model_pipeline)
prior_run = np.array([23.43076923, 16.23076923, 15.76923077, 17.93846154, 19.58333333])

if not np.allclose(test, prior_run, atol=0.001):
    raise Exception("Modification to pre-processing led to unintended results.")
#######################################

print("pass")

4 outliers were removed
pass


# 2. Feature Engineering Tests
I create a new copy of the student_data because the student_data will serve as a model for the many feature engineering
tests in this notebook.
## 2.1 Binning the Data
https://towardsdatascience.com/understanding-feature-engineering-part-1-continuous-numeric-data-da4e47099a7b
### 2.1.1 Fixed-Width

In [3]:
import copy as cp

model_pipeline = Pipeline(steps=[('number_fix', SumTransformer(transformation="fixed", bins=13)),
                             #    ('preprocess', pre_process),
                                 ('model', DecisionTreeRegressor(random_state=1))
                                 ])

data_copy = cp.deepcopy(student_data)

#comparing fixed width binning and dynamic width binning
data_copy["HS_AB_FIXED"] = np.array(np.floor(np.array(data_copy["AbsencesSum_HS"])))

scores_fixed = np.array(run_test(data_copy, "HS_AB_FIXED", model_pipeline, features_=features))

print_scores(scores_fixed)
scores = []

#set to True if you want to graph the binning results
if False:
    for i in range(1,100):
        pipe = Pipeline(steps=[('number_fix', SumTransformer(bins=i)),
                                #     ('preprocess', pre_process),
                                     ('model', DecisionTreeRegressor(random_state=1))
                                     ])

        scores.append(np.mean(np.array(run_test(data_copy, "HS_AB_FIXED", pipe, features_=features))))

    fig, ax = plt.subplots()
    ax.plot(range(1,100), scores)

    fig.savefig("test.png")
    plt.show()
    plt.clf()

Scores:  [17.33731809 17.19736842 14.80473373 12.24116424 12.71544715]
Score mean:  14.859206326363434
Score variances:  4.612170978352201


### 2.1.2 Log Binning

In [4]:
data_copy_log = cp.deepcopy(student_data)

log_binning_pipeline_test = Pipeline(steps=[('number_fix', SumTransformer(transformation="log")),
                            #     ('preprocess', pre_process),
                                 ('model', DecisionTreeRegressor(random_state=1))
                                 ])


scores_log = np.array(run_test(data_copy_log, "AbsencesSum_HS", log_binning_pipeline_test, features_=features))

print_scores(scores_log)

Scores:  [21.96923077 16.69230769 14.69230769 16.95384615 20.16666667]
Score mean:  18.094871794871796
Score variances:  6.829911900065748


## 2.2 Scaling the Data
### 2.2.1 Standard Scaling

In [5]:
scaler = StandardScaler()

data_copy_scale = cp.deepcopy(student_data)

# scales the columns by looking at the whole thing before cross valid
#for j in column_list("A", 6, 9):
#   data_copy_scale[j] = StandardScaler().fit_transform(np.array(data_copy_scale[j]).reshape(-1,1))

scale_process = ColumnTransformer(remainder='drop',
                                transformers=[('Scaling', StandardScaler(), ["A6", "A7", "A8"])])

standard_scale_pipeline = Pipeline(steps=[('number_fix', SumTransformer(bins=7)),
                                 ('preprocess', scale_process),
                                 ('model', LinearRegression())
                                 ])

scores_scale = np.array(run_test(data_copy_scale, "AbsencesSum_HS", standard_scale_pipeline, features_=features))
print_scores(scores_scale)

Scores:  [14.1173029  14.12586942 14.7003947  15.98329941 15.24343408]
Score mean:  14.834060102961967
Score variances:  0.504295881871075


When adding the standard scaler into the pipeline, there is no difference with or without scaling the absence columns.
Because Decision Trees do not really care if a continuous feature is normally distributed, I changed it to Linear Regression.
Scaling the entire columns before the cross validation into five different parts actually led to decreased performance
and increased variation. It is probably because using StandardScaler is unnecessary since the columns are already
the same units. Perhaps scaling the columns blurred the association between the number of absences in each grade
and the total absences in high school. I'm not sure.

What I am sure about is that Linear Regression has great results and a great score variance!

### 2.2.2 Log Transformation

Log transformations are used when there is a specific column that has a much higher variance than the other columns.

In [6]:
for j in column_list("A", 6, 9):
    print(data_copy_scale[j].var())
#data_copy_scale["A6"] = np.log(data_copy_scale["A6"])

TypeError: could not convert string to float: 'TRANSFER'

The variations are pretty much the same (maybe). If I do a log transformation, then I'd need to do it for all
the columns. Log transforming the absences don't do that much. It has been tested: see feature binning by log above.

## 2.3 Imputing the Data
Testing different imputing methods.

In [None]:
from sklearn.impute import SimpleImputer
scaler = StandardScaler()

imputed_data = cp.deepcopy(student_data)

# scales the columns by looking at the whole thing before cross valid
#for j in column_list("A", 6, 9):
#   data_copy_scale[j] = StandardScaler().fit_transform(np.array(data_copy_scale[j]).reshape(-1,1))

impute_pre = ColumnTransformer(remainder='passthrough',
                                transformers=[('Impute', SimpleImputer(strategy='mean'), ["A6", "A7", "A8"])
                                              ])

impute_pipeline = Pipeline(steps=[('number_fix', SumTransformer(bins=12,  new_value=None)),
                                 ('preprocess', impute_pre),
                                 ('model', LinearRegression())
                                 ])
"""
Strategy=Mean
Scores:  [11.26007034 12.92051632 13.45643293 16.3414301  12.37170819]
Score mean:  13.270031576344786
Score variances:  2.8874648678364245
"""


scores_impute = np.array(run_test(imputed_data, "AbsencesSum_HS", impute_pipeline, features_=features))
print_scores(scores_impute)


In [None]:
"""
model_pipeline.fit(student_data[features], student_data["AbsencesSum_HS"])

importances = model_pipeline.named_steps['Decision Tree'].feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(7):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

scores = -1 * cross_val_score(run_test(tree, "Decision Tree"),
                              student_data[features],
                              student_data["AbsencesSum_HS"],
                              cv=5,
                              scoring='neg_mean_absolute_error')

#print(np.mean(scores))
"""

Testing the optimal number of features.

In [None]:
model_pipeline = Pipeline(steps=[('number_fix', SumTransformer()),
                                 ('pre_process', pre_process),
                                 ('Decision Tree', tree)
                                 ])


print("hell")
%matplotlib inline

#RVEFC for determining optimal number of features??