# 1. Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import model.model_setup as ms
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
pd.options.mode.chained_assignment = None  # default='warn'

student_data, pre_process = ms.create_student_data("../../data/High School East Student Data - Sheet1.csv")

print("pass")

pass


# 2. Feature Engineering Tests
I create a new copy of the student_data because the student_data will serve as a model for the many feature engineering
tests in this notebook.
## 2.1 Binning the Data
https://towardsdatascience.com/understanding-feature-engineering-part-1-continuous-numeric-data-da4e47099a7b
### 2.1.1 Fixed-Width

In [3]:
import copy as cp

graph_binning_results = False

model_pipeline = Pipeline(steps=[('number_fix', ms.SumTransformer(transformation="fixed", bins=13)),
                             #    ('preprocess', pre_process),
                                 ('model', DecisionTreeRegressor(random_state=1))
                                 ])

data_copy = cp.deepcopy(student_data)

#comparing fixed width binning and dynamic width binning
data_copy["HS_AB_FIXED"] = np.array(np.floor(np.array(data_copy["AbsencesSum_HS"])))

scores_fixed = np.array(ms.run_test(data_copy, "HS_AB_FIXED", model_pipeline, features_=features))

ms.print_scores(scores_fixed)
scores = []

#set to True if you want to graph the binning results
if graph_binning_results:
    for i in range(1,100):
        pipe = Pipeline(steps=[('number_fix', ms.SumTransformer(bins=i)),
                                #     ('preprocess', pre_process),
                                     ('model', DecisionTreeRegressor(random_state=1))
                                     ])

        scores.append(np.mean(np.array(ms.run_test(data_copy, "HS_AB_FIXED", pipe, features_=features))))

    fig, ax = plt.subplots()
    ax.plot(range(1,100), scores)

    fig.savefig("test.png")
    plt.show()
    plt.clf()

Scores:  [17.33731809 17.19736842 14.80473373 12.24116424 12.71544715]
Score mean:  14.859206326363434
Score variances:  4.612170978352201


### 2.1.2 Log Binning

In [4]:
data_copy_log = cp.deepcopy(student_data)

log_binning_pipeline_test = Pipeline(steps=[('number_fix', ms.SumTransformer(transformation="log")),
                            #     ('preprocess', pre_process),
                                 ('model', DecisionTreeRegressor(random_state=1))
                                 ])


scores_log = np.array(ms.run_test(data_copy_log, "AbsencesSum_HS", log_binning_pipeline_test, features_=features))

print_scores(scores_log)

Scores:  [21.96923077 16.69230769 14.69230769 16.95384615 20.16666667]
Score mean:  18.094871794871796
Score variances:  6.829911900065748


## 2.2 Scaling the Data
### 2.2.1 Standard Scaling

In [5]:
scaler = StandardScaler()

data_copy_scale = cp.deepcopy(student_data)

# scales the columns by looking at the whole thing before cross valid
#for j in ms.column_list("A", 6, 9):
#   data_copy_scale[j] = StandardScaler().fit_transform(np.array(data_copy_scale[j]).reshape(-1,1))

scale_process = ColumnTransformer(remainder='drop',
                                transformers=[('Scaling', StandardScaler(), ["A6", "A7", "A8"])])

standard_scale_pipeline = Pipeline(steps=[('number_fix', ms.SumTransformer(bins=7)),
                                 ('preprocess', scale_process),
                                 ('model', LinearRegression())
                                 ])

scores_scale = np.array(ms.run_test(data_copy_scale, "AbsencesSum_HS", standard_scale_pipeline, features_=features))
print_scores(scores_scale)

Scores:  [14.1173029  14.12586942 14.7003947  15.98329941 15.24343408]
Score mean:  14.834060102961967
Score variances:  0.504295881871075


When adding the standard scaler into the pipeline, there is no difference with or without scaling the absence columns.
Because Decision Trees do not really care if a continuous feature is normally distributed, I changed it to Linear Regression.
Scaling the entire columns before the cross validation into five different parts actually led to decreased performance
and increased variation. It is probably because using StandardScaler is unnecessary since the columns are already
the same units. Perhaps scaling the columns blurred the association between the number of absences in each grade
and the total absences in high school. I'm not sure.

What I am sure about is that Linear Regression has great results and a great score variance!

### 2.2.2 Log Transformation

Log transformations are used when there is a specific column that has a much higher variance than the other columns.

In [6]:
for j in ms.column_list("A", 6, 9):
    print(data_copy_scale[j].var())
#data_copy_scale["A6"] = np.log(data_copy_scale["A6"])

TypeError: could not convert string to float: 'TRANSFER'

The variations are pretty much the same (maybe). If I do a log transformation, then I'd need to do it for all
the columns. Log transforming the absences don't do that much. It has been tested: see feature binning by log above.

## 2.3 Imputing the Data
Testing different imputing methods.

In [None]:
from sklearn.impute import SimpleImputer
scaler = StandardScaler()

imputed_data = cp.deepcopy(student_data)

# scales the columns by looking at the whole thing before cross valid
#for j in ms.column_list("A", 6, 9):
#   data_copy_scale[j] = StandardScaler().fit_transform(np.array(data_copy_scale[j]).reshape(-1,1))

impute_pre = ColumnTransformer(remainder='passthrough',
                                transformers=[('Impute', SimpleImputer(strategy='mean'), ["A6", "A7", "A8"])
                                              ])

impute_pipeline = Pipeline(steps=[('number_fix', ms.SumTransformer(bins=12,  new_value=None)),
                                 ('preprocess', impute_pre),
                                 ('model', LinearRegression())
                                 ])
"""
Strategy=Mean
Scores:  [11.26007034 12.92051632 13.45643293 16.3414301  12.37170819]
Score mean:  13.270031576344786
Score variances:  2.8874648678364245
"""


scores_impute = np.array(ms.run_test(imputed_data, "AbsencesSum_HS", impute_pipeline, features_=features))
print_scores(scores_impute)


In [None]:
"""
model_pipeline.fit(student_data[features], student_data["AbsencesSum_HS"])

importances = model_pipeline.named_steps['Decision Tree'].feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(7):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

scores = -1 * cross_val_score(ms.run_test(tree, "Decision Tree"),
                              student_data[features],
                              student_data["AbsencesSum_HS"],
                              cv=5,
                              scoring='neg_mean_absolute_error')

#print(np.mean(scores))
"""

Testing the optimal number of features.

In [None]:
model_pipeline = Pipeline(steps=[('number_fix', ms.SumTransformer()),
                                 ('pre_process', pre_process),
                                 ('Decision Tree', tree)
                                 ])


print("hell")
%matplotlib inline

#RVEFC for determining optimal number of features??