In [None]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from fancyimpute import KNN, SimpleFill
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_curve, roc_auc_score, auc
import seaborn as sb


In [None]:
# Seed every global RNG this notebook relies on. Seeding only Python's `random`
# leaves NumPy's global generator unseeded, and scikit-learn falls back to that
# generator whenever no explicit random_state is given.
random.seed(136)
np.random.seed(136)

In [None]:
# Load the dataset from a CSV file located in the current working directory.
df = pd.read_csv('./dataset_predictions_complete.csv')

In [None]:
# Blank out implausibly high values: every entry greater than 1 in the
# predictor columns (second column up to, but excluding, the last two)
# becomes NaN.
above_one = df.iloc[:, 1:-2] > 1
df[above_one] = np.nan

In [None]:
# Display the (rows, columns) shape of df.
df.shape

In [None]:
# Pairwise correlation of every column except the first and the last one,
# drawn as a square heatmap.
corr_source = df.iloc[:, 1:-1]
C_mat = corr_source.corr()

fig = plt.figure(figsize=(15, 15))
sb.heatmap(data=C_mat, vmax=1.0, square=True, cmap='jet')
plt.show()

### Imputation

In [None]:
# Fill the NaNs in the predictor columns (df.iloc[:, 1:-2]) using the selected
# strategy. Supported values: 'knn', 'mean', 'median'.
# NOTE(review): imputation runs on the full frame before the train/test split,
# so held-out rows influence the fill values - confirm this is acceptable.
imputation_types = 'knn' # 'knn', 'mean', 'median'
if imputation_types == 'knn':
    imputer = KNN()
elif imputation_types in ('mean', 'median'):
    # SimpleFill replaces each NaN with the per-column statistic named by
    # fill_method ('mean' is its default).
    imputer = SimpleFill(fill_method=imputation_types)
else:
    # Fail loudly instead of silently leaving the NaNs in place.
    raise ValueError(
        "Unknown imputation type {!r}; expected 'knn', 'mean' or 'median'".format(imputation_types)
    )

df_filled = imputer.fit_transform(df.iloc[:, 1:-2])
df.iloc[:, 1:-2] = df_filled

### Train test split

In [None]:
# Split the rows into train and test partitions (default proportions).
# A fixed random_state makes the partition identical on every run, so the
# metrics reported below are reproducible.
train, test = train_test_split(df, random_state=136)

In [None]:
# Training target: the second-to-last column.
# Training predictors: second column up to, but excluding, the last two.
train_y = np.array(train.iloc[:, -2])
train_x = np.array(train.iloc[:, 1:-2])
print("Number of training samples: {}".format(len(train_x)))

In [None]:
# Display the (samples, features) shape of the training predictors.
train_x.shape

In [None]:
# Test target: the second-to-last column.
# Test predictors: second column up to, but excluding, the last two.
test_y = np.array(test.iloc[:, -2])
test_x = np.array(test.iloc[:, 1:-2])
print("Number of testing samples: {}".format(len(test_x)))

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit a Gaussian naive Bayes model on the training split, then predict both splits.
# NOTE(review): GaussianNB is a classifier, yet the predictions are scored with
# mean squared error below - confirm the target column holds discrete labels.
gnb = GaussianNB()
clf = gnb.fit(train_x, train_y)
train_y_predicted, test_y_predicted = (
    clf.predict(split) for split in (train_x, test_x)
)

In [None]:
# Mean squared error of the naive Bayes predictions on the training split.
mean_squared_error(y_true=train_y, y_pred=train_y_predicted)

In [None]:
# Mean squared error of the naive Bayes predictions on the test split.
mean_squared_error(y_true=test_y, y_pred=test_y_predicted)

### Linear regression - train/test splits

In [None]:
# from sklearn.feature_selection import chi2, f_regression

In [None]:
# # Create linear regression object
# regr = linear_model.LinearRegression()

In [None]:
# # Train the model using the training sets
# regr.fit(train_x, train_y)

In [None]:
# # Make predictions using the testing set
# pred_train = regr.predict(train_x)

# # The coefficients
# print('Coefficients: \n', regr.coef_)

# # The mean squared error
# print("Mean squared error: %.2f" % mean_squared_error(train_y, pred_train))

# # Explained variance score: 1 is perfect prediction
# print('Variance score: %.2f' % r2_score(train_y, pred_train))

In [None]:
# # Make predictions using the testing set
# pred_test = regr.predict(test_x)

# # The coefficients
# print('Coefficients: \n', regr.coef_)

# # The mean squared error
# print("Mean squared error: %.2f" % mean_squared_error(test_y, pred_test))

# # Explained variance score: 1 is perfect prediction
# print('Variance score: %.2f' % r2_score(test_y, pred_test))

In [None]:
# F, pval = f_regression(train_x, train_y)

# Quick neural net in Keras

In [None]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import os
import keras
import matplotlib.pyplot as plt
from keras.callbacks import ReduceLROnPlateau
import math
import keras.backend as K

In [None]:
# Expose only the GPU with index 3 to CUDA for this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

In [None]:
def baseline_model(input_dim=None, learning_rate=0.05):
    """Build and compile a small fully connected regression network.

    Architecture: Dense(128, relu) -> Dense(256, relu) -> Dense(256, relu)
    -> Dense(1, linear). Compiled with the Adam optimizer and a mean absolute
    error loss; MAE and MSE are also tracked as metrics.

    Args:
        input_dim: Number of input features. When None (the default) it is
            taken from the module-level ``train_x``, matching the original
            behaviour.
        learning_rate: Initial Adam learning rate (default 0.05).

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = train_x.shape[1]

    model = Sequential()
    # Only the first layer needs the input size; later layers infer theirs
    # from the preceding layer's output.
    model.add(Dense(128, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))

    # Use the canonical class name: the lowercase `adam` alias is not present
    # in later Keras releases, whereas `Adam` exists in all of them.
    adam = keras.optimizers.Adam(lr=learning_rate, decay=0.0)
    model.compile(loss='mean_absolute_error', optimizer=adam, metrics=['mae', 'mse'])
    return model

In [None]:
def step_decay(epoch):
    """Step-wise learning-rate schedule: start at 0.01 and halve every 500 epochs.

    The rate for ``epoch`` is ``0.01 * 0.5 ** floor((epoch + 1) / 500)``.
    The computed rate is printed and then returned.
    """
    base_rate, factor, period = 0.01, 0.5, 500.0
    n_drops = math.floor((epoch + 1) / period)
    rate = base_rate * math.pow(factor, n_drops)
    print(rate)
    return rate

In [None]:
class LossHistory(keras.callbacks.Callback):
    """Keras callback that records the training loss reported after each epoch.

    Attributes (reset at the start of every training run):
        losses: one entry per finished epoch, the value of ``logs['loss']``
            (None when the key is absent).
        lr: created empty; nothing in this class appends to it.
    """

    def on_train_begin(self, logs=None):
        """Reset the recorded history when training starts."""
        self.losses = []
        self.lr = []

    def on_epoch_end(self, batch, logs=None):
        """Append this epoch's loss; ``batch`` receives the epoch index."""
        # `logs=None` avoids a shared mutable default and tolerates callers
        # that pass no logs at all.
        logs = logs or {}
        self.losses.append(logs.get('loss'))

In [None]:
class MyCallback(keras.callbacks.Callback):
    """Keras callback that prints the optimizer's decayed learning rate after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate and print ``lr / (1 + decay * iterations)`` for the model's optimizer."""
        optimizer = self.model.optimizer
        # Cast the iteration counter to the decay's dtype so the product is well-typed.
        step_count = K.cast(optimizer.iterations, K.dtype(optimizer.decay))
        effective_lr = optimizer.lr / (1. + optimizer.decay * step_count)
        print(K.eval(effective_lr))

In [None]:
# Build and compile the network defined by baseline_model().
model = baseline_model()

In [None]:
# Training callbacks.
# NOTE(review): none of these objects is passed to model.fit() in this
# notebook, so they currently have no effect - confirm which were intended.
lh = LossHistory()                                            # per-epoch loss recorder
pl = MyCallback()                                             # prints the decayed learning rate
lrate = keras.callbacks.LearningRateScheduler(step_decay)     # fixed step schedule
reduce_lr = ReduceLROnPlateau(                                # halve LR after 100 stagnant epochs
    monitor='val_loss',
    factor=0.5,
    patience=100,
)

In [None]:
# Display the (samples, features) shape of the training predictors.
train_x.shape

In [None]:
# Train for 1000 epochs, reporting loss and metrics on the test split after
# each epoch.
# validation_data is passed as a tuple (x_val, y_val): that is the documented
# form, and some Keras/TensorFlow releases interpret a list as inputs only.
# NOTE(review): the test split doubles as validation data here, so it is not
# an untouched hold-out set.
history = model.fit(x=train_x,
                    y=train_y,
                    validation_data=(test_x, test_y),
                    epochs=1000,
                    verbose=True)

In [None]:
# Per-epoch values recorded during training; indexed below by 'loss' and 'val_loss'.
logs = history.history

In [None]:
# Training vs. validation loss per epoch, with the y-axis clipped to [0, 1000].
for curve in ('loss', 'val_loss'):
    plt.plot(logs[curve])
plt.ylim([0, 1e3])
plt.legend(['loss', 'val_loss'])
plt.show()

In [None]:
# Evaluate the compiled loss and metrics on the training split.
model.evaluate(train_x, train_y)

In [None]:
# Evaluate the compiled loss and metrics on the test split.
model.evaluate(test_x, test_y)

In [None]:
# Run the trained network over each split as a single batch (test first, then train).
pred_test_y, pred_train_y = (
    model.predict_on_batch(test_x),
    model.predict_on_batch(train_x),
)

In [None]:
# Mean absolute error of the network on each split, computed by hand.
# The predictions are flattened first: the model emits one column per sample,
# and subtracting a 1-D target from a column vector would broadcast to an
# (n, n) matrix and give a meaningless average. The training error is measured
# against train_y (the target), not train_x (the predictors).
print(np.mean(np.abs(np.ravel(pred_train_y) - train_y)))
print(np.mean(np.abs(np.ravel(pred_test_y) - test_y)))

In [None]:
# Display the (samples, features) shape of the training predictors.
train_x.shape

In [None]:
# Training split: predictions against ground truth, with the identity
# (ground truth vs. itself) overlaid as red crosses for reference.
plt.scatter(x=train_y, y=pred_train_y)
plt.scatter(x=train_y, y=train_y, c='r', marker='x')
plt.xlabel('ground truth')
plt.ylabel('prediction')
plt.show()

In [None]:
# Test split: predictions against ground truth, with the identity
# (ground truth vs. itself) overlaid as red crosses for reference.
plt.scatter(x=test_y, y=pred_test_y)
plt.scatter(x=test_y, y=test_y, c='r', marker='x')
plt.xlabel('ground truth')
plt.ylabel('prediction')
plt.show()

In [None]:
# Display the test-split target values.
test_y