In [1]:
import numpy
import pandas
from sklearn.model_selection import KFold
# Compact array display: three decimals, no scientific notation for small values.
numpy.set_printoptions(suppress=True, precision=3)

In [2]:
# Read the data; the CSV has no header row, so columns get integer labels.
train_data = pandas.read_csv("Features_Variant_1.csv", header=None)

# Every column except the last is a feature; the last column is the target.
X = train_data.iloc[:, :-1]
Y = train_data.iloc[:, -1]

# Preview the raw table. This has to be the cell's last expression: in the
# middle of the cell its result is silently discarded and nothing is shown.
train_data.head()

In [3]:
# Per-feature statistics used for standardisation.
# NOTE(review): these are computed over every row, i.e. before the K-fold split
# further down, so held-out folds contribute to the scaling — confirm intended.
X_mean = X.mean()

# A constant feature has zero spread; use 1 instead so the division is safe.
X_std = X.std().replace(0, 1)

# Centre and scale every feature so that large raw values do not destabilise
# the model.
X = (X - X_mean) / X_std

In [4]:
# Phantom feature: a leading column of ones, so that the first weight plays the
# role of the constant (intercept) term and needs no special handling.
# The guard keeps this cell re-runnable: DataFrame.insert works in place and
# raises ValueError when the column label already exists.
if -1 not in X.columns:
    X.insert(loc=0, value=1, column=-1)
# Tip: if you use PyCharm, you can Right Click + View as Data Frame
# in the Jupyter window to interact with this data

In [5]:
# From here on X and Y are plain NumPy arrays rather than pandas objects; the
# names are reused on purpose to keep the code below short.
# numpy.asarray (instead of `.values`) keeps the cell re-runnable: an ndarray
# is passed through unchanged, whereas it has no `.values` attribute.
X = numpy.asarray(X)
Y = numpy.asarray(Y)

In [12]:
# Tunable settings — change these if you need to
etha = 0.05                 # gradient-descent step size
number_of_epochs = 20_000   # upper bound on gradient-descent iterations
number_of_splits = 5        # number of cross-validation folds

In [13]:
def mean_square_error(Y, previous_Y):
    """Return the mean squared difference between ``Y`` and ``previous_Y``.

    The element-wise differences are squared, summed, and divided by ``len(Y)``.
    """
    squared_residuals = numpy.square(Y - previous_Y)
    sample_count = len(Y)
    return numpy.sum(squared_residuals) / sample_count

def sqrt_mean_square_error(Y, previous_Y):
    """Return the square root of ``mean_square_error(Y, previous_Y)``."""
    mse = mean_square_error(Y, previous_Y)
    return numpy.sqrt(mse)

# Also known as R^2
def coefficient_of_determination(Y, previous_Y):
    """Return ``1 - SS_res / SS_tot`` for ``Y`` against ``previous_Y``.

    ``SS_res`` is the sum of squared differences between ``Y`` and
    ``previous_Y``; ``SS_tot`` is the sum of squared deviations of ``Y`` from
    its own mean.
    """
    residual_sum_of_squares = numpy.sum(numpy.square(Y - previous_Y))
    total_sum_of_squares = numpy.sum(numpy.square(Y - Y.mean()))
    return 1 - residual_sum_of_squares / total_sum_of_squares

def gradient_descent(X, Y, w, learning_rate=None, max_epochs=None, tolerance=1e-4):
    """Fit linear-regression weights by batch gradient descent on the MSE.

    Parameters
    ----------
    X : design matrix; ``numpy.dot(X, w)`` must give one prediction per entry
        of ``Y``.
    Y : target values.
    w : initial weight vector. It is not modified; every update builds a new
        array.
    learning_rate : step size. ``None`` (default) uses the notebook-level
        ``etha``.
    max_epochs : cap on the number of updates. ``None`` (default) uses the
        notebook-level ``number_of_epochs``.
    tolerance : iteration stops once the Euclidean norm of a weight update is
        no larger than this value.

    Returns
    -------
    (w, mean_square_error_values) : the final weights and the training MSE
    recorded after every update.
    """
    # Fall back to the notebook configuration so that existing three-argument
    # calls behave exactly as before.
    if learning_rate is None:
        learning_rate = etha
    if max_epochs is None:
        max_epochs = number_of_epochs

    step = 2 * learning_rate / len(Y)
    mean_square_error_values = []
    delta_w = numpy.inf
    iteration_index = 0

    # Predictions for the current weights. They are carried over from one
    # iteration to the next, so X·w is evaluated once per epoch instead of twice.
    predictions = numpy.dot(X, w)

    while delta_w > tolerance and iteration_index < max_epochs:
        # The MSE gradient w.r.t. w is -(2/n) * X^T (Y - Xw); step against it.
        w_next = w + step * numpy.dot(X.T, Y - predictions)
        predictions = numpy.dot(X, w_next)

        iteration_index += 1

        mean_square_error_values.append(mean_square_error(Y, predictions))
        if iteration_index % 500 == 0:
            print(f"Epoch {iteration_index} done.")

        delta_w = numpy.linalg.norm(w - w_next)
        w = w_next

    return w, mean_square_error_values

In [None]:
# Per-fold results: MSE history, learned weights, and train/test metrics.
errors = [[] for _ in range(number_of_splits)]
weights = [[] for _ in range(number_of_splits)]
rmse_train = [0] * number_of_splits
rmse_test = [0] * number_of_splits
r2s_train = [0] * number_of_splits
r2s_test = [0] * number_of_splits

# A fixed random_state makes the shuffled folds identical on every run.
kf = KFold(n_splits=number_of_splits, shuffle=True, random_state=42)

for split_option_index, (train_indices, test_indices) in enumerate(kf.split(X)):
    print(f"Running split option {split_option_index}.")
    X_train, X_test = X[train_indices], X[test_indices]
    Y_train, Y_test = Y[train_indices], Y[test_indices]

    # Start from all-zero weights rather than a random draw, so that results
    # are reproducible.
    fold_weights, fold_errors = gradient_descent(X_train, Y_train, numpy.zeros(X.shape[1]))
    weights[split_option_index] = fold_weights
    errors[split_option_index] = fold_errors

    train_preds = numpy.dot(X_train, fold_weights)
    # Predict on the held-out rows only. numpy.dot takes exactly two operands;
    # a third positional argument is its `out=` buffer, not another factor.
    test_preds = numpy.dot(X_test, fold_weights)

    rmse_train[split_option_index] = sqrt_mean_square_error(Y_train, train_preds)
    rmse_test[split_option_index] = sqrt_mean_square_error(Y_test, test_preds)

    r2s_train[split_option_index] = coefficient_of_determination(Y_train, train_preds)
    r2s_test[split_option_index] = coefficient_of_determination(Y_test, test_preds)

Running split option 0.
Epoch 500 done.
Epoch 1000 done.
Epoch 1500 done.
Epoch 2000 done.
Epoch 2500 done.
Epoch 3000 done.
Epoch 3500 done.
Epoch 4000 done.


In [None]:
%pylab inline
for error_by_iter in errors:
  plot(range(len(error_by_iter)), error_by_iter)
  xlabel('Iteration')
  ylabel('mean_square_error')

In [None]:
# Column layout of the results table: a label column, one column per split,
# then the aggregates across splits.
columns = ["Names", *(f"T{i + 1}" for i in range(number_of_splits)), "Mean", "Std"]

# Row labels: the four quality metrics first, then one row per learned weight.
rows = [
    "sqrt_mean_square_error train",
    "sqrt_mean_square_error test",
    "coefficient_of_determination train",
    "coefficient_of_determination test",
]

# The first weight multiplies the constant column inserted at position 0 of X,
# i.e. it is the intercept. Label it as such and number the real features from
# 1, so that "Feature k" refers to the k-th feature column of the dataset.
weight_count = len(weights[0])
weight_labels = ["Bias"] + [f"Feature {i}" for i in range(1, weight_count)]
rows += weight_labels[:weight_count]

In [None]:
# One column per split: the four metrics followed by that split's weights,
# in the same order as the labels in `rows`.
fold_columns = [f"T{i + 1}" for i in range(number_of_splits)]
result_dataframe = pandas.DataFrame(
    {
        fold_column: numpy.concatenate(
            ([rmse_train[i], rmse_test[i], r2s_train[i], r2s_test[i]], weights[i])
        )
        for i, fold_column in enumerate(fold_columns)
    },
    index=pandas.Index(rows, name="Names"),
)

# Aggregate over a snapshot of the split columns only. Computing the standard
# deviation on the frame after "Mean" has been added would treat the mean as
# one more sample and distort the result.
fold_values = result_dataframe[fold_columns]
result_dataframe["Mean"] = fold_values.mean(axis=1)
result_dataframe["Std"] = fold_values.std(axis=1)

In [None]:
# Display the per-split metrics and weights together with their Mean and Std columns.
result_dataframe