In [102]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [103]:
# Columns (see the header printed by this cell): Min, Ast and PK are the
# features; Gls is the label (linear_regression treats the last column as the
# target). Min = minutes played, Gls = goals scored; Ast / PK are presumably
# assists and penalty kicks — TODO confirm against the data source.
df = pd.read_csv('train_alternative_features_football_players_stats.csv')
print(df.head())

      Min  Ast   PK  Gls
0  2123.0  2.0  2.0  5.0
1    90.0  0.0  0.0  0.0
2   141.0  0.0  0.0  0.0
3  1833.0  5.0  0.0  1.0
4   212.0  0.0  0.0  1.0


In [104]:
# Prepare the training dataset: convert the DataFrame into a NumPy array
# (one array row per DataFrame row, columns in the same order as df), which
# is the form linear_regression() consumes.
dataset = np.array(df)
print(dataset)

[[2.123e+03 2.000e+00 2.000e+00 5.000e+00]
 [9.000e+01 0.000e+00 0.000e+00 0.000e+00]
 [1.410e+02 0.000e+00 0.000e+00 0.000e+00]
 [1.833e+03 5.000e+00 0.000e+00 1.000e+00]
 [2.120e+02 0.000e+00 0.000e+00 1.000e+00]
 [8.110e+02 0.000e+00 0.000e+00 0.000e+00]
 [1.145e+03 1.000e+00 0.000e+00 4.000e+00]
 [2.396e+03 1.000e+00 0.000e+00 0.000e+00]
 [9.040e+02 0.000e+00 0.000e+00 0.000e+00]
 [3.062e+03 1.000e+00 0.000e+00 5.000e+00]
 [3.196e+03 2.000e+00 0.000e+00 2.000e+00]
 [4.830e+02 0.000e+00 0.000e+00 1.000e+00]
 [2.494e+03 0.000e+00 0.000e+00 0.000e+00]
 [2.597e+03 5.000e+00 0.000e+00 2.000e+00]
 [2.804e+03 5.000e+00 0.000e+00 5.000e+00]
 [1.645e+03 2.000e+00 0.000e+00 1.000e+00]
 [2.105e+03 3.000e+00 0.000e+00 1.000e+00]
 [1.920e+03 1.000e+00 0.000e+00 1.000e+00]
 [2.785e+03 4.000e+00 0.000e+00 7.000e+00]
 [1.912e+03 1.000e+00 0.000e+00 0.000e+00]
 [5.600e+02 0.000e+00 0.000e+00 1.000e+00]
 [1.434e+03 0.000e+00 0.000e+00 2.000e+00]
 [2.272e+03 2.000e+00 0.000e+00 2.000e+00]
 [6.890e+02

In [105]:
def linear_regression(dataset: np.ndarray) -> np.ndarray:
    """Fit ordinary-least-squares coefficients, including an intercept.

    Every column of ``dataset`` except the last is treated as a feature and
    the last column as the target. A column of ones is prepended to the
    features, so the first returned coefficient is the intercept.

    Args:
        dataset: 2-D array-like of shape ``(n_samples, n_features + 1)``
            holding numeric values.

    Returns:
        1-D array ``[intercept, w_1, ..., w_n]`` with one entry per column
        of ``dataset``.

    Raises:
        ValueError: if ``dataset`` is not 2-D or contains no elements.
    """
    data = np.asarray(dataset, dtype=float)
    if data.ndim != 2 or data.size == 0:
        raise ValueError(
            "dataset must be a non-empty 2-D array of shape "
            "(n_samples, n_features + 1)"
        )

    # Design matrix: leading column of ones (intercept) followed by the features.
    X = np.column_stack((np.ones(data.shape[0]), data[:, :-1]))
    Y = data[:, -1]

    # Solve the least-squares problem directly instead of forming and
    # inverting X.T @ X: explicitly inverting the normal-equations matrix is
    # numerically fragile and fails on collinear features, whereas lstsq
    # returns the minimum-norm solution in that case.
    beta = np.linalg.lstsq(X, Y, rcond=None)[0]
    return beta

In [106]:
def predict(beta: np.array, x: np.array) -> float:
    """Evaluate the fitted linear model on ``x``.

    Returns the matrix product ``x @ beta``. The callers in this notebook
    pass ``x`` with a leading 1 so that ``beta[0]`` acts as the intercept.
    A 1-D ``x`` yields a scalar; a 2-D ``x`` (one sample per row) yields one
    prediction per row.
    """
    return x @ beta

In [107]:
# Fit the regression coefficients on the training set. beta[0] is the
# intercept; the remaining entries are the weights of the feature columns,
# in the same order as they appear in dataset.
beta = linear_regression(dataset)

In [108]:
# Load the test dataset and convert it to a NumPy array, mirroring the
# training cells.
# NOTE(review): assumed to share the training CSV's column order (features
# first, label last) — confirm.
df_test = pd.read_csv('test_alternative_features_football_players_stats.csv')
test_dataset = np.array(df_test)
print(test_dataset)

[[1.498e+03 5.000e+00 4.000e+00 1.100e+01]
 [5.360e+02 1.000e+00 0.000e+00 1.000e+00]
 [2.340e+03 0.000e+00 0.000e+00 0.000e+00]
 ...
 [2.220e+02 0.000e+00 0.000e+00 0.000e+00]
 [3.800e+01 0.000e+00 0.000e+00 0.000e+00]
 [2.915e+03 3.000e+00 4.000e+00 1.100e+01]]


In [109]:
# Prepare the test dataset: every column but the last is a feature and the
# last column is the label. Array slicing replaces the per-row Python loops.
X_test = test_dataset[:, :-1]
Y_test = test_dataset[:, -1].copy()  # copy so Y_test does not alias test_dataset
# A leading column of ones lines the features up with the intercept in beta[0].
intercept = np.ones((X_test.shape[0], 1))
X_test = np.hstack((intercept, X_test))
print(X_test)

# Make predictions on the test dataset with one matrix-vector product
# (x @ beta on a 2-D x returns one prediction per row).
predictions = predict(beta, X_test)
print(predictions)

# Calculate evaluation metrics (the sklearn.metrics helpers are imported in
# the notebook's import cell).
mae = mean_absolute_error(Y_test, predictions)
mse = mean_squared_error(Y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, predictions)

# Print the evaluation metrics
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r2}")

[[1.000e+00 1.498e+03 5.000e+00 4.000e+00]
 [1.000e+00 5.360e+02 1.000e+00 0.000e+00]
 [1.000e+00 2.340e+03 0.000e+00 0.000e+00]
 ...
 [1.000e+00 2.220e+02 0.000e+00 0.000e+00]
 [1.000e+00 3.800e+01 0.000e+00 0.000e+00]
 [1.000e+00 2.915e+03 3.000e+00 4.000e+00]]
[ 1.15900444e+01  9.13623016e-01  1.39554600e+00  5.86212956e+00
  2.10884305e+00 -1.21831355e-01  1.28612259e+00  8.23258681e+00
 -2.87546831e-01 -4.97811481e-02  1.98861210e+00  8.39308478e+00
  1.13256275e+00  9.41629699e-01  1.83586566e+00  4.12724804e+00
  4.17696268e+00 -2.82503316e-01  7.11069037e-01 -1.50750947e-03
  4.62495823e-01  3.57957357e+00 -3.10480943e-02  1.50083220e+00
  7.75967056e+00  1.91430749e+00  2.84745057e+00 -2.32788674e-01
  1.00744688e+01 -6.92347040e-02  1.42085647e+00  1.66006316e+00
 -1.05259807e-01  3.23438924e-01  1.72255443e-02  7.65920092e-01
 -4.68991398e-02  3.65948546e-01  2.95129577e+00  3.26255266e+00
 -3.10480943e-02  3.21869493e+00  8.27912326e-02  5.52558582e-01
  6.82248954e-01  1.2

In [110]:
# Predict for one hand-built feature row: the leading 1 is the intercept
# term, followed by values in the training column order (Min, Ast, PK).
sample_row = np.array((1, 1782, 7, 2))
print(predict(beta, sample_row))

10.074468791628345
