# TI3145TU Midterm Assignment 
## Football Players Wages

We hope you enjoy this assignment, good luck!

Student names: XXX

Student numbers: XXX

### Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor

import matplotlib.pyplot as plt

### Load data

In [2]:
# These are your training samples along with their labels
data = pd.read_csv('football_wages.csv')
data.head()

# You need to extract the features and the regression target. The regression target is 'log_wages'. 

Unnamed: 0,age,height_cm,weight_kg,nationality_name,overall,potential,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,...,movement_reactions,movement_balance,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,log_wages
0,27.0,183.0,76.0,b'Korea Republic',57.0,58.0,54.0,30.0,55.0,53.0,...,60.0,67.0,63.0,58.0,9.0,13.0,8.0,11.0,10.0,3.0
1,21.0,182.0,70.0,b'France',61.0,72.0,58.0,63.0,46.0,62.0,...,47.0,65.0,31.0,33.0,9.0,11.0,9.0,12.0,11.0,3.0
2,35.0,182.0,75.0,b'Korea Republic',68.0,68.0,62.0,68.0,68.0,70.0,...,61.0,69.0,36.0,40.0,8.0,12.0,7.0,12.0,6.0,3.30103
3,29.0,169.0,70.0,b'Paraguay',67.0,67.0,62.0,55.0,50.0,71.0,...,59.0,84.0,40.0,55.0,6.0,10.0,11.0,15.0,9.0,2.69897
4,30.0,176.0,74.0,b'Austria',65.0,65.0,63.0,49.0,53.0,63.0,...,58.0,75.0,65.0,64.0,12.0,15.0,10.0,8.0,10.0,3.477121


In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer, Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

def MAE( pred, true ):
    return np.mean( np.abs( pred - true ) )

random_state_seed = 42

X = data.drop(columns=['log_wages'])
y = data['log_wages']

#print(f"X: {X.shape}")
#print(f"y: {y.shape}")

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=random_state_seed )

baseline_MAE_train = MAE( y_train, y_train.mean() )
baseline_MAE_test = MAE( y_test, y_train.mean() )

print( f"If our regressor simply assigned the mean of our data to samples our MAE would be:" )
print( f"Train: {baseline_MAE_train}" )
print( f"Test: {baseline_MAE_test}" )

If our regressor simply assigned the mean of our data to samples our MAE would be:
Train: 0.4912703291303276
Test: 0.49086004658113885


In [4]:
knregressor = KNeighborsRegressor()
sgdregressor = SGDRegressor( random_state=random_state_seed )

pipelines = {
    "SGD": [None, None],
    "KN": [None, None]
}

### Pipeline 1

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier


X = data.drop("log_wages", axis=1)
y = data["log_wages"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# beginning of the pipeline
# categorical columns
cat_cols = X.select_dtypes("object").columns

# numerical columns
num_cols = X.select_dtypes(exclude=["object"]).columns


# impute the missing values with the most frequent ones
# for categorical values use one hot encoding
# handle_unknown => if the encoder sees a new category during transformation, leave 0 for all values for that item
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
cat_pipeline = Pipeline([("cat_imputation", imp), ("cat_encoding", one_hot_encoder)])


# imput the missing values with the mean values
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
standard_scaler = StandardScaler()
num_pipeline = Pipeline([("num_imputation", imp), ("num_scaling", standard_scaler)])

preprocess = ColumnTransformer([
    ("cat_pipeline", cat_pipeline, cat_cols), ("num_pipeline", num_pipeline, num_cols)
    ])

preprocess

pipeline_1 = Pipeline([("preprocess", preprocess), ("model", KNeighborsClassifier())
                       ])

pipeline_1



### Pipeline 2

In [5]:
def get_preprocessed_pipeline( data, model ):
    oh_encoder = OneHotEncoder( handle_unknown="ignore", sparse_output=False )
    scaler = StandardScaler()
    pca = PCA( svd_solver="full", n_components=0.95, random_state=random_state_seed )
    
    def bmi( df ):
        df["bmi"] = df["weight_kg"]/((df["height_cm"]/100)**2 )
        df.drop(columns=["weight_kg","height_cm"])
        return df
        
    bmi_transformer = FunctionTransformer( bmi )
    
    categorical_columns = data.select_dtypes(include=["object", "category"])
    numerical_columns = data.select_dtypes(exclude=["object", "category"])
    
    preprocessor = ColumnTransformer( transformers = [
            ( "categorical", oh_encoder, categorical_columns.columns ),
            ( "numerical", scaler, numerical_columns.columns ),
            ( "bmi", bmi_transformer, ["height_cm","weight_kg"] ),
        ],
        remainder="passthrough" )
    pipeline = Pipeline(steps=[( "preprocessing", preprocessor ), ( "pca", pca ), ( "training", model )])
    return pipeline

In [6]:
def train_model_pipeline( pipeline_fun, model, X, y ):
    pipeline = pipeline_fun( X, model )
    pipeline.fit( X, y )
    y_pred = pipeline.predict( X )
    return pipeline, MAE( y_pred, y )

def evaluate_pipeline( pipeline, X, y ):
    y_pred = pipeline.predict( X )
    return MAE( y_pred, y )

SGD_pipeline, SGD_MAE_train = train_model_pipeline( pipeline_fun=get_preprocessed_pipeline, model=sgdregressor, X=X_train, y=y_train )
KN_pipeline, KN_MAE_train = train_model_pipeline( pipeline_fun=get_preprocessed_pipeline, model=knregressor, X=X_train, y=y_train )

SGD_MAE_test = evaluate_pipeline( pipeline=SGD_pipeline, X=X_test, y=y_test )
KN_MAE_test = evaluate_pipeline( pipeline=KN_pipeline, X=X_test, y=y_test )

print( f"Our SGD regressor has a MAE of:" )
print( f"Train: {SGD_MAE_train:.6f}" )
print( f"Test: {SGD_MAE_test:.6f}\n" )

print( f"Our KNeighbors regressor has a MAE of:" )
print( f"Train: {KN_MAE_train:.6f}" )
print( f"Test: {KN_MAE_test:.6f}\n" )

Our SGD regressor has a MAE of:
Train: 0.2979
Test: 0.3115

Our KNeighbors regressor has a MAE of:
Train: 0.2598
Test: 0.3371



### GridSearch

In [13]:
from sklearn.model_selection import GridSearchCV

In [None]:
def SGD_Gridsearch(X_train, y_train):
    import warnings
    warnings.filterwarnings('ignore')
    
    #losses = [ 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive' ]
    #penalties = [ 'l2', 'l1', 'elasticnet', None ]
    #alphas = [ j*10**(i-10) for i in range(15) for j in range(1,10) ]
    #l1_ratios = [ 0.1*i for i in range(11) ] # Only for elasticnet
    # Fit intercept not tested for as we do standard scaling to remove the mean
    # max_iter left standard, no sense in increasing epochs as the model can only converge more
    # tol, much like max_iter this has no effect on model performance. It only stops training when some cut-off is reach which could be destructcive in grid_search
    #shuffle = True # Shuffling should only make our model more robust
    # verbosity unrelated to model
    #epsilons = [ j*10**(i-4) for i in range(5) for j in range(1,10)  ] # Given squared loss and our MAE observations the loss should be in the order of 10^-k for some positive k. Our prediction is that 0<k<4
    #random_state = random_state_seed # For reproducability
    #learning_rates = [ 'constant', 'optimal', 'invscaling', 'adaptive' ]
    #eta0s = [ j*10**(i-6) for i in range(7) for j in range(1,10) ]
    # We leave power_t as the default since we do not know what values would be more appropriate and cannot search (-inf,inf) in a reasonable time
    # early_stopping = False by default
    # validation_fraction is not necessary to be set without early_stopping
    # n_iter_no_change again unnecessary without early_stopping
    #warm_start=False # While it is false by default we set it again as we definitively do not want this since it would be detrimental to the grid search
    # average left false as is by default

    best_MAE = 10**100
    params = {
        "loss": [ 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive' ], # 4
        "penalty": [ 'l2', 'l1', 'elasticnet', None ], # 4
        "alpha": [ 10**(i-10) for i in range(11) ], # 10
        "l1_ratio": [ 0.1*i for i in range(11) ], # 10
        "epsilon": [ 10**(i-4) for i in range(6) ], # 5
        "learning_rate": [ 'constant', 'optimal', 'invscaling', 'adaptive' ], # 4
        "eta0": [ 10**(i-6) for i in range(7) ], # 7
        "shuffle": [ True ],
        "random_state": [ random_state_seed ],
        "warm_start": [ False ]
    }
    model = GridSearchCV( SGDRegressor(), params )
    _, MAE = train_model_pipeline( get_preprocessed_pipeline, model, X_train, y_train )
    return MAE

print(SGD_Gridsearch(X_train,y_train))    
    

### Autograder 

In the autograder you will need to provide two things: 1) estimate of the MAE of your model on unseen data, 2) the predictions on the autograder data. For the autograder data we only provide the features and not the regression targets. Thus, you cannot compute the MAE on this data yourself - you need to estimate that with the data provided above. 

In [8]:
data_autograder = pd.read_csv('football_autograde.csv')
data_autograder.head()


Unnamed: 0,age,height_cm,weight_kg,nationality_name,overall,potential,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,...,movement_agility,movement_reactions,movement_balance,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,27.0,190.0,78.0,b'England',70.0,71.0,52.0,42.0,75.0,60.0,...,57.0,67.0,61.0,72.0,68.0,15.0,8.0,12.0,13.0,15.0
1,19.0,183.0,76.0,b'Republic of Ireland',59.0,77.0,45.0,20.0,53.0,60.0,...,66.0,55.0,69.0,59.0,57.0,8.0,11.0,10.0,6.0,11.0
2,20.0,172.0,67.0,b'China PR',48.0,53.0,40.0,34.0,38.0,52.0,...,57.0,56.0,70.0,35.0,43.0,12.0,6.0,9.0,7.0,12.0
3,28.0,170.0,65.0,b'Brazil',76.0,76.0,73.0,76.0,52.0,72.0,...,89.0,70.0,88.0,50.0,48.0,12.0,7.0,12.0,10.0,7.0
4,28.0,186.0,74.0,b'England',65.0,67.0,18.0,19.0,15.0,25.0,...,32.0,57.0,57.0,14.0,17.0,66.0,64.0,66.0,63.0,68.0


In [None]:
# TODO Replace this with your own estimate of the MAE of your best model
estimate_MAE_on_new_data = np.array([1.0])

# TODO Replace this with the predictions of your best model
# via e.g. prediction = model.predict(data_autograder)
# your predictions here should again be the $log_{10}(wage)$ of the football player, just as in the provided data. 
predictions_autograder_data = np.array([-1] * 14178)

# Upload this file to the Vocareum autograder:
result = np.append(estimate_MAE_on_new_data, predictions_autograder_data)
pd.DataFrame(result).to_csv("autograder_submission.txt", index=False, header=False)