In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

def sort_dataset(dataset_df):
    """Return the rows of ``dataset_df`` ordered by ascending 'p_year'.

    Args:
        dataset_df: DataFrame that contains a 'p_year' column.

    Returns:
        A new, sorted DataFrame; ``dataset_df`` itself is left untouched and
        the original index labels travel with their rows.
    """
    return dataset_df.sort_values('p_year', ascending=True)

def split_dataset(dataset_df, train_size=1718):
    """Split a dataset positionally into train/test features and labels.

    The first ``train_size`` rows (by position, not by index label) become
    the training set and every remaining row becomes the test set. No
    shuffling is done, so the caller controls the split by ordering the rows
    beforehand.

    Args:
        dataset_df: DataFrame that contains a 'salary' column.
        train_size: Number of leading rows assigned to the training set.
            Defaults to 1718, the split point that was previously
            hard-coded, so existing callers behave exactly as before.

    Returns:
        A tuple ``(X_train, X_test, Y_train, Y_test)`` where the ``X_*``
        frames hold every column except 'salary' and the ``Y_*`` series hold
        the 'salary' values multiplied by 0.001.
    """
    # Labels are the salary column rescaled by a factor of 0.001.
    labels = dataset_df['salary'] * 0.001

    train = dataset_df.iloc[:train_size, :]
    test = dataset_df.iloc[train_size:, :]
    Y_train = labels.iloc[:train_size]
    Y_test = labels.iloc[train_size:]

    # The target column must not leak into the feature matrices.
    X_train = train.drop('salary', axis=1)
    X_test = test.drop('salary', axis=1)
    return X_train, X_test, Y_train, Y_test

def extract_numerical_cols(dataset_df):
    """Return only the fixed set of feature columns used for modelling.

    Args:
        dataset_df: DataFrame that contains every column listed below.

    Returns:
        A DataFrame restricted to those columns, in the listed order. The
        dtypes are not checked here; the columns are simply selected by name.
    """
    selected = [
        'age', 'G', 'PA', 'AB', 'R', 'H',
        '2B', '3B', 'HR', 'RBI', 'SB', 'CS',
        'BB', 'HBP', 'SO', 'GDP', 'fly', 'war',
    ]
    return dataset_df[selected]

def train_predict_decision_tree(X_train, Y_train, X_test, random_state=None):
    """Fit a decision-tree regressor and predict targets for ``X_test``.

    Args:
        X_train: Training feature matrix.
        Y_train: Training target values aligned with ``X_train``.
        X_test: Feature matrix to predict for.
        random_state: Seed forwarded to ``DecisionTreeRegressor``. ``None``
            (the default) keeps the original unseeded behaviour; pass an int
            to make the fitted tree, and therefore the predictions,
            reproducible across runs.

    Returns:
        The predictions for ``X_test`` as returned by ``model.predict``.
    """
    model = DecisionTreeRegressor(random_state=random_state)
    model.fit(X_train, Y_train)
    return model.predict(X_test)

def train_predict_random_forest(X_train, Y_train, X_test, random_state=None):
    """Fit a random-forest regressor and predict targets for ``X_test``.

    Args:
        X_train: Training feature matrix.
        Y_train: Training target values aligned with ``X_train``.
        X_test: Feature matrix to predict for.
        random_state: Seed forwarded to ``RandomForestRegressor``. ``None``
            (the default) keeps the original unseeded behaviour; pass an int
            to make the fitted forest, and therefore the predictions,
            reproducible across runs.

    Returns:
        The predictions for ``X_test`` as returned by ``rfmodel.predict``.
    """
    rfmodel = RandomForestRegressor(random_state=random_state)
    rfmodel.fit(X_train, Y_train)
    return rfmodel.predict(X_test)

def train_predict_svm(X_train, Y_train, X_test):
    """Fit a standardise-then-SVR pipeline and predict targets for ``X_test``.

    Args:
        X_train: Training feature matrix.
        Y_train: Training target values aligned with ``X_train``.
        X_test: Feature matrix to predict for.

    Returns:
        The pipeline's predictions for ``X_test``.
    """
    # The scaler is fitted inside the pipeline, so it only ever sees the
    # training data and the same transform is reused at prediction time.
    steps = [('scaler', StandardScaler()), ('svm', SVR())]
    fitted = Pipeline(steps).fit(X_train, Y_train)
    return fitted.predict(X_test)

def calculate_RMSE(labels, predictions):
    """Return the root mean squared error between labels and predictions.

    Args:
        labels: Ground-truth target values.
        predictions: Predicted values aligned with ``labels``.

    Returns:
        The square root of ``mean_squared_error(labels, predictions)``.
    """
    mse = mean_squared_error(labels, predictions)
    return np.sqrt(mse)

# Script entry point: load the data, build the train/test split, fit the three
# regressors and print each model's RMSE on the test rows.
if __name__=='__main__':
    # DO NOT MODIFY THIS FUNCTION UNLESS PATH TO THE CSV MUST BE CHANGED.
    # The CSV path is relative to the current working directory.
    data_df = pd.read_csv('2019_kbo_for_kaggle_v2.csv')
    
    # Sort before splitting: split_dataset slices by row position, so the row
    # order decides which rows end up in the training and test sets.
    sorted_df = sort_dataset(data_df)    
    X_train, X_test, Y_train, Y_test = split_dataset(sorted_df)
    
    # Restrict both feature frames to the same fixed list of columns.
    X_train = extract_numerical_cols(X_train)
    X_test = extract_numerical_cols(X_test)

    # Each helper fits a fresh model on the training data and returns its
    # predictions for X_test.
    dt_predictions = train_predict_decision_tree(X_train, Y_train, X_test)
    rf_predictions = train_predict_random_forest(X_train, Y_train, X_test)
    svm_predictions = train_predict_svm(X_train, Y_train, X_test)
    
    # RMSE is reported in the units of the labels returned by split_dataset.
    print("Decision Tree Test RMSE: ", calculate_RMSE(Y_test, dt_predictions))    
    print("Random Forest Test RMSE: ", calculate_RMSE(Y_test, rf_predictions))    
    print("SVM Test RMSE: ", calculate_RMSE(Y_test, svm_predictions))

Decision Tree Test RMSE:  31.16767000990215
Random Forest Test RMSE:  23.3533507844449
SVM Test RMSE:  32.3804844983029
