# Rocket League Notebook 4: Feature Engineering

## Goals 

- Create models using engineered features

## Results

- 

## Imports

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

## Converter

In [16]:
converter = { 'bronze': 1, 'silver': 2, 'gold': 3, 'platinum': 4, 'diamond': 5, 'champion': 6 }

## Read in

In [17]:
matches = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## Formulas

In [18]:
def filter_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lowbound = Q1-1.5*IQR
    highbound = Q3+1.5*IQR
    df_bounded = df[(df[col] >= lowbound) & (df[col] <= highbound)]

    return df_bounded

## Create columns for model

Also include

    - duration
    - percent_supersonic_speed

In [19]:
matches_plus = matches.assign(
            score_per_second = lambda x: x['score']/x['duration'],
            lowvhigh = lambda x: x['percent_low_air']/x['percent_high_air'],
            percent_boost_50_100 = lambda x: x['percent_boost_50_75']+x['percent_boost_75_100'],
            goals_saves_pm = lambda x: (x['goals']+x['saves'])*60/x['duration'],
            save_prop = lambda x: x['saves']/x['shots_against']
    ).fillna(0)

In [20]:
variables = ['duration',
            'percent_supersonic_speed',
            'score_per_second',
            'lowvhigh',
            'percent_boost_50_100',
            'goals_saves_pm',
            'save_prop'
            ]

matches_prepped = matches_plus[['rank']+variables].replace([np.inf, -np.inf], 0)

for var in variables:

    matches_prepped = filter_outliers(matches_prepped, var)

matches_prepped.shape

(54213, 7)

In [21]:
X = matches_prepped[variables]
y = matches_prepped['rank']

X_train, X_test, y_train, y_test =train_test_split(X, y, random_state=42, stratify = y)

pipe = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression())
    ])

param_grid = {'logreg__C':[100, 10, 1.0, 0.1, 0.01]}

gs = GridSearchCV(pipe, param_grid, scoring = 'accuracy', verbose = 2)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ......................................logreg__C=100; total time=   0.7s
[CV] END ......................................logreg__C=100; total time=   0.7s
[CV] END ......................................logreg__C=100; total time=   0.7s
[CV] END ......................................logreg__C=100; total time=   0.9s
[CV] END ......................................logreg__C=100; total time=   0.9s
[CV] END .......................................logreg__C=10; total time=   0.7s
[CV] END .......................................logreg__C=10; total time=   0.7s
[CV] END .......................................logreg__C=10; total time=   0.7s
[CV] END .......................................logreg__C=10; total time=   0.9s
[CV] END .......................................logreg__C=10; total time=   1.1s
[CV] END ......................................logreg__C=1.0; total time=   0.7s
[CV] END ......................................lo

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('logreg', LogisticRegression())]),
             param_grid={'logreg__C': [100, 10, 1.0, 0.1, 0.01]},
             scoring='accuracy', verbose=2)

In [22]:
y_pred = gs.predict(X_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy Score:  0.39272539471742657
Confusion Matrix: 
 [[   0    3    3  128   30   97]
 [   0 1473  734   50  420    1]
 [   0  938  980  295  951   10]
 [   0   95  332 1299  938  144]
 [   0  345  722  918 1384   38]
 [   0   15   56  722  246  187]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report: 
               precision    recall  f1-score   support

      bronze       0.00      0.00      0.00       261
    champion       0.51      0.55      0.53      2678
     diamond       0.35      0.31      0.33      3174
        gold       0.38      0.46      0.42      2808
    platinum       0.35      0.41      0.38      3407
      silver       0.39      0.15      0.22      1226

    accuracy                           0.39     13554
   macro avg       0.33      0.31      0.31     13554
weighted avg       0.38      0.39      0.38     13554



  _warn_prf(average, modifier, msg_start, len(result))
