In [1]:
from vars import *
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import re
import numpy as np

## Loading the data

In [2]:
# Read the training and test splits from the local datasets/ directory.
train_csv, test_csv = 'datasets/train.csv', 'datasets/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

### Feature Engineering

Jeff has added the team name and opposing team name to the dataset. We're also interested in compiling information on a given player's salary, average points, max/min points, and point variance (the code below stores the sample standard deviation in `VAR_FPTS`).

In [4]:
# Per-player summaries of the response ('MISC FPTS'): mean, min, max and spread.
# NOTE: VAR_FPTS holds the sample standard deviation ('std'), not the variance;
# it is NaN for a player who has only a single row.
fpts_stat_aggs = {
    'AVG_FPTS': 'mean',
    'MIN_FPTS': 'min',
    'MAX_FPTS': 'max',
    'VAR_FPTS': 'std',
}

# Build the statistics from the training rows only. Aggregating the test set's
# own 'MISC FPTS' would leak the response into the test features.
player_stats = (
    train.groupby('PLAYER')['MISC FPTS']
    .agg(**fpts_stat_aggs)
    .reset_index()
)

# Drop stat columns left behind by a previous run of this cell so that
# re-running it does not create suffixed duplicates (AVG_FPTS_x / AVG_FPTS_y),
# then attach the training-derived statistics to both frames. Players that do
# not appear in the training data get NaN statistics in the test frame.
stat_columns = list(fpts_stat_aggs)
train = train.drop(columns=stat_columns, errors='ignore').merge(
    player_stats, on='PLAYER', how='left'
)
test = test.drop(columns=stat_columns, errors='ignore').merge(
    player_stats, on='PLAYER', how='left'
)

We will split the data into quarterbacks and all other players (running backs, tight ends, and wide receivers).

In [9]:
# Keep only the quarterback rows, then discard the columns listed below.
# NOTE(review): `data` is not assigned in any cell visible in this notebook --
# confirm where it is defined before running on a fresh kernel.
qb_unused_columns = [
    'POS RANK', 'POS', 'MISC G', 'MISC FL', 'MISC ROST', 'MISC FPTS/G',
    'RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
    'RECEIVING LG', 'RECEIVING 20+', 'RECEIVING TD',
    'RUSHING Y/A', 'RUSHING LG', 'RUSHING 20+',
    'DATE', 'YEAR', 'WEIGHT',
]
qb_rows = data.loc[data['POS'] == 'qb']
qb_data = qb_rows.drop(columns=qb_unused_columns)

We'll want to store our response variable (fantasy points) separately.

In [10]:
# Separate the response (fantasy points) from the feature frame.
response_col = 'MISC FPTS'
qb_y = qb_data.loc[:, response_col]
qb_data = qb_data.drop(columns=response_col)

In [11]:
# Display the feature columns that remain after the drops above.
qb_data.columns

Index(['PLAYER', 'PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS',
       'PASSING Y/A', 'PASSING TD', 'PASSING INT', 'PASSING SACKS',
       'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'WEEK', 'TEAM', 'OPP',
       'AVG_FPTS', 'MIN_FPTS', 'MAX_FPTS', 'VAR_FPTS'],
      dtype='object')

### Standard Scaling

Before scaling, one-hot encode the categorical variables (team, opponent, and player).

In [14]:
# Expand each categorical column into one indicator column per category.
categorical_columns = ['TEAM', 'OPP', 'PLAYER']
qb_data = pd.get_dummies(qb_data, columns=categorical_columns)

{}

In [None]:
scaler = StandardScaler()
# TODO: change this to fit on the train data only, then reuse the fitted scaler
# on the test data with scaler.transform(test data).
# fit_transform returns a new array and leaves qb_data unchanged, so the result
# has to be kept; it is wrapped in a DataFrame to preserve column names and index.
qb_scaled = pd.DataFrame(
    scaler.fit_transform(qb_data),
    columns=qb_data.columns,
    index=qb_data.index,
)
qb_scaled.head()

Principal component analysis (PCA), keeping enough components to explain 95% of the variance:

In [None]:
# PCA keeping as many components as needed to explain 95% of the variance.
# PCA is sensitive to feature scale, so it is fitted on the output of the
# scaler fitted above rather than on the raw qb_data.
# NOTE(review): PCA does not accept NaN; VAR_FPTS may contain NaN for players
# with a single row -- confirm and impute/drop before fitting if so.
pca = PCA(n_components=0.95)
pca.fit(scaler.transform(qb_data))


In [None]:
# Draft, currently disabled: column selection for the non-quarterback positions.
# other_data = data[data['POS'] != 'qb']
# other_data = other_data.drop(columns=[
#     'POS RANK', 'MISC G', 'MISC FL', 'MISC ROST', 'MISC FPTS/G', 'PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS', 
#     'PASSING Y/A', 'PASSING TD', 'PASSING INT', 'PASSING SACKS', 'DATE', 'YEAR', 'WEIGHT'
# ])


# # Define the list of variables to predict :)
# var_list = ['PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS', 'PASSING Y/A', 'PASSING TD', 'PASSING INT',
#         'PASSING SACKS', 'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK']

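# NOTE(review): the 'wr' and 'te' checks below are indented inside this 'rb'
# branch, so they could never match if this draft were re-enabled as written.
# if(pos == 'rb'):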
#     # Define the list of variables to drop and predict
#     data = data.drop(columns=['POS RANK', 'POS', 'MISC G', 'MISC ROST', 'MISC FPTS/G', 'PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS', 
#     'PASSING Y/A', 'PASSING TD', 'PASSING INT', 'RECEIVING LG', 'RECEIVING 20+',
#     'PASSING SACKS', 'YEAR', 'WEIGHT', 'DATE'])
#     var_list = ['RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
#     'RECEIVING TD', 'RUSHING Y/A', 'RUSHING LG',
#     'RUSHING 20+', 'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK']
#     if(pos == 'wr'):
#         # Define the list of variables to drop and predict
#         data = data.drop(columns=['POS RANK', 'POS', 'MISC G', 'MISC ROST', 'MISC FPTS/G', 'PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS', 
#         'PASSING Y/A', 'PASSING TD', 'PASSING INT',
#         'PASSING SACKS', 'YEAR', 'WEIGHT', 'DATE', 'RUSHING Y/A', 'RUSHING LG', 'RUSHING 20+'])
#         var_list = ['RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
#         'RECEIVING TD', 'RECEIVING LG', 'RECEIVING 20+',
#         'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK']
#     if(pos == 'te'):
#         # Define the list of variables to drop and predict
#         data = data.drop(columns=['POS RANK', 'POS', 'MISC G', 'MISC ROST', 'MISC FPTS/G', 'PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS', 
#         'PASSING Y/A', 'PASSING TD', 'PASSING INT',
#         'PASSING SACKS', 'YEAR', 'WEIGHT', 'DATE', 'RUSHING Y/A', 'RUSHING LG', 'RUSHING 20+'])
#         var_list = ['RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
#         'RECEIVING TD', 'RECEIVING LG', 'RECEIVING 20+',
#         'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK']