In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Ridge, LogisticRegression, Lasso, ElasticNet, LinearRegression
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.svm import SVR, SVC
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import json
from joblib import dump, load

### Classes/Pipelines

In [2]:
class FootballPreprocessor(BaseEstimator, TransformerMixin):
    '''
    preprocessor class, dropsmissing, dummies, polynomials, assigns class to target,
    takes target out of dataset, gets coeficient names and makes the train test split
    
        target: str of target column
        to_drop: list of columns to drop
        to_dummy: list of columns to dummy
        test_size: testset as fraction as total dataset
        dropna: bool, True drops rows with np.nan values, default is True
        classify: bool, True uses the assign class function and stratifys the train test split
                  designed for processing data for classifiers
        threshold: int, if y>int return 1. used to create the classes in the target.
        poly: bool, True returns dataset with 2-degree polynomials of original dataset
              the coef_names are adjusted to fit the new number of features.
    '''
    def __init__(self, target, to_drop=None, to_dummy=None, test_size=0.3, dropna=True, classify=False, threshold=None, poly=False, time_split=False,when=77):
        self.to_drop=to_drop
        self.to_dummy=to_dummy
        self.test_size=test_size
        self.target=target
        self.dropna=dropna
        self.classify=classify
        self.threshold=threshold
        self.poly=poly
        self.time_split=time_split
        self.when=when
        
    def fill_missing_teams(self, X):
        '''
        fills missing values with 0
        for teams that cross from train to 
        test set but might not be in all seasons
        '''
        teams=['ARS', 'BOU', 'BHA', 'BUR', 'CHE', 'CRY', 'EVE','NEW',
        'HUD', 'LEI', 'LIV', 'MCI', 'MUN', 'SOU', 'TOT', 'WAT',
       'WHU']
        for team in teams:
            try:
                X[team]=X[team].fillna(value=0)
            except:
                continue
        return X
        
    def dropmissing(self, X):
        '''
        drops missing values, for the designed data set this will
        take out rows without lags. removing the first 2 weeks for
        each player
        '''
        X.dropna(inplace=True)
        return X
        
    def get_target(self, X):
        '''
        pops out target creating a y series
        '''
        y=X.pop(target)
        return X,y
        
    def drop_me(self, X):
        '''
        drops any columns specified
        '''
        droping=self.to_drop
        X=X.drop(droping, axis=1)
        return X
    
    def dummy_me(self, X):
        '''
        dummies any columns specified if position
        or team so doesnt drop first as unkowns 
        and dropout teams will be force
        dropped later
        '''
        if ('position' in self.to_dummy) or ('team' in self.to_dummy):
            dummying = [col for col in self.to_dummy if (col!='position') and (col!='team')]
            X=pd.get_dummies(X,columns=dummying,drop_first=True)
            
            if 'position' in self.to_dummy:
                X=pd.get_dummies(X,columns=['position'])
                
            if 'team' in self.to_dummy:
                X=pd.get_dummies(X,columns=['team'])
        else:
            dummying=self.to_dummy
            X=pd.get_dummies(X,columns=dummying)
        return X
    
    def polyfi(self, X):
        '''
        creates the polynomial of the X features, also gets the coef names
        returns both. no keyword for more than 2 degrees
        '''
        poly=PolynomialFeatures()#2degrees
        coef_names=poly.fit(X).get_feature_names(X.columns)
        coef_names=[col.replace(' ', '_x_') for col in coef_names]
        X=poly.fit_transform(X)
        X=pd.DataFrame(X,columns=coef_names)
        return X
    
    def assign_class(self, row):
        '''
        The .apply function used to classifiy the target
        '''
        if row > self.threshold:
            return 1
        else:
            return 0
        
    def create_class(self, y):
        '''
        function that employs the .apply function to classify y
        '''
        y = y.apply(self.assign_class)
        return y
    
    def get_coef_names(self, X):
        '''
        gets coef names for X features
        '''
        return X.columns
    
    def train_test(self, X, y):
        '''
        makes train test split, if classify == True then uses stratify.
        '''
        if self.classify==True:
            return train_test_split(X,y,test_size=self.test_size,stratify=y,random_state=1)
        else:
            return train_test_split(X,y,test_size=self.test_size)
    
    def make_time_split(self, X, y):
        '''
        default=adj_round 77
        makes the test set the 2018/19 season
        the train set is the 2016/17 and 2017/18 season
        '''
        
        print('X', X.shape)
        
        X_test = X[X['adj_round']>=self.when]
        
        X_train = X[X['adj_round']<self.when]
        
        train_loc = list(X_train.index)
        test_loc = list(X_test.index)
        y_train = y.loc[train_loc]
        y_test = y.loc[test_loc]
        
        X_train=X_train.drop('adj_round', axis=1)
        X_test=X_test.drop('adj_round', axis=1)
        
        print('X_train',X_train.shape)
        print('X_test',X_test.shape)
        return X_train, X_test, y_train, y_test
    
    def drop_unkowns(self, X):
        X = X.drop('position_unkown',axis=1)
        return X
    
    def clear_dummies(self, X):
        drop_dummies=['STO', 'FUL', 'SWA', 'MID', 
        'SUN', 'WOL', 'WBA', 'CAR', 'HUL','unkown']
        to_stay=X.columns
        
        for dum in drop_dummies:
            to_stay=[col for col in to_stay if dum not in col]
        
        to_drop=[col for col in X.columns if col not in to_stay]
        X=X.drop(to_drop,axis=1)
        return X
            
            
    
    def transform(self, X):
        '''
        channels different inputs to the right functions. 
        returns train test split data and the coef_names
        '''
        
        
        
        if self.to_drop != None:
            X=self.drop_me(X)
            
            
        if self.to_dummy != None:
            X=self.dummy_me(X)
            X=self.clear_dummies(X)
            
        X=self.fill_missing_teams(X)
        
        
        if self.dropna==True:
            X=self.dropmissing(X)
            
        
        
       # if 'position' in self.to_dummy:
          #  X = self.drop_unkowns(X)
            
        X,y=self.get_target(X)
        
        if self.poly==True:
            X = self.polyfi(X)
            
        if self.classify == True:
            y=self.create_class(y)
        
        
            
        if self.time_split==False:
            X_train, X_test, y_train, y_test = self.train_test(X,y)
        else:
            X_train, X_test, y_train, y_test = self.make_time_split(X,y)
        
        coef_names=self.get_coef_names(X_train)
            
        return X_train, X_test, y_train, y_test, coef_names
    
    def fit(self, X):
        return self

class ClassifyPlayers(BaseEstimator, TransformerMixin):
    '''
    runs a classification model and any other processes specified by parameters
    
        model: unfitted model object if train set, fitted if test.
        test: bool, True for test set data. if True then no gridsearch or fitting takes place.
              just returns the score of the model on the test set and the predicted probabilites
        params: dict, used in gridsearch, specific to model
        scale_fit: the scale_fit attribute from standardscaler on a fit_transform of the train
                   needed for test set
        mean_fit: the mean_fit attribute from the standardscaler on a fit_transform of the train
                  needed for the test set
        balance: "under", "over", "smote", "adasyn", default None. Specify the imbalance fix method
                 under uses bagging with undersampling boostrapped samples. over takes a random over
                 sampler of the data set and trains the model on that, the score is on the original
                 dataset. smote does the same as over but with the smote algorithm. adasyn does a
                 grid search for the optimal neighbors and a inner gridsearch for the optimal model 
                 for each neighbor, takes a very long time. once the optimal model and neighbors are
                 found the model is fitted on the resampled data. The score is on the original data.
        bag: bool, if True then a bagging classifier is used. User cannot input params for this bag
             without editing code in class. Uses the best estimator from gridsearch as the base
        boost: bool, if True then a adaboost is used. Uses the best etstimator fro the gridsearch
               as the base.
        reduce:float(>0), 0 will perform pca on columns and keep components = columns, None will not
               use pca. 0.3 will reduce the components to cols/1.3. So 1 will half the features of the 
               model.
        fit_pca: fitted pca object, used for test data.
        
    '''
    
    def __init__(self, model, balance=None, params=None, test=False, scale_fit=None, mean_fit=None, fp_tol=None, bag=False, boost=False, reduce=None, fit_pca=None, n_jobs=1,balparams=None):
        self.model = model
        self.test = test
        self.params=params
        self.scale_fit = scale_fit
        self.mean_fit = mean_fit
        #self.classification_thresh = classification_thresh 0.5
        #self.fp_tol = fp_tol None
        self.balance = balance
        self.bag=bag
        self.boost=boost
        self.reduce=reduce
        self.fit_pca=fit_pca
        self.n_jobs=n_jobs
        self.balparams=balparams
    
    def standardize(self, X):
        '''
        standardizes the data, fits and transforms train
        data, transforms test data. returns a numpy array
        '''
        scaler = StandardScaler()
        if self.test == False:
            X=scaler.fit_transform(X)
            return X
        else:
            scaler.scale_=self.scale_fit
            scaler.mean_=self.mean_fit
            X=scaler.transform(X)
            return X
        
    def find_hyper_params(self, X, y):
        '''
        gridsearches the X,y dataset to find optimal
        hyper params. Uses precision as a score metric.
        returns best_estimator
        '''
        gs = GridSearchCV(self.model, self.params,
                         cv=StratifiedKFold(n_splits=5, shuffle=True),
                         n_jobs=self.n_jobs,verbose=1)
        gs.fit(X,y)
        return gs.best_estimator_
    
    def balance_bag_grid(self, X, y):
        '''
        gridsearches on the balanced bagging classifier.
        used on imbalanced datasets. params cannot be set
        outside the code and returns the best_estimator.
        '''
        self.model = BalancedBaggingClassifier(self.model, n_estimators=200,
                                                      boostrap=True, n_jobs=self.n_jobs,
                                                      verbose=1)
        params_imb={
            'max_samples':[1.0,0.7,0.5,0.3],
            'sampling_strategy':[1.0, 0.6, 0.3],
            'replacement':[True]
        }
        gs=GridSearchCV(self.model, params_imb, cv=StratifiedKFold(n_splits=5, shuffle=True),
                        verbose=1)
        gs.fit(X,y)
        return gs.best_estimator_
    
    def bag_me(self):
        '''
        bags and then gridsearches on the base estimator
        returns the best bagged estimator. Doesnt
        use find_hyper_params() becuasue need best_score_
        attribute from the GridSearchCV()
        '''
        bagged_model=BaggingClassifier(base_estimator=self.model,
                                    n_estimators=200,
                                    n_jobs=self.n_jobs)
        
        
        
        return bagged_model
        
    
    def boost_me(self):
        '''
        boosts the base estimator and returns the
        boosted model
        '''
        boosted_model=AdaBoostClassifier(base_estimator=self.model,
                                     n_estimators=100)
        return boosted_model
    
    def pca_me(self, X):
        '''
        if train fits and transforms a pca decomposition onto the
        dataset using the reduction ratio specified in object initilization.
        returns decomposed and reduced data aswell as the pca object
        
        if test uses the pca object specified in initilization and
        transforms the test set with it. returns the decomposed and reduced
        data aswell as the pca object
        '''
        if self.test==False:
            self.reduce+=1
            ncom=int(len(X)//self.reduce)
            pca = PCA(n_components=ncom)
            coef_names=[f'PC_{x}' for x in range(1, pca.n_components+1)]
            X = pca.fit_transform(X)
        else:
            pca = self.fit_pca
            X = pca.transform(X)
        return X, pca
    
    def over_samp_grid(self, sampler, X,y):
        kf=StratifiedKFold(n_splits=5,shuffle=True)
        bal_pipe=make_pipeline_imb(sampler,self.model)
        gs=GridSearchCV(bal_pipe,self.balparams,cv=kf,
                       n_jobs=self.n_jobs,
                       verbose=1)
        gs.fit(X_train,y_train)
        
        return gs.best_estimator_ #opperates like a best model
            
        
            
    def transform(self, X,y):
        '''
        deals with X and y dataset depending on initilization
        paramaters. returns score of dataset and predicted probabilities.
        if train set also returns coefficient values, a fitted model
        object and a fitted pca object, if pca was used.
        '''
        
        params_bag={
            'max_samples':[1.0, 0.7, 0.5, 0.3],
            'max_features':[1.0, 0.9, 0.5]
        }
        
        params_bag_bal={
            'baggingclassifier__max_samples':[1.0, 0.7, 0.5, 0.3],
            'baggingclassifier__max_features':[1.0, 0.9, 0.5]
        }
        
        X = self.standardize(X)
        
        if self.reduce!=None:
            X, pca=self.pca_me(X)
        
        if self.test==False:
            if self.balance=='over':
                sampler=RandomOverSampler()
                self.model=self.over_samp_grid(sampler,X,y)
                if self.bag==True:
                    self.model=self.bag_me()
                    self.params=params_bag_bal
                    self.model=self.over_samp_grid(sampler,X,y)
                
                if self.boost==True:
                    self.model=self.boost_me()
                
                
            elif self.balance=='under':
                #what is happening here!?!?!
                sampler = RandomUnderSampler()
                self.model=self.find_hyper_params(X,y)
            
                self.model=balance_bag_grid(X,y)
                self.model.fit(X,y)
                
            elif self.balance=='smote':
                sampler = SMOTE()
                self.model=self.over_samp_grid(sampler,X,y)
                print(self.model)
                if self.bag==True:
                    self.model=self.bag_me()
                    self.params=params_bag_bal
                    self.model=self.over_samp_grid(sampler,X,y)
                
                if self.boost==True:
                    self.model=self.boost_me()
                
            elif self.balance=='adasyn':
                sampler = ADASYN(n_neighbors=20)
                self.model=self.over_samp_grid(sampler,X,y)
                if self.bag==True:
                    self.model=self.bag_me()
                    self.params=params_bag_bal
                    self.model=self.over_samp_grid(sampler,X,y)
                
                if self.boost==True:
                    self.model=self.boost_me()
                
            else:
                self.model=self.find_hyper_params(X,y)
                if self.bag==True:
                    self.model=self.bag_me()
                    self.params=params_bag
                    self.model=self.find_hyper_params(X,y)
                
                if self.boost==True:
                    self.model=self.boost_me()
                    
                
            self.model.fit(X,y)
            
            
            print(np.mean(cross_val_score(self.model, X, y)))
            print(self.model)
            try:
                if len(self.model.coef_)==1:
                    coef = self.model.coef_[0]
                else:
                    coef = self.model.coef_
            except:
                
                try:
                    coef = self.model.feature_importances_
                except:
                    try:
                        coef = self.model.steps[-1][-1].coef_
                    except:
                        try:
                            coef=self.model.steps[-1][-1].feature_importances_
                        except:
                            coef=np.nan
            proba = [x[1] for x in self.model.predict_proba(X)]
            try:
                return coef, proba, self.model, pca
            except:
                return coef, proba, self.model
        #returns fitted optimal model and optimal threshold
        else:
            print(np.mean(cross_val_score(self.model, X, y)))
            proba = [x[1] for x in self.model.predict_proba(X)]
            return proba
        
        #returns score from a fitted model
    
    def fit(self, X,y):
        return self
    

class RegressPlayers(BaseEstimator, TransformerMixin):
    '''
    runs a classification model and any other processes specified by parameters
    
        model: unfitted model object if train set, fitted if test.
        test: bool, True for test set data. if True then no gridsearch or fitting takes place.
              just returns the score of the model on the test set and the predicted probabilites
        params: dict, used in gridsearch, specific to model
        scale_fit: the scale_fit attribute from standardscaler on a fit_transform of the train
                   needed for test set
        mean_fit: the mean_fit attribute from the standardscaler on a fit_transform of the train
                  needed for the test set
        bag: bool, if True then a bagging classifier is used. User cannot input params for this bag
             without editing code in class. Uses the best estimator from gridsearch as the base
        boost: bool, if True then a adaboost is used. Uses the best etstimator fro the gridsearch
               as the base.
        reduce:float(>0), 0 will perform pca on columns and keep components = columns, None will not
               use pca. 0.3 will reduce the components to cols/1.3. So 1 will half the features of the 
               model.
        fit_pca: fitted pca object, used for test data.
    '''
    
    def __init__(self, model, params=None, test=False, scale_fit=None, mean_fit=None, bag=False, boost=False, reduce=None, fit_pca=None, n_jobs=1):
        self.model=model
        self.params=params
        self.test=test
        self.scale_fit=scale_fit
        self.mean_fit=mean_fit
        self.bag=bag
        self.boost=boost
        self.reduce=reduce
        self.fit_pca=fit_pca
        self.n_jobs=n_jobs
        
    def standardize(self, X):
        '''
        standardizes the data, fits and transforms train
        data, transforms test data. returns a numpy array
        '''
        scaler = StandardScaler()
        if self.test == False:
            X=scaler.fit_transform(X)
            return X
        else:
            scaler.scale_=self.scale_fit
            scaler.mean_=self.mean_fit
            X=scaler.transform(X)
            return X
        
    def find_hyper_params(self, X, y):
        '''
        gridsearches the X,y dataset to find optimal
        hyper params. Uses precision as a score metric.
        returns best_estimator
        '''
        gs = GridSearchCV(self.model, self.params,
                         cv=KFold(n_splits=5, shuffle=True),
                         verbose=1, n_jobs=self.n_jobs)
        gs.fit(X,y)
        return gs.best_estimator_
    
    def bag_me(self, X, y):
        '''
        bags and then gridsearches on the base estimator
        returns the best bagged estimator. Doesnt
        use find_hyper_params() becuasue need best_score_
        attribute from the GridSearchCV()
        '''
        self.model=BaggingRegressor(base_estimator=self.model,
                                    n_estimators=900, n_jobs=self.n_jobs)
        
        params_bag={
            'max_samples':[1.0, 0.7, 0.5],
            'max_features':[1.0, 0.9, 0.5]
        }
        
        gs = GridSearchCV(self.model, params_bag,
                         cv=KFold(n_splits=5, shuffle=True))
        
        gs.fit(X,y)
        return gs.best_estimator_
        
    
    def boost_me(self, X, y):
        '''
        boosts the base estimator and returns the
        boosted model
        '''
        boosted_model=AdaBoostRegressor(base_estimator=self.model,
                                       n_estimators=10000)
        return boosted_model
    
    def pca_me(self, X):
        '''
        if train fits and transforms a pca decomposition onto the
        dataset using the reduction ratio specified in object initilization.
        returns decomposed and reduced data aswell as the pca object
        
        if test uses the pca object specified in initilization and
        transforms the test set with it. returns the decomposed and reduced
        data aswell as the pca object
        '''
        if self.test==False:
            self.reduce+=1
            ncom=int(len(X[0])//self.reduce)
            pca = PCA(n_components=ncom)
            coef_names=[f'PC_{x}' for x in range(1, pca.n_components+1)]
            X = pca.fit_transform(X)
        else:
            pca = self.fit_pca
            X = pca.transform(X)
        return X, pca
    
    def transform(self, X, y):
        '''
        deals with X and y dataset depending on initilization
        paramaters. returns score of dataset and predicted probabilities.
        if train set also returns coefficient values, a fitted model
        object and a fitted pca object, if pca was used.
        '''
        X = self.standardize(X)
        
        if self.reduce!=None:
            X, pca = self.pca_me(X)
        elif (self.fit_pca!=None) and (self.test==True):
            X, pca = self.pca_me(X)
        
        if self.test==False:
            
            self.model=self.find_hyper_params(X,y)
            
            if self.bag==True:
                self.model=self.bag_me(X, y)
            elif self.boost==True:
                self.model=self.boost_me(X,y)
                
            self.model.fit(X,y)
            
            
            print(np.mean(cross_val_score(self.model, X, y)))
            print(self.model)
            try:
                if len(self.model.coef_)==1:
                    coef = self.model.coef_[0]
                else:
                    coef = self.model.coef_
                #check if right
            except:
                try:
                    coef = self.model.feature_importances_
                except:
                    print('coef is empty!')
                    coef=[]
            pred = self.model.predict(X)
            try:
                return coef, pred, self.model, pca
            except:
                return coef, pred, self.model
        #returns fitted optimal model and optimal threshold
        else:
            print(np.mean(cross_val_score(self.model, X, y)))
            pred = self.model.predict(X)
            return pred
    
    def fit(self):
        return self



### Linear Regressions

In [164]:
data = pd.read_csv('linear_data.csv')

to_drop=['Unnamed: 0']
target='next_week_points'

footprep=FootballPreprocessor(target=target, to_drop=to_drop,
                             time_split=True)

X_train, X_test, y_train, y_test, coef_names = footprep.transform(data)

scale_fit=StandardScaler().fit(X_train).scale_
mean_fit=StandardScaler().fit(X_train).mean_

X (57809, 19)
X_train (40949, 18)
X_test (16860, 18)


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [31]:
lin_params={
    'fit_intercept':[True,False]
}

lin_model=LinearRegression()

lin_train=RegressPlayers(model=lin_model,
                         params=lin_params,
                         n_jobs=3)

lin_coef, lin_pred_train, lin_opt_model=lin_train.transform(X_train,
                                                           y_train)

lin_test = RegressPlayers(model=lin_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

lin_pred_test=lin_test.transform(X_test,y_test)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


0.28671106891766346
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
0.26773084320020074


[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    2.4s finished


In [32]:
dump(lin_opt_model, 'linear_model')

['linear_model']

In [36]:
lin_r_params={
    'alpha':np.logspace(-5,5,10),
    'fit_intercept':[True,False]
}

lin_model=Ridge()

lin_train=RegressPlayers(model=lin_model,
                         params=lin_r_params,
                         n_jobs=3)

lin_coef, lin_pred_train, lin_opt_model=lin_train.transform(X_train,
                                                           y_train)

lin_test = RegressPlayers(model=lin_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

lin_pred_test=lin_test.transform(X_test,y_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits
0.2867318188785422
Ridge(alpha=46.41588833612782, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
0.2677557427956227


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    1.2s finished


In [37]:
dump(lin_opt_model, 'ridge_model')

['ridge_model']

In [38]:
lin_l_params={
    'alpha':np.logspace(-5,5,10),
    'fit_intercept':[True,False]
}

lin_model=Lasso()

lin_train=RegressPlayers(model=lin_model,
                         params=lin_l_params,
                         n_jobs=3)

lin_coef, lin_pred_train, lin_opt_model=lin_train.transform(X_train,
                                                           y_train)

lin_test = RegressPlayers(model=lin_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

lin_pred_test=lin_test.transform(X_test,y_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=3)]: Done  48 tasks      | elapsed:    2.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    2.7s finished


0.28672564147768226
Lasso(alpha=0.0001291549665014884, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
0.2677421448105573




In [39]:
dump(lin_opt_model, 'lasso_model')

['lasso_model']

In [41]:
lin_e_params={
    'alpha':np.logspace(-5,5,10),
    'l1_ratio':np.logspace(-5,5,10),
    'fit_intercept':[True,False]
}

lin_model=ElasticNet()

lin_train=RegressPlayers(model=lin_model,
                         params=lin_e_params,
                         n_jobs=3)

lin_coef, lin_pred_train, lin_opt_model=lin_train.transform(X_train,
                                                           y_train)

lin_test = RegressPlayers(model=lin_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

lin_pred_test=lin_test.transform(X_test,y_test)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   13.0s
[Parallel(n_jobs=3)]: Done 200 tasks      | elapsed:   47.7s


0.28673046030938076
ElasticNet(alpha=0.0016681005372000592, copy_X=True, fit_intercept=True,
      l1_ratio=0.2782559402207126, max_iter=1000, normalize=False,
      positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
0.2677670635991219


[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:  1.3min finished


In [43]:
dump(lin_opt_model, 'enet_model')

['enet_model']

Ridge is the best model by a margin. So I use Ridge and Lasso in bagging and boosting ensembles.

In [54]:
lin_r_params={
    'alpha':np.logspace(-5,5,10),
    'fit_intercept':[True,False]
}

lin_model=Ridge()

lin_train=RegressPlayers(model=lin_model,
                         params=lin_r_params,
                         n_jobs=3,
                         boost=True)

lin_coef, lin_pred_train, lin_opt_model=lin_train.transform(X_train,
                                                           y_train)

lin_test = RegressPlayers(model=lin_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

lin_pred_test=lin_test.transform(X_test,y_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    1.0s finished


0.0436622094206068
AdaBoostRegressor(base_estimator=Ridge(alpha=46.41588833612782, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
         learning_rate=1.0, loss='linear', n_estimators=1000000,
         random_state=None)
coef is empty!
0.19061978472332466




In [56]:
dump(lin_opt_model,'boosted_ridge_model')

['boosted_ridge_model']

In [57]:
lin_l_params={
    'alpha':np.logspace(-5,5,10),
    'fit_intercept':[True,False]
}

lin_model=Lasso()

lin_train=RegressPlayers(model=lin_model,
                         params=lin_l_params,
                         n_jobs=3,
                         boost=True)

lin_coef, lin_pred_train, lin_opt_model=lin_train.transform(X_train,
                                                           y_train)

lin_test = RegressPlayers(model=lin_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

lin_pred_test=lin_test.transform(X_test,y_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=3)]: Done  49 tasks      | elapsed:    2.6s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    3.4s finished


0.024484677121300275
AdaBoostRegressor(base_estimator=Lasso(alpha=0.0001291549665014884, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
         learning_rate=1.0, loss='linear', n_estimators=10000,
         random_state=None)
coef is empty!




0.194226275665994




In [58]:
dump(lin_opt_model,'boosted_lasso_model')

['boosted_lasso_model']

In [64]:
lin_r_params={
    'alpha':np.logspace(-5,5,10),
    'fit_intercept':[True,False]
}

lin_model=Ridge()

lin_train=RegressPlayers(model=lin_model,
                         params=lin_r_params,
                         n_jobs=3,
                         bag=True)

lin_coef, lin_pred_train, lin_opt_model=lin_train.transform(X_train,
                                                           y_train)

lin_test = RegressPlayers(model=lin_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

lin_pred_test=lin_test.transform(X_test,y_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    3.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    6.9s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.6s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    3.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    8.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.8s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend wi

[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.1s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    5.6s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.1s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    5.6s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concu

[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    3.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    3.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concu

0.286715552916115
BaggingRegressor(base_estimator=Ridge(alpha=46.41588833612782, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.7, n_estimators=900, n_jobs=3, oob_score=False,
         random_state=None, verbose=1, warm_start=False)
coef is empty!


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.2s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.2s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concu

0.2677213362173199


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished


In [65]:
dump(lin_opt_model,'bagged_ridge_model')

['bagged_ridge_model']

In [66]:
lin_l_params={
    'alpha':np.logspace(-5,5,10),
    'fit_intercept':[True,False]
}

lin_model=Lasso()

lin_train=RegressPlayers(model=lin_model,
                         params=lin_l_params,
                         n_jobs=3,
                         bag=True)

lin_coef, lin_pred_train, lin_opt_model=lin_train.transform(X_train,
                                                           y_train)

lin_test = RegressPlayers(model=lin_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

lin_pred_test=lin_test.transform(X_test,y_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    2.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:   53.6s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.6s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:   54.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.7s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.8s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concu

[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.2s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  2.0min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.2s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  1.4min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.2s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concu

[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:   11.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:   13.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concu

0.2867190021163802
BaggingRegressor(base_estimator=Lasso(alpha=0.0001291549665014884, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.7, n_estimators=900, n_jobs=3, oob_score=False,
         random_state=None, verbose=1, warm_start=False)
coef is empty!


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.9s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:   13.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:   15.1s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:   11.9s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concu

0.26769876217307426


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished


In [67]:
dump(lin_opt_model,'bagged_lasso_model')

['bagged_lasso_model']

# Now to create the lagged error feature

In [217]:
def standardize(X_train, X_test):
    scaler=StandardScaler()
    X_train_ss=scaler.fit_transform(X_train)
    X_test_ss=scaler.transform(X_test)
    return X_train_ss, X_test_ss

def nan_first_error_lag(row, df):
    '''
    tries to slice the dataframe at the index
    above the row, if its the first row of a player
    this will through an error as the index doesnt
    exist (due to dropping these rows ealier). Hence,
    the error term is the error of another player and 
    should not be included and is made a np.nan. If the
    row above can be sliced then the error is returned 
    unchanged.
    '''
    i=row.name
    try:
        df.loc[i-1]
        not_first=True
    except:
        not_first=False
    
    if not_first==False:
        return np.nan
    else:
        return row['error_lag_1']

def get_error_lags(model_filename, data_filename):
    '''
    gets the lagged errors in the X_train and X_test
    datasets and drops the first row of each player so
    that another players lagged error is not assigned
    wrongly. Requires a stored model.
    '''
    
    data = pd.read_csv(data_filename)

    to_drop=['Unnamed: 0']
    target='next_week_points'

    footprep=FootballPreprocessor(target=target, to_drop=to_drop,
                                 time_split=True)

    X_train, X_test, y_train, y_test, coef_names = footprep.transform(data)
    
    model=load(model_filename)
    
    X_train_ss, X_test_ss=standardize(X_train,X_test)
    
    y_train_pred=model.predict(X_train_ss)
    y_test_pred=model.predict(X_test_ss)
    
    error_train=y_train - y_train_pred
    error_test=y_test - y_test_pred
    
    #We now have the error columns, but that cannot be simply concated onto 
    #the dataframe and shifted down. The errors of another player will not 
    #be informative to a new player. We must drop the first row of every player 
    #after the shifted errors are added.
    
    X_train['errors']=error_train
    X_test['errors']=error_test
    
    X_train['error_lag_1']=X_train['errors'].shift()
    X_test['error_lag_1']=X_test['errors'].shift()
    
    X_train['next_week_points']=y_train
    X_test['next_week_points']=y_test
    #done so that the rows dropped from the features
    #have their targets removed aswell.
    
    
    X_train['error_lag_1']=X_train.apply(nan_first_error_lag,axis=1,df=X_train)
    X_test['error_lag_1']=X_test.apply(nan_first_error_lag,axis=1,df=X_test)
    
    X_train.drop('errors', axis=1, inplace=True)
    X_test.drop('errors', axis=1, inplace=True)
    
    X_train.dropna(inplace=True)
    X_test.dropna(inplace=True)
    
    y_train=X_train.pop('next_week_points')
    y_test=X_test.pop('next_week_points')
    
    
    print('X_train: ',X_train.shape)
    print('y_train: ',y_train.shape)
    print('X_test: ',X_test.shape)
    print('y_test: ',y_test.shape)
    
    return X_train, X_test, y_train, y_test
    
    
    
    

We now have the lagged error of a players previous game in each row. Accounting for the MA(1) time dynamic.

In [218]:
X_train,X_test,y_train,y_test=get_error_lags('linear_model','linear_data.csv')

X (57809, 19)
X_train (40949, 18)
X_test (16860, 18)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


X_train:  (39638, 19)
y_train:  (39638,)
X_test:  (16251, 19)
y_test:  (16251,)


In [219]:
scale_fit=StandardScaler().fit(X_train).scale_
mean_fit=StandardScaler().fit(X_train).mean_

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [220]:
lin_params={
    'fit_intercept':[True,False]
}

lin_model=LinearRegression()

lin_train=RegressPlayers(model=lin_model,
                         params=lin_params,
                         n_jobs=3)

lin_coef, lin_pred_train, lin_opt_model=lin_train.transform(X_train,
                                                           y_train)

lin_test = RegressPlayers(model=lin_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

lin_pred_test=lin_test.transform(X_test,y_test)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of  10 | elapsed:    0.3s remaining:    0.3s
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.4s finished


0.2878056661994199
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
0.26746939252930785


In [221]:
dump(lin_opt_model, 'ma_linear_model')

['ma_linear_model']

In [222]:
X_train,X_test,y_train,y_test=get_error_lags('boosted_ridge_model',
                                             'linear_data.csv')

X (57809, 19)
X_train (40949, 18)
X_test (16860, 18)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


X_train:  (39638, 19)
y_train:  (39638,)
X_test:  (16251, 19)
y_test:  (16251,)


In [223]:
lin_r_params={
    'alpha':np.logspace(-5,5,10),
    'fit_intercept':[True,False]
}

lin_model=Ridge()

lin_train=RegressPlayers(model=lin_model,
                         params=lin_r_params,
                         n_jobs=3,
                         boost=True)

lin_coef, lin_pred_train, lin_opt_model=lin_train.transform(X_train,
                                                           y_train)

lin_test = RegressPlayers(model=lin_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

lin_pred_test=lin_test.transform(X_test,y_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    1.5s finished


0.01685074110520953
AdaBoostRegressor(base_estimator=Ridge(alpha=46.41588833612782, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
         learning_rate=1.0, loss='linear', n_estimators=10000,
         random_state=None)
coef is empty!
0.18920617485924088




In [224]:
dump(lin_opt_model, 'ma_boosted_ridge_model')

['ma_boosted_ridge_model']

# Tree Regressions

In [242]:
data = pd.read_csv('tree_data.csv')

to_drop=['Unnamed: 0']
target='next_week_points'

footprep=FootballPreprocessor(target=target, to_drop=to_drop,
                             time_split=True)

X_train, X_test, y_train, y_test, coef_names = footprep.transform(data)

scale_fit=StandardScaler().fit(X_train).scale_
mean_fit=StandardScaler().fit(X_train).mean_

X (57809, 69)
X_train (40949, 68)
X_test (16860, 68)


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [244]:
tree_params={
    'splitter':['best','random'],
    'min_samples_split':[500,100,50,30],
    'max_features':['auto', 'sqrt'],
    'max_leaf_nodes':[None, 4, 2]
}

tree_model=DecisionTreeRegressor()

tree_train=RegressPlayers(model=tree_model,
                          params=tree_params,
                          n_jobs=3)

tree_coef, tree_pred_train, tree_opt_model=tree_train.transform(X_train,
                                                                y_train)

tree_test=RegressPlayers(model=tree_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

tree_pred_test=tree_test.transform(X_test,
                                   y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   16.8s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   28.7s
[Parallel(n_jobs=3)]: Done 240 out of 240 | elapsed:   30.9s finished


0.25264252488452105
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=500, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='random')
0.2323853388548722




In [247]:
dump(tree_opt_model, 'tree_reg_model')

['tree_reg_model']

In [251]:
tree_params={
    'splitter':['best','random'],
    'min_samples_split':[500,100,50,30],
    'max_features':['auto', 'sqrt'],
    'max_leaf_nodes':[None, 4, 2]
}

tree_model=DecisionTreeRegressor()

tree_train=RegressPlayers(model=tree_model,
                          params=tree_params,
                          n_jobs=3,
                          boost=True)

tree_coef, tree_pred_train, tree_opt_model=tree_train.transform(X_train,
                                                                y_train)

tree_test=RegressPlayers(model=tree_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

tree_pred_test=tree_test.transform(X_test,
                                   y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   15.6s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   28.8s
[Parallel(n_jobs=3)]: Done 240 out of 240 | elapsed:   31.5s finished


-0.7150704001000022
AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=500, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='random'),
         learning_rate=1.0, loss='linear', n_estimators=10000,
         random_state=None)




-0.8453912545663808


In [252]:
dump(tree_opt_model, 'boosted_tree_reg_model')

['boosted_tree_reg_model']

In [253]:
forest_params={
    'max_depth':[None,20,10,5,3,2],
    'min_samples_split':[500,100,50,30],
    'max_features':['auto','sqrt',10]
}

forest_model=RandomForestRegressor(n_estimators=900,
                                   n_jobs=3)

forest_train=RegressPlayers(model=forest_model,
                           params=forest_params,
                           n_jobs=1)

forest_coef, forest_pred_train, forest_opt_model=forest_train.transform(X_train,
                                                                        y_train)

forest_test=RegressPlayers(model=forest_opt_model,
                          test=True,
                          scale_fit=scale_fit,
                          mean_fit=mean_fit)

forest_pred_test=forest_test.transform(X_test,
                                       y_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


KeyboardInterrupt: 

In [None]:
dump(forest_opt_model, 'forest_reg_model')

# Add MA(1)

In [245]:
X_train,X_test,y_train,y_test=get_error_lags('forest_reg_model','tree_data.csv')

# SVM Regression

In [254]:
data = pd.read_csv('linear_data.csv')

to_drop=['Unnamed: 0']
target='next_week_points'

footprep=FootballPreprocessor(target=target, to_drop=to_drop,
                             time_split=True)

X_train, X_test, y_train, y_test, coef_names = footprep.transform(data)

scale_fit=StandardScaler().fit(X_train).scale_
mean_fit=StandardScaler().fit(X_train).mean_

X (57809, 19)
X_train (40949, 18)
X_test (16860, 18)


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [None]:
svr_params={
    'kernel':['poly','rbf','sigmoid','linear'],
    'gamma':np.logspace(-10,10,10),
    'coef0':np.logspace(-5,5,10),
    'C':np.logspace(-5,5,10),
    'epsilon':np.logspace(-5,5,10)
}

svr_model=SVR()

svr_train=RegressPlayers(model=svr_model,
                         params=svr_params,
                         n_jobs=3)

svr_coef, svr_pred_train, svr_opt_model=svr_train.transform(X_train,
                                                            y_train)

svr_test=Regress_Players(model=svr_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

svr_pred_test=svr_test.transform(X_train,
                                 y_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Fitting 5 folds for each of 40000 candidates, totalling 200000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 26.7min


In [None]:
dump(svr_opt_model, 'svr_model')

# Classifiers

In [3]:
data = pd.read_csv('linear_data.csv')

to_drop=['Unnamed: 0']
target='next_week_points'

footprep=FootballPreprocessor(target=target, to_drop=to_drop,
                             time_split=True,classify=True,threshold=3)

X_train, X_test, y_train, y_test, coef_names = footprep.transform(data)

scale_fit=StandardScaler().fit(X_train).scale_
mean_fit=StandardScaler().fit(X_train).mean_

X (57809, 19)
X_train (40949, 18)
X_test (16860, 18)


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [7]:
y_train.value_counts()

0    36049
1     4900
Name: next_week_points, dtype: int64

# Logistic Regression

In [None]:
log_params={
}

log_model=LogisticRegression()

log_train=ClassifyPlayers(model=log_model,
                         params=log_params,
                         n_jobs=3)

log_coef, log_pred_train, log_opt_model=log_train.transform(X_train,
                                                            y_train)

log_test=ClassifyPlayers(model=log_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

log_pred_test=log_test.transform(X_train,
                                 y_train)

In [None]:
dump(log_opt_model, 'log_model')

In [None]:
X_train_ma,X_test_ma,y_train,y_test=get_error_lags('log_model',
                                                     'linear_data.csv')

In [None]:
log_params={
}

log_model=LogisticRegression()

log_train=ClassifyPlayers(model=log_model,
                         params=log_params,
                         n_jobs=3)

log_coef, log_pred_train, log_opt_model=log_train.transform(X_train_ma,
                                                            y_train)

log_test=ClassifyPlayers(model=log_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

log_pred_test=log_test.transform(X_train_ma,
                                 y_train)

In [None]:
dump(log_opt_model, 'ma_log_model')

# Tree Classifications

In [None]:
data = pd.read_csv('tree_data.csv')

to_drop=['Unnamed: 0']
target='next_week_points'

footprep=FootballPreprocessor(target=target, to_drop=to_drop,
                             time_split=True,classify=True,threshold=3)

X_train, X_test, y_train, y_test, coef_names = footprep.transform(data)

scale_fit=StandardScaler().fit(X_train).scale_
mean_fit=StandardScaler().fit(X_train).mean_

In [None]:
tree_params={
}

tree_model=DecisionTreeClassifier()

tree_train=ClassifyPlayers(model=tree_model,
                          params=tree_params,
                          n_jobs=3)

tree_coef, tree_pred_train, tree_opt_model=tree_train.transform(X_train,
                                                                y_train)

tree_test=ClassifyPlayers(model=tree_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

tree_pred_test=tree_test.transform(X_test,
                                   y_test)

In [None]:
dump(tree_opt_model, 'tree_class_model')

In [None]:
X_train_ma,X_test_ma,y_train,y_test=get_error_lags('tree_class_model',
                                                     'tree_data.csv')

In [None]:
tree_params={
}

tree_model=DecisionTreeClassifier()

tree_train=ClassifyPlayers(model=tree_model,
                          params=tree_params,
                          n_jobs=3)

tree_coef, tree_pred_train, tree_opt_model=tree_train.transform(X_train_ma,
                                                                y_train)

tree_test=ClassifyPlayers(model=tree_opt_model,
                         test=True,
                         scale_fit=scale_fit,
                         mean_fit=mean_fit)

tree_pred_test=tree_test.transform(X_test_ma,
                                   y_test)

In [None]:
dump(tree_opt_model, 'ma_tree_class_model')

In [None]:
forest_params={
}

forest_model=RandomForestClassifier(n_estimators=300,
                                   n_jobs=3)

forest_train=ClassifyPlayers(model=forest_model,
                           params=forest_params,
                           n_jobs=1)

forest_coef, forest_pred_train, forest_opt_model=forest_train.transform(X_train,
                                                                        y_train)

forest_test=ClassifyPlayers(model=forest_opt_model,
                          test=True,
                          scale_fit=scale_fit,
                          mean_fit=mean_fit)

forest_pred_test=forest_test.transform(X_test,
                                       y_test)

In [None]:
dump(forest_opt_model, 'forest_class_model')

In [None]:
X_train_ma,X_test_ma,y_train,y_test=get_error_lags('forest_class_model',
                                                     'tree_data.csv')

In [None]:
forest_params={
}

forest_model=RandomForestClassifier(n_estimators=300,
                                   n_jobs=3)

forest_train=ClassifyPlayers(model=forest_model,
                           params=forest_params,
                           n_jobs=1)

forest_coef, forest_pred_train, forest_opt_model=forest_train.transform(X_train_ma,
                                                                        y_train)

forest_test=ClassifyPlayers(model=forest_opt_model,
                          test=True,
                          scale_fit=scale_fit,
                          mean_fit=mean_fit)

forest_pred_test=forest_test.transform(X_test_ma,
                                       y_test)

In [None]:
dump(forest_opt_model, 'ma_forest_class_model')