# In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

try:
    # Current location of scatter_matrix.
    from pandas.plotting import scatter_matrix
except ImportError:
    # Older pandas releases only expose it under pandas.tools.plotting.
    from pandas.tools.plotting import scatter_matrix


def get_seasons(data, start_season, num_seasons, matches_per_season=380):
    """Return the rows belonging to a run of consecutive seasons.

    Rows are selected purely by position, so ``data`` must be ordered so that
    each season occupies one contiguous run of ``matches_per_season`` rows.

    Args:
        data: Sliceable container of matches (e.g. a DataFrame or a list).
        start_season: Zero-based index of the first season to keep.
        num_seasons: Number of consecutive seasons to keep.
        matches_per_season: Number of rows per season. Defaults to 380, the
            value that was previously hard-coded.

    Returns:
        The slice ``data[start:stop]`` covering the requested seasons.
    """
    start = matches_per_season * start_season
    stop = start + matches_per_season * num_seasons
    return data[start:stop]

def delete_first_3_weeks(data):
    """Return only the rows of ``data`` whose ``MW`` value is greater than 3."""
    after_week_three = data.MW > 3
    return data[after_week_three]

def drop_basic_columns(data, basic_columns):
    """Return ``data`` restricted to the columns not listed in ``basic_columns``.

    The surviving columns keep their original order. Names in
    ``basic_columns`` that are not columns of ``data`` are ignored.
    """
    kept = []
    for name in data.columns:
        if name in basic_columns:
            continue
        kept.append(name)
    return data[kept]

def drop_teams_onehot(data):
    """Return ``data`` without the columns whose name contains ``"H_"`` or ``"A_"``.

    The check is a substring test anywhere in the column name, not only at
    its start, so e.g. a column called ``"MATCH_ID"`` is dropped as well.
    """
    kept = [
        name for name in data.columns
        if not ("H_" in name or "A_" in name)
    ]
    return data[kept]

def odds_to_prob(data):
    """Replace the ``IWH``, ``IWD`` and ``IWA`` columns with their reciprocals.

    Each of the three odds columns is overwritten in place with ``1 / odds``.
    (Previously ``IWD`` was overwritten with ``1 / IWA`` and ``IWA`` itself
    was left unconverted.)

    Args:
        data: DataFrame containing the columns ``IWH``, ``IWD`` and ``IWA``.

    Returns:
        The same ``data`` object, modified in place.
    """
    for col in ('IWH', 'IWD', 'IWA'):
        data[col] = 1 / data[col]
    return data

def extract_pca(data):
    """Add the PCA projection of the ``HTP``/``ATP`` columns to ``data``.

    A ``PCA`` with default settings is fitted on the two columns ``HTP`` and
    ``ATP``; the first and second projected coordinates are stored in the new
    columns ``pca0`` and ``pca1``.

    Args:
        data: DataFrame containing the columns ``HTP`` and ``ATP``.

    Returns:
        The same ``data`` object, with ``pca0`` and ``pca1`` added in place.
    """
    points = data[['HTP', 'ATP']]

    # Fit and project the same frame, and project only once: both output
    # columns come from a single transform result.
    pca = PCA().fit(points)
    projected = pca.transform(points)

    data['pca0'] = projected[:, 0]
    data['pca1'] = projected[:, 1]
    return data

# Standardising the data.
def scale_features(data, to_scale, scalerr=None):
    """Transform the selected columns of ``data`` with a scaler, in place.

    Args:
        data: DataFrame holding the columns to transform.
        to_scale: A column name or a list of column names to transform.
        scalerr: An already-fitted scaler exposing ``transform``. When
            ``None``, a new ``StandardScaler`` is created and fitted on the
            selected columns of ``data`` before transforming them.

    Returns:
        A tuple ``(data, scalerr)``: the modified ``data`` object and the
        scaler that was used, so it can be re-applied to other data.
    """
    # Always select with a list so the scaler receives 2-D input, even when
    # a single column name is given.
    cols = [to_scale] if isinstance(to_scale, str) else list(to_scale)

    if scalerr is None:
        scalerr = StandardScaler()
        scalerr.fit(data[cols])

    data[cols] = scalerr.transform(data[cols])
    return data, scalerr

def fill_nan(data):
    """Fill missing values in the ``IWH``, ``IWD`` and ``IWA`` columns.

    Missing ``IWH`` entries become 2.0; missing ``IWD`` and ``IWA`` entries
    become 4.0. ``data`` is modified in place and returned.
    """
    defaults = (('IWH', 2.0), ('IWD', 4.0), ('IWA', 4.0))
    for column, fallback in defaults:
        data[column] = data[column].fillna(fallback)
    return data

def form_to_str(data):
    """Cast the columns ``HM1``-``HM3`` and ``AM1``-``AM3`` to strings.

    ``data`` is modified in place and returned.
    """
    for column in ('HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3'):
        data[column] = data[column].astype('str')
    return data

def preprocess_features(X):
    """Convert the categorical feature columns of ``X`` into dummy variables.

    Columns with ``object`` dtype are expanded with ``pd.get_dummies``, using
    the column name as prefix. Columns whose name contains ``"FTR"`` are
    copied through unchanged regardless of dtype, as are all non-object
    columns. A summary of the resulting columns is printed.

    Args:
        X: DataFrame of features.

    Returns:
        A new DataFrame sharing the index of ``X`` with the processed columns
        in their original order.
    """
    # Start from an empty frame that shares the index of the input.
    output = pd.DataFrame(index=X.index)

    # DataFrame.items() replaces iteritems(), which newer pandas no longer has.
    for col, col_data in X.items():
        keep_as_is = "FTR" in col
        if col_data.dtype == object and not keep_as_is:
            col_data = pd.get_dummies(col_data, prefix=col)

        # Collect the (possibly expanded) column(s).
        output = output.join(col_data)

    # print() rather than the notebook-only display(), so the function also
    # works when run outside IPython.
    print("Processed feature columns ({} total features): {}".format(len(output.columns), list(output.columns)))
    return output

def only_hw(string, label_='H'):
    """Return ``label_`` when ``string`` equals it, otherwise ``'N'``."""
    return label_ if string == label_ else 'N'

def binarize_FTR(data, label='H'):
    """Apply ``only_hw`` to ``data`` so that only ``label`` is kept.

    Values equal to ``label`` are preserved; ``only_hw`` maps everything else
    to ``'N'``.
    """
    def keep_label(result):
        return only_hw(result, label_=label)

    return data.apply(keep_label)

#DATA EXPLORATION
def explore_data(data):
    """Print match counts and home/away win rates for ``data``.

    Args:
        data: DataFrame with one row per match and an ``FTR`` column whose
            values ``'H'`` / ``'A'`` are counted as home / away wins.

    Returns:
        The unchanged ``data`` object, so the call can be chained.
    """
    # Total number of matches (one row per match).
    n_matches = data.shape[0]

    # Number of features: every column except one.
    n_features = data.shape[1] - 1

    # Matches won by the home and by the away team.
    n_homewins = int((data.FTR == 'H').sum())
    n_awaywins = int((data.FTR == 'A').sum())

    def _percentage(count):
        # An empty frame has no matches; report 0% instead of dividing by zero.
        if not n_matches:
            return 0.0
        return (float(count) / n_matches) * 100

    win_rate = _percentage(n_homewins)
    win_away_rate = _percentage(n_awaywins)

    # Print the results
    print("Total number of matches: {}".format(n_matches))
    print("Number of features: {}".format(n_features))
    print("Number of matches won by home team: {}".format(n_homewins))
    print("Win rate of home team: {:.2f}%".format(win_rate))
    print("Number of matches won by away team: {}".format(n_awaywins))
    print("Win rate of away team: {:.2f}%".format(win_away_rate))
    return data

# Visualising distribution of data
def scatter(data):
    """Draw a scatter matrix of six feature columns and return ``data``.

    The plotted columns are ``HTGD``, ``ATGD``, ``HTP``, ``ATP``,
    ``DiffFormPts`` and ``DiffLP``; the figure size is 10 x 10.
    """
    plotted = ['HTGD', 'ATGD', 'HTP', 'ATP', 'DiffFormPts', 'DiffLP']
    scatter_matrix(data[plotted], figsize=(10, 10))
    return data