# This template was created to make grading fair and straightforward. Anything not placed where the template specifies will not be graded.

<font color='red'> # NOTE: We will run the notebook through a plagiarism checker. If it is found to be copied, your work will not be graded, and the incident will be reported to the NYU authorities. </font>

# Import Library and Dataset

In [1]:
import numpy as np
import sklearn as sk
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve,auc,precision_recall_curve,average_precision_score,make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
# Raise pandas' display limits so wide/long frames are printed without truncation.
for option_name, option_value in [('display.max_rows', 500),
                                  ('display.max_columns', 500),
                                  ('display.width', 1000)]:
    pd.set_option(option_name, option_value)

# Load the leaderboard training and test sets
# (an alternative training file, "./qudditch_training.csv", was used previously).
data, test = (pd.read_csv(csv_path)
              for csv_path in ("./leaderboard_training.csv", "./leaderboard_test.csv"))

Dataset shape: (100766, 48)
Test dataset shape: (500, 47)


# PART I: Preprocessing

#### Handling missing values. (If ANY)

In [2]:
## Replace ? with NaN to handle missing values easily with pandas library.
## np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)
test = test.replace('?', np.nan)

## Drop columns with too many missing values
sparse_columns = ['move_specialty', 'weight', 'player_code']
data = data.drop(columns=sparse_columns)
test = test.drop(columns=sparse_columns)

## Drop columns unique to a player like id and player id_num
identifier_columns = ['player_id', 'id_num']
data = data.drop(columns=identifier_columns)
test = test.drop(columns=identifier_columns)

## Set missing values in the house to Other
data = data.fillna(value={'house': 'Other'})
test = test.fillna(value={'house': 'Other'})

## Set Unknown/Invalid gender to the most common gender.
## The fill value is taken from the TRAINING data only and reused for the test set,
## so both sets are imputed identically and nothing is estimated from test data.
most_common_gender = data['gender'].mode()[0]
data.loc[data.gender == 'Unknown/Invalid', 'gender'] = most_common_gender
test.loc[test.gender == 'Unknown/Invalid', 'gender'] = most_common_gender

#### Feature Datatype Conversion From Numeric to categoric and Vice-versa. (If ANY)

In [3]:
## Set label to 0-1 instead of No,Yes
data['quidditch_league_player'] = data['quidditch_league_player'].replace({'NO': 0, 'YES': 1})

## Binary encode gender information
gender_codes = {'Female': 1, 'Male': 0}
data['gender'] = data['gender'].replace(gender_codes)
test['gender'] = test['gender'].replace(gender_codes)

## One hot encode house, player type, game_move_id, penalty_id and foul_type_id.
## The dummy layout is defined by the TRAINING data (first category dropped as the
## baseline). The test dummies are then re-indexed onto exactly those columns, so a
## category missing from test becomes an all-zero column and a category unseen in
## training is discarded. Encoding each set independently could otherwise produce
## different columns, or drop a different baseline category, in train and test.
for col in ['house', 'player_type', 'game_move_id', 'penalty_id', 'foul_type_id']:
    train_dummies = pd.get_dummies(data[col], prefix=col, drop_first=True)
    test_dummies = pd.get_dummies(test[col], prefix=col).reindex(
        columns=train_dummies.columns, fill_value=0)
    data = pd.concat([data.drop(col, axis=1), train_dummies], axis=1)
    test = pd.concat([test.drop(col, axis=1), test_dummies], axis=1)

## Ordinal encode the two measurement columns: None -> 0, Norm -> 1, above-normal -> 2
snitchnip_codes = {'>200': 2, '>300': 2, 'Norm': 1, 'None': 0}
data['snitchnip'] = data['snitchnip'].replace(snitchnip_codes)
test['snitchnip'] = test['snitchnip'].replace(snitchnip_codes)

stooging_codes = {'>7': 2, '>8': 2, 'Norm': 1, 'None': 0}
data['stooging'] = data['stooging'].replace(stooging_codes)
test['stooging'] = test['stooging'].replace(stooging_codes)

## In the below attbs replace No with 0 and Steady,Up,Down with 1, as we thought of
## them as either being there or not there.
attbs = ['body_blow','checking','dopplebeater_defence','no_hands_tackle','sloth_grip_roll',
       'twirl','spiral_dive','wronski_feint','zig-zag','porskoff_ploy','transylvanian_tackle' ,'woollongong_shimmy',
       'power_play','starfish_and_stick','bludger_backbeat','hawkshead_attacking_formation','chelmondiston_charge',
       'dionysus_dive','double_eight_loop','finbourgh_flick','reverse_pass','parkins_pincer','plumpton_pass']

## Before collapsing the values, keep a <attb>_temp flag per tactic:
## 0 when the value is No or Steady, 1 for anything else (i.e. Up/Down).
unchanged_values = ['No', 'Steady']
for col in attbs:
    col_temp = col + '_temp'
    data[col_temp] = (~data[col].isin(unchanged_values)).astype(int)
    test[col_temp] = (~test[col].isin(unchanged_values)).astype(int)

usage_codes = {'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1}
data[attbs] = data[attbs].replace(usage_codes)
test[attbs] = test[attbs].replace(usage_codes)

## Replace Change and Snitch Count with their binary encoding
data['change'] = data['change'].replace({'No': 0, 'Ch': 1})
test['change'] = test['change'].replace({'No': 0, 'Ch': 1})

data['snitch_caught'] = data['snitch_caught'].replace({'No': 0, 'Yes': 1})
test['snitch_caught'] = test['snitch_caught'].replace({'No': 0, 'Yes': 1})
print("Dataset shape:", data.shape)

Dataset shape: (100766, 121)


#### Feature Reduction or extraction. (If ANY)

In [4]:
def _running_total(frame, columns):
    """Add up ``columns`` of ``frame`` one by one, starting from 0.

    Returns the scalar 0 when ``columns`` is empty, otherwise the element-wise sum.
    """
    total = 0
    for column in columns:
        total = total + frame[column]
    return total


## Tactics used fewer than 100 times in the training data are almost always No and
## carry little information: drop them together with their *_temp companions.
rare_attbs = [attb for attb in attbs if data[attb].sum() < 100]
rare_columns = rare_attbs + [attb + '_temp' for attb in rare_attbs]
data = data.drop(columns=rare_columns)
test = test.drop(columns=rare_columns)
attbs = [attb for attb in attbs if attb not in rare_attbs]

## Combine notpartof, injured and satout to have num_games_notperformed
data['num_games_notperformed'] = data['num_games_notpartof'] + data['num_games_injured'] + data['num_games_satout']
test['num_games_notperformed'] = test['num_games_notpartof'] + test['num_games_injured'] + test['num_games_satout']

## num_tac_changes: sum of the *_temp flags of the remaining tactics;
## the individual flags are dropped afterwards.
temp_columns = [attb + '_temp' for attb in attbs]
data['num_tac_changes'] = _running_total(data, temp_columns)
test['num_tac_changes'] = _running_total(test, temp_columns)
data = data.drop(columns=temp_columns)
test = test.drop(columns=temp_columns)

## num_tac_used: sum of the 0/1 encoded tactic columns;
## the individual tactic columns are dropped afterwards.
data['num_tac_used'] = _running_total(data, attbs)
test['num_tac_used'] = _running_total(test, attbs)
data = data.drop(columns=attbs)
test = test.drop(columns=attbs)
data.head()

Unnamed: 0,gender,age,game_duration,num_game_moves,num_game_losses,num_practice_sessions,num_games_satout,num_games_injured,num_games_notpartof,num_games_won,snitchnip,stooging,change,snitch_caught,quidditch_league_player,house_Hufflepuff,house_Other,house_Ravenclaw,house_Slytherin,player_type_Beater2,player_type_Captain,player_type_Chaser1,player_type_Chaser2,player_type_Chaser3,player_type_Keeper,player_type_Multiple,player_type_Seeker,game_move_id_2,game_move_id_3,game_move_id_4,game_move_id_5,game_move_id_6,game_move_id_7,game_move_id_8,game_move_id_9,game_move_id_10,game_move_id_11,game_move_id_12,game_move_id_13,game_move_id_14,game_move_id_15,game_move_id_16,game_move_id_17,game_move_id_18,game_move_id_19,game_move_id_20,game_move_id_22,game_move_id_23,game_move_id_24,game_move_id_25,game_move_id_27,game_move_id_28,penalty_id_2,penalty_id_3,penalty_id_4,penalty_id_5,penalty_id_6,penalty_id_7,penalty_id_8,penalty_id_9,penalty_id_10,penalty_id_11,penalty_id_13,penalty_id_14,penalty_id_17,penalty_id_20,penalty_id_22,penalty_id_25,foul_type_id_2,foul_type_id_3,foul_type_id_4,foul_type_id_5,foul_type_id_6,foul_type_id_7,foul_type_id_8,num_games_notperformed,num_tac_changes,num_tac_used
0,1,11.0,1,41,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,1,12.0,3,59,0,18,0,0,0,9,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,1,13.0,2,11,5,13,2,0,1,6,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1
3,0,14.0,2,44,1,16,0,0,0,7,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
4,0,14.5,1,51,0,8,0,0,0,5,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2


#### Any other Pre-processing Used. (Give the name along with the code.)

In [9]:
from scipy import stats

num_col = ['age', 'game_duration', 'num_game_moves', 'num_game_losses', 'num_practice_sessions', 'num_games_won',\
           'num_games_notpartof', 'num_games_injured', 'num_games_satout', 'num_games_notperformed', 'num_tac_changes',\
           'num_tac_used']

# Log-transform heavily skewed, heavy-tailed numeric columns.
# Columns with at most 2% zeros use log (training rows with non-positive values are
# dropped); the others use log1p (training rows with negative values are dropped).
for col in num_col:
    skew_val = data[col].skew()
    kurt_val = data[col].kurtosis()
    if abs(skew_val) > 2 and abs(kurt_val) > 2:
        zero_fraction = (data[col] == 0).mean()
        if zero_fraction <= 0.02:
            # .copy() so the assignment below writes to a real frame, not a view
            data = data[data[col] > 0].copy()
            smallest_kept = data[col].min()
            data[col] = np.log(data[col])
            # Test rows cannot be dropped: clamp to the smallest training value so
            # log() never produces -inf/NaN features at prediction time.
            test[col] = np.log(test[col].clip(lower=smallest_kept))
        else:
            data = data[data[col] >= 0].copy()
            data[col] = np.log1p(data[col])
            # Same reasoning: keep log1p() inside its domain for the test rows.
            test[col] = np.log1p(test[col].clip(lower=0))

# Standardize numeric columns; the test set reuses the TRAINING mean and std
m = np.mean(data[num_col], axis=0)
std = np.std(data[num_col], axis=0)
data[num_col] = (data[num_col] - m)/std
test[num_col] = (test[num_col] - m)/std

# Remove outliers: keep training rows whose numeric features all lie within 3 z-scores
data = data[(np.abs(stats.zscore(data[num_col])) < 3).all(axis=1)]

# Reduction of feature that has more than 80% co-relation with any other feature.
# Only the strict upper triangle is inspected, so of each correlated pair just the
# later column is dropped. Uses the builtin bool: the np.bool alias was removed in
# NumPy 1.24.
data_temp = data.drop(['quidditch_league_player'], axis=1)
corr_matrix = data_temp.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]
data = data.drop(to_drop, axis=1)
test = test.drop(to_drop, axis=1)
print("Dataset shape:", data.shape)

In [38]:
# Make the test frame carry exactly the training feature columns.
# Columns missing from test are added as all-zero columns; columns that exist only
# in test are dropped, since the scaler and the models are fitted on the training
# layout. (The previous list.remove() loop raised ValueError on a test-only column.)
target_column = 'quidditch_league_player'
data_columns = [column for column in data.columns if column != target_column]
test_columns = list(test.columns)

missing_in_test = [column for column in data_columns if column not in test_columns]
extra_in_test = [column for column in test_columns if column not in data_columns]

if not missing_in_test and not extra_in_test:
    print ("All columns are there")
else:
    if missing_in_test:
        print ("There are some missing columns", missing_in_test)
    if extra_in_test:
        print ("Dropping columns that only exist in the test set", extra_in_test)
    test = test.reindex(columns=data_columns, fill_value=0)

# Sort columns alphabetically so train and test share one column order
data = data.sort_index(axis=1)
test = test.sort_index(axis=1)

All columns are there


#### Scale the data

In [9]:
# Separate the label from the features; the test frame has no label column.
X = data.drop(columns=['quidditch_league_player'])
y = data['quidditch_league_player']
X_test = test

# Two independent scalers: `scaler` for the train/validation split,
# `scaler_all` for the model refit on the whole (subsampled) dataset.
scaler, scaler_all = StandardScaler(), StandardScaler()

#### Split train val and subsample

In [41]:
## Fix the seed so the split and the subsampling (and therefore every score reported
## below) are reproducible on Restart & Run All.
SEED = 42
np.random.seed(SEED)


def undersample_negatives(frame, label='quidditch_league_player'):
    """Balance ``frame`` by randomly undersampling the label-0 rows.

    All rows with ``label == 1`` are kept, and as many randomly chosen remaining rows
    (at most all of them) are appended after them.

    Parameters
    ----------
    frame : pandas.DataFrame containing the ``label`` column.
    label : name of the 0/1 label column.

    Returns
    -------
    pandas.DataFrame with the positive rows followed by the sampled negative rows.
    """
    is_positive = frame[label] == 1
    positives = frame.loc[is_positive]
    negatives = frame.loc[~is_positive]
    shuffled_positions = np.arange(negatives.shape[0])
    np.random.shuffle(shuffled_positions)
    keep_positions = shuffled_positions[:len(positives)]
    return pd.concat([positives, negatives.iloc[keep_positions]], axis=0)


## Split train and validation (20% of the shuffled rows go to validation)
n = len(data)
split = 0.2
data_indices = np.arange(n)
np.random.shuffle(data_indices)
val_indices = data_indices[0:int(n*split)]
train_indices = data_indices[int(n*split):]
data_val = data.iloc[val_indices]
data_train = data.iloc[train_indices]

## Subsample so label 1 and label 0 has same number of instances.
## Only the training part is balanced; the validation set keeps its original class mix.
data_train_sub = undersample_negatives(data_train)

## Also subsample whole data like this, to be used on training the best model
## after validation is done to pick the best hyperparameter. This way we use more data for test set and still
## pick hyperparameters on unseen data.
data_all = undersample_negatives(data)

## Scale data

In [42]:
label_column = 'quidditch_league_player'

# Balanced training split: the scaler is fitted here ...
X_train = scaler.fit_transform(data_train_sub.drop(label_column, axis=1))
y_train = data_train_sub[label_column]

# ... and only applied to the validation split.
X_val = scaler.transform(data_val.drop(label_column, axis=1))
y_val = data_val[label_column]

# Balanced full dataset, scaled with its own scaler, which is then reused for the test set.
X_all = scaler_all.fit_transform(data_all.drop(label_column, axis=1))
y_all = data_all[label_column]

X_test = scaler_all.transform(test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  import sys
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  del sys.path[0]


# PART II: Classification

### Model 1:
Model Name: Logistic Regression<br>
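Evaluation method and metric used Name: Train-validation split; hyperparameters selected by a 2-fold cross-validated randomized search scored on F1; accuracy and F1 reported on the validation set<br>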
Name of the Hyperparameter used: C,regularization term<br>

In [23]:
from sklearn import linear_model
clf = linear_model.LogisticRegression(C=1.,solver='newton-cg',penalty='l2')

# Candidate inverse regularization strengths
C = [0.0001,0.001, 0.01, 0.1, 1, 10, 100, 1000,10000]

random_grid = {'C':C}

# The search space has only len(C) points, so cap n_iter at that size: asking for more
# iterations than there are candidates warns on current scikit-learn (and is assumed
# to have raised on older releases). Every candidate is still evaluated.
rs_cv = RandomizedSearchCV(estimator = clf, param_distributions = random_grid,
                           n_iter = min(20, len(C)), cv = 2, scoring=make_scorer(f1_score))
rs_cv.fit(X_train, y_train)

# Evaluate the F1-selected model on the held-out (unbalanced) validation split
best_clf = rs_cv.best_estimator_
y_pred = best_clf.predict(X_val)
print("Accuracy is {0:.3f}".format(accuracy_score(y_val, y_pred)))
print("F1 is {0:.3f}".format(f1_score(y_val, y_pred)))
print ("Confusion matrix")
print (confusion_matrix(y_val, y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    4.1s finished


Accuracy is 0.664
F1 is 0.272
Confusion matrix
[[12116  5737]
 [ 1036  1264]]


### Model 2:
Model Name: MLP Classifier<br>
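Evaluation method and metric used Name: Train-validation split; hyperparameters selected by a 2-fold cross-validated randomized search scored on F1; accuracy and F1 reported on the validation set<br>
Name of the Hyperparameter used: Hidden layer sizes, alpha, solver, activation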

In [26]:
from sklearn.neural_network import MLPClassifier

# Base estimator; the searched parameters below override its initial values.
clf = MLPClassifier(hidden_layer_sizes=(3,3), activation='relu', solver='adam', alpha=1e-2,max_iter=1000)

# Search space for the randomized search
alpha = [1e-4,1e-3, 1e-2, 1e-1, 1, 10]
solver = ['lbfgs','sgd','adam']
activation = ['logistic','tanh','relu']
hidden_layer_sizes = [(width, width) for width in (3, 5, 7, 9, 13, 20)]
random_grid = dict(alpha=alpha,
                   solver=solver,
                   activation=activation,
                   hidden_layer_sizes=hidden_layer_sizes)

# 20 random candidates, 2-fold CV on the balanced training split, selected by F1
rs_cv = RandomizedSearchCV(estimator=clf,
                           param_distributions=random_grid,
                           n_iter=20,
                           cv=2,
                           scoring=make_scorer(f1_score))
rs_cv.fit(X_train, y_train)

# Score the selected model on the validation split
best_clf = rs_cv.best_estimator_
y_pred = best_clf.predict(X_val)
for report_line, metric in (("Accuracy is {0:.3f}", accuracy_score), ("F1 is {0:.3f}", f1_score)):
    print(report_line.format(metric(y_val, y_pred)))
print ("Confusion matrix")
print (confusion_matrix(y_val, y_pred))

Accuracy is 0.603
F1 is 0.266
Confusion matrix
[[10699  7154]
 [  847  1453]]


### Model 3:
Model Name: Random Forests<br>
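Evaluation method and metric used Name: Train-validation split; hyperparameters selected by a 2-fold cross-validated randomized search scored on F1; accuracy and F1 reported on the validation set<br>
Name of the Hyperparameter used: max_depth, n_estimators, min_samples_split, criterion<br>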

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Base estimator; the searched parameters below override its initial values.
clf = RandomForestClassifier(n_estimators=100,max_depth=6, criterion = "gini", min_samples_split=6)

# Search space for the randomized search
n_estimators = range(200,1000,200)
max_depth = range(1,20,1)
min_samples_split = range(2,10,2)
criterion = ['gini','entropy']
random_grid = dict(n_estimators=n_estimators,
                   max_depth=max_depth,
                   min_samples_split=min_samples_split,
                   criterion=criterion)

# 20 random candidates, 2-fold CV on the balanced training split, selected by F1
rs_cv = RandomizedSearchCV(estimator=clf,
                           param_distributions=random_grid,
                           n_iter=20,
                           cv=2,
                           scoring=make_scorer(f1_score))
rs_cv.fit(X_train, y_train)

# Score the selected model on the validation split
best_clf = rs_cv.best_estimator_
y_pred = best_clf.predict(X_val)
for report_line, metric in (("Accuracy is {0:.3f}", accuracy_score), ("F1 is {0:.3f}", f1_score)):
    print(report_line.format(metric(y_val, y_pred)))
print ("Confusion matrix")
print (confusion_matrix(y_val, y_pred))

Accuracy is 0.633
F1 is 0.275
Confusion matrix
[[11360  6493]
 [  899  1401]]


# PART III: Best Hypothesis:
Model Name:Random Forests<br>
Reason:Has the highest f1 score<br>
Hyper-parameter Value:## CLF HERE ##<br>

In [48]:
## Refit the selected classifier on the whole subsampled dataset.
## NOTE: `best_clf` is whatever the most recently executed model cell left behind.
best_clf.fit(X_all,y_all)

## Predict the test set and write the submission file.
## id_num was dropped during preprocessing, so it is read again from the raw CSV.
test_ids = pd.read_csv("./leaderboard_test.csv")['id_num']
test_predictions = best_clf.predict(X_test)
test_result = pd.DataFrame({'id_num': test_ids,
                            'quidditch_league_player': test_predictions})
# Map the 0/1 predictions back to the original NO/YES labels
test_result['quidditch_league_player'] = test_result['quidditch_league_player'].replace({1:'YES',0:'NO'})
test_result.to_csv('test_outputs.csv', sep=',',index=False)