In [93]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2
import matplotlib.pyplot as plt
%matplotlib inline

In [34]:
# Compare normalized, standardized 

train_data = pd.read_csv('matchestrain.csv')
train_data.drop(columns=['Unnamed: 0', 'id', 'date', 'umpire1','umpire2','umpire3'], axis = 1, inplace=True)
train_data.dropna(subset=['city'],inplace=True)
train_data = train_data.replace(['Chennai Super Kings','Kolkata Knight Riders', 'Delhi Capitals', 'Kings XI Punjab',
                                     'Mumbai Indians', 'Sunrisers Hyderabad', 'Rajasthan Royals', 'Royal Challengers Bangalore',
                                    'Pune Warriors','Gujarat Lions', 'Rising Pune Supergiants', 'Deccan Chargers',
                                     'Kochi Tuskers Kerala', np.NaN],
                                    [0,1,2,3,4,5,6,7,8,9,10,11,12,-1], )
train_target = train_data.pop('winner')

city = ['Hyderabad', 'Pune', 'Rajkot', 'Indore', 'Mumbai', 'Kolkata','Bangalore', 'Delhi', 'Chandigarh', 'Kanpur','Jaipur',
        'Chennai','Cape Town','Port Elizabeth','Durban','Centurion','East London','Johannesburg','Kimberley','Bloemfontein',
       'Ahmedabad', 'Cuttack', 'Nagpur', 'Dharamsala', 'Kochi','Visakhapatnam', 'Raipur', 'Ranchi', 'Abu Dhabi', 'Sharjah',
       'Mohali', 'Bengaluru']
train_data = train_data.replace( city, np.arange(len(city)))
train_data = train_data.replace( ['field', 'bat'],np.arange(2))
train_data = train_data.replace( ['normal', 'tie', 'no result'],np.arange(3))
player_of_match = ['Yuvraj Singh', 'SPD Smith', 'CA Lynn', 'GJ Maxwell', 'KM Jadhav','Rashid Khan', 'N Rana', 'AR Patel',
        'SV Samson', 'JJ Bumrah', 'SP Narine', 'KA Pollard','AJ Tye','RV Uthappa','CJ Anderson','BA Stokes', 'NM Coulter-Nile',
        'B Kumar', 'CH Gayle','KS Williamson', 'JC Buttler', 'SK Raina', 'MJ McClenaghan','MS Dhoni', 'HM Amla', 'G Gambhir',
        'LH Ferguson', 'KH Pandya','Sandeep Sharma', 'DA Warner', 'RG Sharma', 'Mohammed Shami','RA Tripathi', 'RR Pant',
        'JD Unadkat', 'LMP Simmons', 'DR Smith','S Dhawan', 'MM Sharma', 'SS Iyer', 'WP Saha', 'KK Nair','Mohammed Siraj',
        'AT Rayudu', 'HV Patel', 'Washington Sundar','KV Sharma', 'BB McCullum', 'MEK Hussey', 'MF Maharoof','MV Boucher', 
        'DJ Hussey', 'SR Watson', 'V Sehwag', 'ML Hayden','YK Pathan', 'KC Sangakkara', 'JDP Oram', 'AC Gilchrist',
        'SM Katich', 'ST Jayasuriya', 'GD McGrath', 'SE Marsh','SA Asnodkar', 'R Vinay Kumar', 'IK Pathan', 'SM Pollock',
        'Sohail Tanvir', 'S Sreesanth', 'A Nehra', 'SC Ganguly','CRD Fernando', 'L Balaji', 'Shoaib Akhtar', 'A Mishra',
        'DPMD Jayawardene', 'GC Smith', 'DJ Bravo', 'M Ntini', 'SP Goswami', 'A Kumble', 'KD Karthik', 'JA Morkel', 'P Kumar',
        'Umar Gul', 'SR Tendulkar', 'R Dravid', 'DL Vettori', 'RP Singh', 'M Muralitharan', 'AB de Villiers', 'RS Bopara',
        'PP Ojha','TM Dilshan', 'HH Gibbs', 'DP Nannes', 'JP Duminy', 'SB Jakati', 'JH Kallis','A Singh', 'S Badrinath','LRPL Taylor',
        'Harbhajan Singh', 'R Bhatia', 'SK Warne', 'B Lee', 'BJ Hodge','LR Shukla', 'MK Pandey', 'AD Mathews', 'MK Tiwary', 
        'WPUJC Vaas','A Symonds', 'AA Jhunjhunwala', 'J Theron', 'AC Voges', 'NV Ojha','SL Malinga', 'M Vijay', 'KP Pietersen',
        'PD Collingwood','MJ Lumb', 'TL Suman', 'RJ Harris', 'PP Chawla', 'Harmeet Singh','R Ashwin', 'R McLaren', 'M Kartik',
        'DE Bollinger', 'S Anirudha','SK Trivedi', 'SB Wagh', 'PC Valthaty', 'MD Mishra', 'DW Steyn','S Sohal', 'MM Patel', 
        'V Kohli', 'I Sharma', 'J Botha','Iqbal Abdulla', 'P Parameswaran', 'R Sharma', 'MR Marsh','BA Bhatt', 'S Aravind', -1,
        'JEC Franklin', 'RE Levi','AM Rahane', 'RA Jadeja', 'MN Samuels', 'M Morkel', 'F du Plessis','AD Mascarenhas', 
        'Shakib Al Hasan', 'JD Ryder', 'S Nadeem','KMDN Kulasekara', 'CL White', 'Mandeep Singh', 'P Negi','Azhar Mahmood', 
        'BW Hilfenhaus', 'A Chandila', 'UT Yadav','MS Bisla', 'M Vohra', 'GH Vihari', 'AJ Finch', 'JP Faulkner',
        'MS Gony', 'DA Miller', 'DJG Sammy', 'MG Johnson', 'KK Cooper','PA Patel', 'AP Tare', 'LJ Wright', 'YS Chahal', 
        'PV Tambe','DJ Hooda', 'GJ Bailey', 'AD Russell', 'MA Agarwal', 'MA Starc','VR Aaron', 'TA Boult', 'EJG Morgan',
        'HH Pandya', 'MC Henriques','Z Khan', 'Q de Kock', 'Mustafizur Rahman', 'SA Yadav', 'AB Dinda','CH Morris', 
        'CR Brathwaite', 'MP Stoinis', 'A Zampa','BCJ Cutting', 'KL Rahul', 'SW Billings', 'JJ Roy', 'B Stanlake','J Archer', 
        'AS Rajpoot', 'TG Southee', 'AS Yadav', 'M Ur Rahman','Ishan Kishan', 'Kuldeep Yadav', 'S Gopal', 'L Ngidi']
train_data = train_data.replace( player_of_match,np.arange(len(player_of_match)))

venue =['Rajiv Gandhi Intl. Cricket Stadium','Maharashtra Cricket Association Stadium','Saurashtra Cricket Association Stadium',
        'Holkar Cricket Stadium','M. Chinnaswamy Stadium', 'Wankhede Stadium', 'Eden Gardens','Feroz Shah Kotla Ground',
       'Punjab Cricket Association IS Bindra Stadium, Mohali','Green Park', 'Sawai Mansingh Stadium',
       'MA Chidambaram Stadium, Chepauk', 'Dr DY Patil Sports Academy','Newlands', "St George's Park", 'Kingsmead', 
        'SuperSport Park', 'Buffalo Park', 'New Wanderers Stadium', 'De Beers Diamond Oval','OUTsurance Oval', 
        'Brabourne Stadium','Sardar Patel Stadium, Motera', 'Barabati Stadium','Vidarbha Cricket Association Stadium, Jamtha',
       'Himachal Pradesh Cricket Association Stadium', 'Nehru Stadium','ACA-VDCA Stadium',
        'Shaheed Veer Narayan Singh International Stadium','JSCA International Stadium Complex', 'Sheikh Zayed Stadium',
       'Sharjah Cricket Stadium']
train_data = train_data.replace( venue,np.arange(len(venue)))

X_train, X_test, Y_train, Y_test = train_test_split( train_data, train_target, stratify = train_target,
                                                    random_state = 30, test_size = 0.3)

print('SVC')
svm_classifier = SVC()
svm_classifier = svm_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', svm_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', svm_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, svm_classifier.predict(X_test), average='macro'))
print()

print('Gaussian')
nb_classifier = GaussianNB()
nb_classifier = nb_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', nb_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', nb_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, nb_classifier.predict(X_test), average='macro'))
print()

print('knn')
knn = KNeighborsClassifier()
knn = knn.fit(X_train, Y_train)
print('Accuracy of Train Data :', knn.score(X_train,Y_train))
print('Accuracy of Test Data :', knn.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, knn.predict(X_test), average='macro'))
print()

print('Decision Tree')
dt = DecisionTreeClassifier()
dt = dt.fit(X_train,Y_train)
print('Accuracy of Train Data :', dt.score(X_train,Y_train))
print('Accuracy of Test Data :', dt.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, dt.predict(X_test), average='macro'))
print()

print('Random Forest')
rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', rf_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', rf_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, rf_classifier.predict(X_test), average='macro'))

print()
print('\tNormalized')

normalizer = preprocessing.Normalizer()
normalizer = normalizer.fit(train_data)
train_data = normalizer.transform(train_data)

X_train, X_test, Y_train, Y_test = train_test_split( train_data, train_target, stratify = train_target,
                                                    random_state = 30, test_size = 0.3)

print('SVC')
svm_classifier = SVC()
svm_classifier = svm_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', svm_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', svm_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, svm_classifier.predict(X_test), average='macro'))
print()

print('Gaussian')
nb_classifier = GaussianNB()
nb_classifier = nb_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', nb_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', nb_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, nb_classifier.predict(X_test), average='macro'))
print()

print('knn')
knn = KNeighborsClassifier()
knn = knn.fit(X_train, Y_train)
print('Accuracy of Train Data :', knn.score(X_train,Y_train))
print('Accuracy of Test Data :', knn.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, knn.predict(X_test), average='macro'))
print()

print('Decision Tree')
dt = DecisionTreeClassifier()
dt = dt.fit(X_train,Y_train)
print('Accuracy of Train Data :', dt.score(X_train,Y_train))
print('Accuracy of Test Data :', dt.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, dt.predict(X_test), average='macro'))
print()

print('Random Forest')
rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', rf_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', rf_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, rf_classifier.predict(X_test), average='macro'))

print()
print('\tStandardized')
standardizer = preprocessing.StandardScaler()
standardizer = standardizer.fit(train_data)
train_data = standardizer.transform(train_data)

X_train, X_test, Y_train, Y_test = train_test_split( train_data, train_target, stratify = train_target,
                                                    random_state = 30, test_size = 0.3)

print('SVC')
svm_classifier = SVC()
svm_classifier = svm_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', svm_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', svm_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, svm_classifier.predict(X_test), average='macro'))
print()

print('Gaussian')
nb_classifier = GaussianNB()
nb_classifier = nb_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', nb_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', nb_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, nb_classifier.predict(X_test), average='macro'))
print()

print('knn')
knn = KNeighborsClassifier()
knn = knn.fit(X_train, Y_train)
print('Accuracy of Train Data :', knn.score(X_train,Y_train))
print('Accuracy of Test Data :', knn.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, knn.predict(X_test), average='macro'))
print()

print('Decision Tree')
dt = DecisionTreeClassifier()
dt = dt.fit(X_train,Y_train)
print('Accuracy of Train Data :', dt.score(X_train,Y_train))
print('Accuracy of Test Data :', dt.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, dt.predict(X_test), average='macro'))
print()

print('Random Forest')
rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', rf_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', rf_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, rf_classifier.predict(X_test), average='macro'))

SVC
Accuracy of Train Data : 0.14315352697095435
Accuracy of Test Data : 0.14009661835748793
F1 Score is  0.0175544794188862

Gaussian
Accuracy of Train Data : 0.1970954356846473
Accuracy of Test Data : 0.21256038647342995
F1 Score is  0.15728493107994748

knn
Accuracy of Train Data : 0.4896265560165975
Accuracy of Test Data : 0.23671497584541062
F1 Score is  0.19202678148991917

Decision Tree
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.8454106280193237
F1 Score is  0.6979945231906965

Random Forest
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.8019323671497585
F1 Score is  0.6510102664990646

	Normalized
SVC
Accuracy of Train Data : 0.14315352697095435
Accuracy of Test Data : 0.14009661835748793
F1 Score is  0.0175544794188862

Gaussian
Accuracy of Train Data : 0.15560165975103735
Accuracy of Test Data : 0.15942028985507245
F1 Score is  0.09706860754383471

knn
Accuracy of Train Data : 0.475103734439834
Accuracy of Test Data : 0.2463768115942029
F1 Score is  0.2006

In [35]:
# Normal Vs Standardized

train_data = pd.read_csv('matchestrain.csv')
to_drop = ['Unnamed: 0', 'id', 'date', 'umpire1','umpire2','umpire3','player_of_match']
train_data.drop(columns= to_drop, axis = 1, inplace=True)
train_data.dropna(subset=['city'],inplace=True)
train_data = train_data.replace(['Chennai Super Kings','Kolkata Knight Riders', 'Delhi Capitals', 'Kings XI Punjab',
                                     'Mumbai Indians', 'Sunrisers Hyderabad', 'Rajasthan Royals', 'Royal Challengers Bangalore',
                                    'Pune Warriors','Gujarat Lions', 'Rising Pune Supergiants', 'Deccan Chargers',
                                     'Kochi Tuskers Kerala', np.NaN],
                                    [0,1,2,3,4,5,6,7,8,9,10,11,12,-1], )
train_target = train_data.pop('winner')

city = ['Hyderabad', 'Pune', 'Rajkot', 'Indore', 'Mumbai', 'Kolkata','Bangalore', 'Delhi', 'Chandigarh', 'Kanpur','Jaipur',
        'Chennai','Cape Town','Port Elizabeth','Durban','Centurion','East London','Johannesburg','Kimberley','Bloemfontein',
       'Ahmedabad', 'Cuttack', 'Nagpur', 'Dharamsala', 'Kochi','Visakhapatnam', 'Raipur', 'Ranchi', 'Abu Dhabi', 'Sharjah',
       'Mohali', 'Bengaluru']
train_data = train_data.replace( city, np.arange(len(city)))
train_data = train_data.replace( ['field', 'bat'],np.arange(2))
train_data = train_data.replace( ['normal', 'tie', 'no result'],np.arange(3))
player_of_match = ['Yuvraj Singh', 'SPD Smith', 'CA Lynn', 'GJ Maxwell', 'KM Jadhav','Rashid Khan', 'N Rana', 'AR Patel',
        'SV Samson', 'JJ Bumrah', 'SP Narine', 'KA Pollard','AJ Tye','RV Uthappa','CJ Anderson','BA Stokes', 'NM Coulter-Nile',
        'B Kumar', 'CH Gayle','KS Williamson', 'JC Buttler', 'SK Raina', 'MJ McClenaghan','MS Dhoni', 'HM Amla', 'G Gambhir',
        'LH Ferguson', 'KH Pandya','Sandeep Sharma', 'DA Warner', 'RG Sharma', 'Mohammed Shami','RA Tripathi', 'RR Pant',
        'JD Unadkat', 'LMP Simmons', 'DR Smith','S Dhawan', 'MM Sharma', 'SS Iyer', 'WP Saha', 'KK Nair','Mohammed Siraj',
        'AT Rayudu', 'HV Patel', 'Washington Sundar','KV Sharma', 'BB McCullum', 'MEK Hussey', 'MF Maharoof','MV Boucher', 
        'DJ Hussey', 'SR Watson', 'V Sehwag', 'ML Hayden','YK Pathan', 'KC Sangakkara', 'JDP Oram', 'AC Gilchrist',
        'SM Katich', 'ST Jayasuriya', 'GD McGrath', 'SE Marsh','SA Asnodkar', 'R Vinay Kumar', 'IK Pathan', 'SM Pollock',
        'Sohail Tanvir', 'S Sreesanth', 'A Nehra', 'SC Ganguly','CRD Fernando', 'L Balaji', 'Shoaib Akhtar', 'A Mishra',
        'DPMD Jayawardene', 'GC Smith', 'DJ Bravo', 'M Ntini', 'SP Goswami', 'A Kumble', 'KD Karthik', 'JA Morkel', 'P Kumar',
        'Umar Gul', 'SR Tendulkar', 'R Dravid', 'DL Vettori', 'RP Singh', 'M Muralitharan', 'AB de Villiers', 'RS Bopara',
        'PP Ojha','TM Dilshan', 'HH Gibbs', 'DP Nannes', 'JP Duminy', 'SB Jakati', 'JH Kallis','A Singh', 'S Badrinath','LRPL Taylor',
        'Harbhajan Singh', 'R Bhatia', 'SK Warne', 'B Lee', 'BJ Hodge','LR Shukla', 'MK Pandey', 'AD Mathews', 'MK Tiwary', 
        'WPUJC Vaas','A Symonds', 'AA Jhunjhunwala', 'J Theron', 'AC Voges', 'NV Ojha','SL Malinga', 'M Vijay', 'KP Pietersen',
        'PD Collingwood','MJ Lumb', 'TL Suman', 'RJ Harris', 'PP Chawla', 'Harmeet Singh','R Ashwin', 'R McLaren', 'M Kartik',
        'DE Bollinger', 'S Anirudha','SK Trivedi', 'SB Wagh', 'PC Valthaty', 'MD Mishra', 'DW Steyn','S Sohal', 'MM Patel', 
        'V Kohli', 'I Sharma', 'J Botha','Iqbal Abdulla', 'P Parameswaran', 'R Sharma', 'MR Marsh','BA Bhatt', 'S Aravind', -1,
        'JEC Franklin', 'RE Levi','AM Rahane', 'RA Jadeja', 'MN Samuels', 'M Morkel', 'F du Plessis','AD Mascarenhas', 
        'Shakib Al Hasan', 'JD Ryder', 'S Nadeem','KMDN Kulasekara', 'CL White', 'Mandeep Singh', 'P Negi','Azhar Mahmood', 
        'BW Hilfenhaus', 'A Chandila', 'UT Yadav','MS Bisla', 'M Vohra', 'GH Vihari', 'AJ Finch', 'JP Faulkner',
        'MS Gony', 'DA Miller', 'DJG Sammy', 'MG Johnson', 'KK Cooper','PA Patel', 'AP Tare', 'LJ Wright', 'YS Chahal', 
        'PV Tambe','DJ Hooda', 'GJ Bailey', 'AD Russell', 'MA Agarwal', 'MA Starc','VR Aaron', 'TA Boult', 'EJG Morgan',
        'HH Pandya', 'MC Henriques','Z Khan', 'Q de Kock', 'Mustafizur Rahman', 'SA Yadav', 'AB Dinda','CH Morris', 
        'CR Brathwaite', 'MP Stoinis', 'A Zampa','BCJ Cutting', 'KL Rahul', 'SW Billings', 'JJ Roy', 'B Stanlake','J Archer', 
        'AS Rajpoot', 'TG Southee', 'AS Yadav', 'M Ur Rahman','Ishan Kishan', 'Kuldeep Yadav', 'S Gopal', 'L Ngidi']
train_data = train_data.replace( player_of_match,np.arange(len(player_of_match)))

venue =['Rajiv Gandhi Intl. Cricket Stadium','Maharashtra Cricket Association Stadium','Saurashtra Cricket Association Stadium',
        'Holkar Cricket Stadium','M. Chinnaswamy Stadium', 'Wankhede Stadium', 'Eden Gardens','Feroz Shah Kotla Ground',
       'Punjab Cricket Association IS Bindra Stadium, Mohali','Green Park', 'Sawai Mansingh Stadium',
       'MA Chidambaram Stadium, Chepauk', 'Dr DY Patil Sports Academy','Newlands', "St George's Park", 'Kingsmead', 
        'SuperSport Park', 'Buffalo Park', 'New Wanderers Stadium', 'De Beers Diamond Oval','OUTsurance Oval', 
        'Brabourne Stadium','Sardar Patel Stadium, Motera', 'Barabati Stadium','Vidarbha Cricket Association Stadium, Jamtha',
       'Himachal Pradesh Cricket Association Stadium', 'Nehru Stadium','ACA-VDCA Stadium',
        'Shaheed Veer Narayan Singh International Stadium','JSCA International Stadium Complex', 'Sheikh Zayed Stadium',
       'Sharjah Cricket Stadium']
train_data = train_data.replace( venue,np.arange(len(venue)))

X_train, X_test, Y_train, Y_test = train_test_split( train_data, train_target, stratify = train_target,
                                                    random_state = 30, test_size = 0.3)

print('Decision Tree')
dt = DecisionTreeClassifier()
dt = dt.fit(X_train,Y_train)
print('Accuracy of Train Data :', dt.score(X_train,Y_train))
print('Accuracy of Test Data :', dt.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, dt.predict(X_test), average='macro'))
print()

print('Random Forest')
rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', rf_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', rf_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, rf_classifier.predict(X_test), average='macro'))

print()
print('\n\t\tScaled')

standardizer = preprocessing.StandardScaler()
standardizer = standardizer.fit(train_data)
train_data = standardizer.transform(train_data)

X_train, X_test, Y_train, Y_test = train_test_split( train_data, train_target, stratify = train_target,
                                                    random_state = 30, test_size = 0.3)

print('Decision Tree')
dt = DecisionTreeClassifier()
dt = dt.fit(X_train,Y_train)
print('Accuracy of Train Data :', dt.score(X_train,Y_train))
print('Accuracy of Test Data :', dt.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, dt.predict(X_test), average='macro'))
print()

print('Random Forest')
rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', rf_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', rf_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, rf_classifier.predict(X_test), average='macro'))

Decision Tree
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.8260869565217391
F1 Score is  0.7374575275300665

Random Forest
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.7681159420289855
F1 Score is  0.6077491298744382


		Scaled
Decision Tree
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.8309178743961353
F1 Score is  0.7222406025838601

Random Forest
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.7584541062801933
F1 Score is  0.5975711527631492


In [144]:
# Variation of of Normal Vs Standard

train_data = pd.read_csv('matchestrain.csv')
to_drop = ['Unnamed: 0', 'id', 'date', 'umpire1','umpire2','umpire3','player_of_match']
train_data.drop(columns= to_drop, axis = 1, inplace=True)
train_data.dropna(subset=['city'],inplace=True)
train_data = train_data.replace(['Chennai Super Kings','Kolkata Knight Riders', 'Delhi Capitals', 'Kings XI Punjab',
                                     'Mumbai Indians', 'Sunrisers Hyderabad', 'Rajasthan Royals', 'Royal Challengers Bangalore',
                                    'Pune Warriors','Gujarat Lions', 'Rising Pune Supergiants', 'Deccan Chargers',
                                     'Kochi Tuskers Kerala', np.NaN],
                                    [0,1,2,3,4,5,6,7,8,9,10,11,12,-1], )
train_target = train_data.pop('winner')

city = ['Hyderabad', 'Pune', 'Rajkot', 'Indore', 'Mumbai', 'Kolkata','Bangalore', 'Delhi', 'Chandigarh', 'Kanpur','Jaipur',
        'Chennai','Cape Town','Port Elizabeth','Durban','Centurion','East London','Johannesburg','Kimberley','Bloemfontein',
       'Ahmedabad', 'Cuttack', 'Nagpur', 'Dharamsala', 'Kochi','Visakhapatnam', 'Raipur', 'Ranchi', 'Abu Dhabi', 'Sharjah',
       'Mohali', 'Bengaluru']
train_data = train_data.replace( city, np.arange(len(city)))
train_data = train_data.replace( ['field', 'bat'],np.arange(2))
train_data = train_data.replace( ['normal', 'tie', 'no result'],np.arange(3))
train_data = train_data.replace( player_of_match,np.arange(len(player_of_match)))
venue =['Rajiv Gandhi Intl. Cricket Stadium','Maharashtra Cricket Association Stadium','Saurashtra Cricket Association Stadium',
        'Holkar Cricket Stadium','M. Chinnaswamy Stadium', 'Wankhede Stadium', 'Eden Gardens','Feroz Shah Kotla Ground',
       'Punjab Cricket Association IS Bindra Stadium, Mohali','Green Park', 'Sawai Mansingh Stadium',
       'MA Chidambaram Stadium, Chepauk', 'Dr DY Patil Sports Academy','Newlands', "St George's Park", 'Kingsmead', 
        'SuperSport Park', 'Buffalo Park', 'New Wanderers Stadium', 'De Beers Diamond Oval','OUTsurance Oval', 
        'Brabourne Stadium','Sardar Patel Stadium, Motera', 'Barabati Stadium','Vidarbha Cricket Association Stadium, Jamtha',
       'Himachal Pradesh Cricket Association Stadium', 'Nehru Stadium','ACA-VDCA Stadium',
        'Shaheed Veer Narayan Singh International Stadium','JSCA International Stadium Complex', 'Sheikh Zayed Stadium',
       'Sharjah Cricket Stadium']
train_data = train_data.replace( venue,np.arange(len(venue)))

X_train, X_test, Y_train, Y_test = train_test_split( train_data, train_target, stratify = train_target,
                                                    random_state = 30, test_size = 0.3)

print('Decision Tree')
dt = DecisionTreeClassifier()
dt = dt.fit(X_train,Y_train)
print('Accuracy of Train Data :', dt.score(X_train,Y_train))
print('Accuracy of Test Data :', dt.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, dt.predict(X_test), average='macro'))
print()

print('Random Forest')
rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', rf_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', rf_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, rf_classifier.predict(X_test), average='macro'))

print()
print('\n\t\tScaled')

standardizer = preprocessing.StandardScaler()
standardizer = standardizer.fit(train_data)
train_data = standardizer.transform(train_data)

X_train, X_test, Y_train, Y_test = train_test_split( train_data, train_target, stratify = train_target,
                                                    random_state = 30)

print('Decision Tree')
dt_scaled = DecisionTreeClassifier()
dt_scaled = dt_scaled.fit(X_train,Y_train)
print('Accuracy of Train Data :', dt_scaled.score(X_train,Y_train))
print('Accuracy of Test Data :', dt_scaled.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, dt_scaled.predict(X_test), average='macro'))
print()

print('Random Forest')
rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', rf_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', rf_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, rf_classifier.predict(X_test), average='macro'))

Decision Tree
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.8164251207729468
F1 Score is  0.7221152791286701

Random Forest
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.7729468599033816
F1 Score is  0.6255300754711282


		Scaled
Decision Tree
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.8265895953757225
F1 Score is  0.7110453401961384

Random Forest
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.7630057803468208
F1 Score is  0.6086035991797197


In [158]:
# Categoricals
train_data = pd.read_csv('matchestrain.csv')
to_drop = ['Unnamed: 0', 'id', 'date', 'umpire1','umpire2','umpire3','player_of_match']
# to_drop = ['Unnamed: 0', 'id', 'date', 'dl_applied','umpire1','umpire2','umpire3','player_of_match','result']

train_data.drop(columns= to_drop, axis = 1, inplace=True)
train_data.dropna(subset=['city'],inplace=True)

train_target = train_data.pop('winner')
train_target = train_target.replace(['Chennai Super Kings','Kolkata Knight Riders', 'Delhi Capitals', 'Kings XI Punjab',
                                     'Mumbai Indians', 'Sunrisers Hyderabad', 'Rajasthan Royals', 'Royal Challengers Bangalore',
                                    'Pune Warriors','Gujarat Lions', 'Rising Pune Supergiants', 'Deccan Chargers',
                                     'Kochi Tuskers Kerala', np.NaN],
                                    [0,1,2,3,4,5,6,7,8,9,10,11,12,-1], )

train_data_dummies = pd.get_dummies(train_data)

X_train, X_test, Y_train, Y_test = train_test_split( train_data_dummies, train_target, #stratify = train_target,
                                                    random_state = 30)


# fs = SelectKBest(score_func=chi2, k='all')
# fs.fit(X_train, Y_train)
# X_train_fs = fs.transform(X_train)
# X_test_fs = fs.transform(X_test)
# for i in range(len(fs.scores_)):
#     if fs.scores_[i] > 50:
#         print(train_data_dummies.columns[i])
# plot the scores
# plt.bar([i for i in range(len(fs.scores_))], fs.scores_)

print('Decision Tree')
dt = DecisionTreeClassifier()
dt = dt.fit(X_train,Y_train)
print('Accuracy of Train Data :', dt.score(X_train,Y_train))
print('Accuracy of Test Data :', dt.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, dt.predict(X_test), average='macro'))
print()

print('Random Forest')
rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', rf_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', rf_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, rf_classifier.predict(X_test), average='macro'))

Decision Tree
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.7861271676300579
F1 Score is  0.6543545276068073

Random Forest
Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.8439306358381503
F1 Score is  0.7164761954741714


In [229]:
# Final Answer
train_data = pd.read_csv('matchestrain.csv')
to_drop = ['Unnamed: 0', 'id', 'season', 'date', 'dl_applied','umpire1','umpire2','umpire3','player_of_match','result', 'venue']
train_data.drop(columns= to_drop, axis = 1, inplace=True)
train_data.dropna(subset=['city'],inplace=True)
train_target = train_data.pop('winner')
train_target = train_target.replace(['Chennai Super Kings','Kolkata Knight Riders', 'Delhi Capitals', 'Kings XI Punjab',
                                     'Mumbai Indians', 'Sunrisers Hyderabad', 'Rajasthan Royals', 'Royal Challengers Bangalore',
                                    'Pune Warriors','Gujarat Lions', 'Rising Pune Supergiants', 'Deccan Chargers',
                                     'Kochi Tuskers Kerala', np.NaN],
                                    [0,1,2,3,4,5,6,7,8,9,10,11,12,-1] )
train_data = train_data.replace('Bengaluru','Bangalore')


test_data = pd.read_csv('matchestest.csv')
to_drop = ['id', 'season', 'date', 'dl_applied','umpire1','umpire2','umpire3','player_of_match','result', 'venue']
test_data.drop(columns= to_drop, axis = 1, inplace=True)
test_data.dropna(subset=['city'],inplace=True)
test_data = test_data.replace('Bengaluru','Bangalore')


train_data = pd.concat([train_data, test_data])

minmax = preprocessing.MinMaxScaler()
minmax = minmax.fit(train_data[['win_by_runs','win_by_wickets']])
minmax = minmax.transform(train_data[['win_by_runs','win_by_wickets']])
train_data[['win_by_runs','win_by_wickets']] = minmax

train_data_dummies = pd.get_dummies(train_data)

test_data_dummies = train_data_dummies[689:]
train_data_dummies = train_data_dummies[:689]

X_train, X_test, Y_train, Y_test = train_test_split( train_data_dummies, train_target, stratify = train_target,
                                                    random_state = 30)

print('Random Forest')
rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_train,Y_train)
print('Accuracy of Train Data :', rf_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', rf_classifier.score(X_test,Y_test))
print('F1 Score is ',f1_score(Y_test, rf_classifier.predict(X_test), average='macro'))

Random Forest
Accuracy of Train Data : 0.998062015503876
Accuracy of Test Data : 0.9132947976878613
F1 Score is  0.7973974043722348


In [230]:
ans = rf_classifier.predict(test_data_dummies)
print('id, winner')
for i in range(len(ans)):
    print('{}, {}'.format(i, ans[i]))

id, winner
0, 0
1, 1
2, 2
3, 3
4, 0
5, 1
6, 4
7, 5
8, 3
9, 2
10, 5
11, 0
12, 3
13, 6
14, 4
15, 5
16, 1
17, 0
18, 5
19, 2
20, 1
21, 3
22, 0
23, 4
24, 0
25, 2
26, 6
27, 7
28, 0
29, 5
30, 4
31, 3
32, 5
33, 4
34, 1
35, 6
36, 2
37, 5
38, 7
39, 2
40, 0
41, 7
42, 6
43, 4
44, 6
45, 2
46, 1
47, 5
48, 6
49, 0
50, 4
51, 1
52, 2
53, 7
54, 3
55, 4
56, 4
57, 2
58, 0
59, 4


In [225]:
# fs = SelectKBest(score_func=chi2, k='all')
# fs.fit(X_train, Y_train)
# X_train_fs = fs.transform(X_train)
# X_test_fs = fs.transform(X_test)
# for i in range(len(fs.scores_)):
#     if fs.scores_[i] > 50:
#         print(train_data_dummies.columns[i])

train_data_dummies
# test_data
# plot the scores
# plt.bar([i for i in range(len(fs.scores_))], fs.scores_)

Unnamed: 0,win_by_runs,win_by_wickets,city_Abu Dhabi,city_Ahmedabad,city_Bangalore,city_Bloemfontein,city_Cape Town,city_Centurion,city_Chandigarh,city_Chennai,...,toss_winner_Kochi Tuskers Kerala,toss_winner_Kolkata Knight Riders,toss_winner_Mumbai Indians,toss_winner_Pune Warriors,toss_winner_Rajasthan Royals,toss_winner_Rising Pune Supergiants,toss_winner_Royal Challengers Bangalore,toss_winner_Sunrisers Hyderabad,toss_decision_bat,toss_decision_field
0,0.239726,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0.000000,0.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0.000000,1.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,0.000000,0.6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0.102740,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691,0.000000,0.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
692,0.000000,0.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
693,0.171233,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
694,0.095890,0.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [168]:
def twins(a, b):
    l = len(a)
    c = []
    for i in range(l):
        if (sorted(a[i][1::2]) == sorted(b[i][1::2])) and (sorted(a[i][::2]) == sorted(b[i][::2])):
            c.append('Yes')
        else:
            c.append('No')
    return c
twins(['abcde'],['cdabe'])

['Yes']

In [None]:
max_depth = 2
n = 50
highest = 0

rf_classifier = RandomForestClassifier(max_depth= 2 , n_estimators= 50)
rf_classifier = rf_classifier.fit(X_train,Y_train)

highest = rf_classifier.score(X_test,Y_test)
for i in range(2,20):
    for j in [50,100,200]:
        rf_classifier = RandomForestClassifier(max_depth= i , n_estimators = j)
        rf_classifier = rf_classifier.fit(X_train,Y_train)
        accuracy1 = rf_classifier.score(X_test,Y_test)
        if accuracy1 > highest:
            max_depth = i
            n = j
            highest = accuracy1
print('max_depth ',max_depth)
print('n ',n)
print('highest ',highest)

In [None]:
# Answer
id, winner
0, 0
1, 1
2, 2
3, 3
4, 0
5, 1
6, 4
7, 5
8, 3
9, 2
10, 5
11, 0
12, 3
13, 6
14, 4
15, 5
16, 1
17, 0
18, 4
19, 2
20, 1
21, 3
22, 0
23, 4
24, 0
25, 2
26, 6
27, 7
28, 0
29, 2
30, 4
31, 3
32, 5
33, 4
34, 7
35, 6
36, 2
37, 5
38, 7
39, 2
40, 0
41, 7
42, 6
43, 4
44, 6
45, 2
46, 1
47, 5
48, -1
49, 0
50, 4
51, 1
52, 2
53, 7
54, 3
55, 4
56, 4
57, 2
58, 0
59, 4