In [1]:
# importing all the libraries that we need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
world_cup = pd.read_csv('/kaggle/input/iml-project-2-dataset/World Cup 2019 Dataset.csv')
results = pd.read_csv('/kaggle/input/iml-project-2-dataset/results.csv')

In [3]:
world_cup

Unnamed: 0,Team,Group,Previous \r\nappearances,Previous \r\ntitles,Previous\r\n finals,Previous\r\n semifinals,Current rank
0,England,A,11,0,3,5,1
1,South Africa,A,6,0,0,4,3
2,West Indies,A,11,2,3,4,8
3,Pakistan,A,11,1,2,6,6
4,New Zealand,A,11,0,1,7,4
5,Sri Lanka,A,11,1,3,4,9
6,Afghanistan,A,1,0,0,0,10
7,Australia,A,11,5,6,7,5
8,Bangladesh,A,5,0,0,0,7
9,India,A,11,2,3,6,2


In [4]:
results

Unnamed: 0,date,Team_1,Team_2,Winner,Margin,Ground
0,4-Jan-10,Bangladesh,Sri Lanka,Sri Lanka,7 wickets,Dhaka
1,5-Jan-10,India,Sri Lanka,Sri Lanka,5 wickets,Dhaka
2,7-Jan-10,Bangladesh,India,India,6 wickets,Dhaka
3,8-Jan-10,Bangladesh,Sri Lanka,Sri Lanka,9 wickets,Dhaka
4,10-Jan-10,India,Sri Lanka,India,8 wickets,Dhaka
...,...,...,...,...,...,...
991,22-Oct-17,India,New Zealand,New Zealand,6 wickets,Mumbai
992,22-Oct-17,South Africa,Bangladesh,South Africa,200 runs,East London
993,23-Oct-17,Pakistan,Sri Lanka,Pakistan,9 wickets,Sharjah
994,25-Oct-17,India,New Zealand,India,6 wickets,Pune


In [5]:
#  Data cleaning and formatting
# display the details of matches played by India.
df = results[(results['Team_1'] == 'India') | (results['Team_2'] == 'India')]
india = df.iloc[:]
india.head()

Unnamed: 0,date,Team_1,Team_2,Winner,Margin,Ground
1,5-Jan-10,India,Sri Lanka,Sri Lanka,5 wickets,Dhaka
2,7-Jan-10,Bangladesh,India,India,6 wickets,Dhaka
4,10-Jan-10,India,Sri Lanka,India,8 wickets,Dhaka
5,11-Jan-10,Bangladesh,India,India,6 wickets,Dhaka
6,13-Jan-10,India,Sri Lanka,Sri Lanka,4 wickets,Dhaka


In [6]:
#  creating a column to display the details of matches played in 2010 and taking it as a reference for future work
year = []
for row in india['date']:
    year.append(int(row[7:]))
india ['match_year']= year
india_2010 = india[india.match_year >= 10]
india_2010.count()

date          151
Team_1        151
Team_2        151
Winner        151
Margin        142
Ground        151
match_year    151
dtype: int64

In [7]:
# Exploratory data analysis
# merged the details of the teams participating this year with their past results.
worldcup_teams = ['England', ' South Africa', '', 'West Indies', 
            'Pakistan', 'New Zealand', 'Sri Lanka', 'Afghanistan', 
            'Australia', 'Bangladesh', 'India']
df_teams_1 = results[results['Team_1'].isin(worldcup_teams)]
df_teams_2 = results[results['Team_2'].isin(worldcup_teams)]
df_teams = pd.concat((df_teams_1, df_teams_2))
df_teams.drop_duplicates()
df_teams.count()

date      1445
Team_1    1445
Team_2    1445
Winner    1445
Margin    1368
Ground    1445
dtype: int64

In [8]:
# deleted the columns like date of the match, margin of victory, and the ground on which the match was played
df_teams_2010 = df_teams.drop(['date','Margin', 'Ground'], axis=1)
df_teams_2010.head()

Unnamed: 0,Team_1,Team_2,Winner
0,Bangladesh,Sri Lanka,Sri Lanka
1,India,Sri Lanka,Sri Lanka
2,Bangladesh,India,India
3,Bangladesh,Sri Lanka,Sri Lanka
4,India,Sri Lanka,India


In [9]:
# created the model. If team-1 won the match, I assigned it label 1, else if team-2 won, I assigned it label 2
df_teams_2010 = df_teams_2010.reset_index(drop=True)
df_teams_2010.loc[df_teams_2010.Winner == df_teams_2010.Team_1,'winning_team']=1
df_teams_2010.loc[df_teams_2010.Winner == df_teams_2010.Team_2, 'winning_team']=2
df_teams_2010 = df_teams_2010.drop(['winning_team'], axis=1)
df_teams_2010.head()

Unnamed: 0,Team_1,Team_2,Winner
0,Bangladesh,Sri Lanka,Sri Lanka
1,India,Sri Lanka,Sri Lanka
2,Bangladesh,India,India
3,Bangladesh,Sri Lanka,Sri Lanka
4,India,Sri Lanka,India


In [10]:
#  separated training and test sets with 70% and 30% in training and validation sets respectively
final = pd.get_dummies(df_teams_2010, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])
X = final.drop(['Winner'], axis=1)
y = final["Winner"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [11]:
# Perform hyperparameter tuning on the best model
rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=0) 
rf.fit(X_train, y_train)
score = rf.score(X_train, y_train)
score2 = rf.score(X_test, y_test)
print("Training set accuracy: ", '%.3f'%(score)) 
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.700
Test set accuracy:  0.675


In [12]:
# Evaluate the best model on the testing set
# added ICC rankings of teams giving priority to higher ranked team to win this year
ranking = pd.read_csv('/kaggle/input/iml-project-2-dataset/icc_rankings.csv') 
fixtures = pd.read_csv('/kaggle/input/iml-project-2-dataset/fixtures.csv')
pred_set = []

In [13]:
# added new columns with ranking position for each team and slicing the dataset for 
# first 45 games since there are 45 league stage games in tota
fixtures.insert(1, 'first_position', fixtures['Team_1'].map(ranking.set_index('Team')['Position']))
fixtures.insert(2, 'second_position', fixtures['Team_2'].map(ranking.set_index('Team')['Position']))

fixtures = fixtures.iloc[:45, :]
fixtures.tail()

Unnamed: 0,Round Number,first_position,second_position,Date,Location,Team_1,Team_2,Group,Result
40,1,1.0,4.0,3/7/2019,"Riverside Ground, Chester-le-Street",England,New Zealand,Group A,
41,1,10.0,8.0,4/7/2019,"Headingley, Leeds",Afghanistan,West Indies,Group A,
42,1,6.0,7.0,5/7/2019,"Lord's, London",Pakistan,Bangladesh,Group A,
43,1,9.0,2.0,6/7/2019,"Headingley, Leeds",Sri Lanka,India,Group A,
44,1,5.0,3.0,6/7/2019,"Emirates Old Trafford, Manchester",Australia,South Africa,Group A,


In [14]:
# added teams to new prediction dataset based on ranking position of each team
for index, row in fixtures.iterrows():
    if row['first_position'] < row['second_position']:
        pred_set.append({'Team_1': row['Team_1'], 'Team_2': row['Team_2'], 'winning_team': None})
    else:
        pred_set.append({'Team_1': row['Team_2'], 'Team_2': row['Team_1'], 'winning_team': None})
        
pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set
pred_set.head()

Unnamed: 0,Team_1,Team_2,winning_team
0,England,South Africa,
1,Pakistan,West Indies,
2,New Zealand,Sri Lanka,
3,Australia,Afghanistan,
4,South Africa,Bangladesh,


In [15]:
# I added scripts for getting dummy variables and added missing columns compared to model training dataset
pred_set = pd.get_dummies(pred_set, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])

missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[final.columns]


pred_set = pred_set.drop(['Winner'], axis=1)
pred_set.head()

Unnamed: 0,Team_1_Afghanistan,Team_1_Australia,Team_1_Bangladesh,Team_1_Canada,Team_1_England,Team_1_India,Team_1_Ireland,Team_1_Kenya,Team_1_Netherlands,Team_1_New Zealand,...,Team_2_Kenya,Team_2_Netherlands,Team_2_New Zealand,Team_2_Pakistan,Team_2_Scotland,Team_2_South Africa,Team_2_Sri Lanka,Team_2_U.A.E.,Team_2_West Indies,Team_2_Zimbabwe
0,0,False,False,0,True,False,0,0,0,False,...,0,0,False,False,0,True,False,0,False,0
1,0,False,False,0,False,False,0,0,0,False,...,0,0,False,False,0,False,False,0,True,0
2,0,False,False,0,False,False,0,0,0,True,...,0,0,False,False,0,False,True,0,False,0
3,0,True,False,0,False,False,0,0,0,False,...,0,0,False,False,0,False,False,0,False,0
4,0,False,False,0,False,False,0,0,0,False,...,0,0,False,False,0,False,False,0,False,0


In [16]:
# Interpret the model results
# getting the results for each and every league stage match
predictions = rf.predict(pred_set)
for i in range(fixtures.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 1:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    
    else:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print("")

South Africa and England
Winner: England

West Indies and Pakistan
Winner: Pakistan

Sri Lanka and New Zealand
Winner: New Zealand

Afghanistan and Australia
Winner: Australia

Bangladesh and South Africa
Winner: South Africa

Pakistan and England
Winner: England

Afghanistan and Sri Lanka
Winner: Sri Lanka

South Africa and India
Winner: India

Bangladesh and New Zealand
Winner: New Zealand

West Indies and Australia
Winner: Australia

Sri Lanka and Pakistan
Winner: Pakistan

Bangladesh and England
Winner: England

Afghanistan and New Zealand
Winner: New Zealand

Australia and India
Winner: India

West Indies and South Africa
Winner: South Africa

Sri Lanka and Bangladesh
Winner: Bangladesh

Pakistan and Australia
Winner: Australia

New Zealand and India
Winner: India

West Indies and England
Winner: England

Sri Lanka and Australia
Winner: Australia

Afghanistan and South Africa
Winner: South Africa

Pakistan and India
Winner: India

West Indies and Bangladesh
Winner: Bangladesh

Afg

In [17]:
#  the four teams to march to the semi finals are New Zealand, India, England and South Africa
semi = [('New Zealand', 'India'),
            ('England', 'South Africa')]

In [18]:
# I created a function to repeat the above work. This is the final function to predict 
# the winner of ICC Cricket World Cup 2019
def clean_and_predict(matches, ranking, final, rf):
    positions = []
    for match in matches:
        positions.append(ranking.loc[ranking['Team'] == match[0], 'Position'].iloc[0])
        positions.append(ranking.loc[ranking['Team'] == match[1], 'Position'].iloc[0])

    pred_set = []

    i = 0
    j = 0

    while i < len(positions):
        dict1 = {}

        if positions[i] < positions[i + 1]:
            dict1.update({'Team_1': matches[j][0], 'Team_2': matches[j][1]})
        else:
            dict1.update({'Team_1': matches[j][1], 'Team_2': matches[j][0]})

        pred_set.append(dict1)
        i += 2
        j += 1

    pred_set = pd.DataFrame(pred_set)
    backup_pred_set = pred_set  
    print(pred_set.columns)

    pred_set = pd.get_dummies(pred_set, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])
    missing_cols2 = set(final.columns) - set(pred_set.columns)
    for c in missing_cols2:
        pred_set[c] = 0
    pred_set = pred_set[final.columns]

    pred_set = pred_set.drop(['Winner'], axis=1)

    predictions = rf.predict(pred_set)  

    for i in range(len(pred_set)):
        print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
        if predictions[i] == 1:
            print("Winner: " + backup_pred_set.iloc[i, 1])
        else:
            print("Winner: " + backup_pred_set.iloc[i, 0])
        print("")

In [19]:
clean_and_predict(semi, ranking, final, rf)

Index(['Team_1', 'Team_2'], dtype='object')
New Zealand and India
Winner: India

South Africa and England
Winner: England



In [20]:
finals = [('India', 'England')]

In [21]:
# Finally on running the main function
clean_and_predict(finals, ranking, final, rf)

Index(['Team_1', 'Team_2'], dtype='object')
India and England
Winner: England

