In [18]:
#import all libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [19]:
#load data 
IPL = pd.read_csv('datasets/IPL 2020 Dataset.csv')
results = pd.read_csv('datasets/results.csv')

IPL.head()

Unnamed: 0,Team,Group,Previous appearances,Previous titles,Previous finals,Previous semifinals,Current rank
0,Mumbai Indians,A,12,4,5,8,1
1,Chennai Super Kings,A,10,3,8,10,2
2,Delhi Capitals,A,12,0,0,4,7
3,Kings XI Punjab,A,12,0,1,2,6
4,Royal Challengers Bangalore,A,12,0,3,5,8


In [20]:
results.head()

Unnamed: 0,date,Team_1,Team_2,winner,Margin,Ground
0,2008-04-19,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings,33 runs,Chandigarh
1,2008-04-23,Chennai Super Kings,Mumbai Indians,Chennai Super Kings,6 runs,Chennai
2,2008-04-26,Kolkata Knight Riders,Chennai Super Kings,Chennai Super Kings,9 wickets,Chennai
3,2008-04-28,Chennai Super Kings,Royal Challengers Bangalore,Chennai Super Kings,13 runs,Bangalore
4,2008-05-08,Delhi Daredevils,Chennai Super Kings,Chennai Super Kings,4 wickets,Delhi


In [21]:
df = results[(results['Team_1'] == 'Chennai Super Kings') | (results['Team_2'] == 'Chennai Super Kings')]
india = df.iloc[:]
india.head()

Unnamed: 0,date,Team_1,Team_2,winner,Margin,Ground
0,2008-04-19,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings,33 runs,Chandigarh
1,2008-04-23,Chennai Super Kings,Mumbai Indians,Chennai Super Kings,6 runs,Chennai
2,2008-04-26,Kolkata Knight Riders,Chennai Super Kings,Chennai Super Kings,9 wickets,Chennai
3,2008-04-28,Chennai Super Kings,Royal Challengers Bangalore,Chennai Super Kings,13 runs,Bangalore
4,2008-05-08,Delhi Daredevils,Chennai Super Kings,Chennai Super Kings,4 wickets,Delhi


In [22]:
india['date']

0      2008-04-19
1      2008-04-23
2      2008-04-26
3      2008-04-28
4      2008-05-08
          ...    
662    2014-05-18
692    2019-04-21
709    2014-05-22
713    2015-05-02
749    2019-04-17
Name: date, Length: 164, dtype: object

In [23]:
# patcipate team in the IPL

IPL_Teams = ['Mumbai Indians', 'Chennai Super Kings', 'Delhi Capitals', 'Kings XI Punjab', 
            'Royal Challengers Bangalore', 'Kolkata Knight Riders', 'Sun Risers Hyderabad', 'Rajasthan Royals']
df_teams_1 = results[results['Team_1'].isin(IPL_Teams)]
df_teams_2 = results[results['Team_2'].isin(IPL_Teams)]
df_teams = pd.concat((df_teams_1, df_teams_2))
df_teams.drop_duplicates()
df_teams.count()

date      1048
Team_1    1048
Team_2    1048
winner    1048
Margin    1048
Ground    1048
dtype: int64

In [24]:
df_teams.head()

Unnamed: 0,date,Team_1,Team_2,winner,Margin,Ground
0,2008-04-19,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings,33 runs,Chandigarh
1,2008-04-23,Chennai Super Kings,Mumbai Indians,Chennai Super Kings,6 runs,Chennai
2,2008-04-26,Kolkata Knight Riders,Chennai Super Kings,Chennai Super Kings,9 wickets,Chennai
3,2008-04-28,Chennai Super Kings,Royal Challengers Bangalore,Chennai Super Kings,13 runs,Bangalore
5,2008-05-10,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings,18 runs,Chennai


In [25]:
#dropping columns that wll not affect match outcomes
df_teams_2010 = df_teams.drop(['date','Margin', 'Ground'], axis=1)
df_teams_2010.head()

Unnamed: 0,Team_1,Team_2,winner
0,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings
1,Chennai Super Kings,Mumbai Indians,Chennai Super Kings
2,Kolkata Knight Riders,Chennai Super Kings,Chennai Super Kings
3,Chennai Super Kings,Royal Challengers Bangalore,Chennai Super Kings
5,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings


In [26]:
#Building the model
#the prediction label: The winning_team column will show "1" Team 1 has won.

df_teams_2010 = df_teams_2010.reset_index(drop=True)
df_teams_2010.loc[df_teams_2010.winner == df_teams_2010.Team_1,'winning_team']=1
df_teams_2010.loc[df_teams_2010.winner == df_teams_2010.Team_2, 'winning_team']=2
df_teams_2010 = df_teams_2010.drop(['winning_team'], axis=1)

df_teams_2010.head()

Unnamed: 0,Team_1,Team_2,winner
0,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings
1,Chennai Super Kings,Mumbai Indians,Chennai Super Kings
2,Kolkata Knight Riders,Chennai Super Kings,Chennai Super Kings
3,Chennai Super Kings,Royal Challengers Bangalore,Chennai Super Kings
4,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings


In [27]:
#convert team_1 and team_2 from categorical variables to continous inputs 

# Get dummy variables
final = pd.get_dummies(df_teams_2010, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])

# Separate X and y sets
X = final.drop(['winner'], axis=1)
y = final["winner"]


# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

final.head()

Unnamed: 0,winner,Team_1_Chennai Super Kings,Team_1_Deccan Chargers,Team_1_Delhi Capitals,Team_1_Delhi Daredevils,Team_1_Gujarat Lions,Team_1_Kings XI Punjab,Team_1_Kochi Tuskers Kerala,Team_1_Kolkata Knight Riders,Team_1_Mumbai Indians,...,Team_2_Kings XI Punjab,Team_2_Kochi Tuskers Kerala,Team_2_Kolkata Knight Riders,Team_2_Mumbai Indians,Team_2_Pune Warriors,Team_2_Rajasthan Royals,Team_2_Rising Pune Supergiant,Team_2_Rising Pune Supergiants,Team_2_Royal Challengers Bangalore,Team_2_Sunrisers Hyderabad
0,Chennai Super Kings,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,Chennai Super Kings,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,Chennai Super Kings,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Chennai Super Kings,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Chennai Super Kings,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [28]:
rf = RandomForestClassifier(n_estimators=100, max_depth=20,
                              random_state=0)
rf.fit(X_train, y_train) 

score = rf.score(X_train, y_train)
score2 = rf.score(X_test, y_test)


print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.686
Test set accuracy:  0.543


In [29]:
#Adding IPL rankings
#the team which is positioned higher on the IPL Ranking will be considered "favourite" for the match
#and therefore, will be positioned under the "Team_1" column

# Loading new datasets

fixtures = pd.read_csv('datasets/fixtures.csv')
ranking = pd.read_csv('datasets/ipl_rankings.csv') 

# List for storing the group stage games

pred_set = []

In [30]:
# Create new columns with ranking position of each team

fixtures.insert(1, 'first_position', fixtures['Team_1'].map(ranking.set_index('Team')['Position']))
fixtures.insert(2, 'second_position', fixtures['Team_2'].map(ranking.set_index('Team')['Position']))

# We only need the group stage games, so we have to slice the dataset

fixtures = fixtures.iloc[:56, :]
fixtures.tail()



Unnamed: 0,Round Number,first_position,second_position,Date,Location,Team_1,Team_2,Group,Result
51,1,3.0,8.0,13-May-20,Delhi,Delhi Capitals,Rajasthan Royals,Group A,
52,1,5.0,2.0,14-May-20,Bengaluru,Royal Challengers Bangalore,Chennai Super Kings,Group A,
53,1,6.0,7.0,15-May-20,Kolkata,Kolkata Knight Riders,Sun Risers Hyderabad,Group A,
54,1,4.0,3.0,16-May-20,Mohali,Kings XI Punjab,Delhi Capitals,Group A,
55,1,5.0,1.0,17-May-20,Bengaluru,Royal Challengers Bangalore,Mumbai Indians,Group A,


In [31]:
# Loop to add teams to new prediction dataset based on the ranking position of each team
for index, row in fixtures.iterrows():
    if row['first_position'] < row['second_position']:
        pred_set.append({'Team_1': row['Team_1'], 'Team_2': row['Team_2'], 'winning_team': None})
    else:
        pred_set.append({'Team_1': row['Team_2'], 'Team_2': row['Team_1'], 'winning_team': None})
        
pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set
pred_set.head()

Unnamed: 0,Team_1,Team_2,winning_team
0,Mumbai Indians,Chennai Super Kings,
1,Delhi Capitals,Kings XI Punjab,
2,Royal Challengers Bangalore,Kolkata Knight Riders,
3,Kolkata Knight Riders,Sun Risers Hyderabad,
4,Chennai Super Kings,Rajasthan Royals,


In [32]:
# Get dummy variables and drop winning_team column

pred_set = pd.get_dummies(pred_set, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])

# Add missing columns compared to the model's training dataset

missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[final.columns]


pred_set = pred_set.drop(['winner'], axis=1)
pred_set.head()

Unnamed: 0,Team_1_Chennai Super Kings,Team_1_Deccan Chargers,Team_1_Delhi Capitals,Team_1_Delhi Daredevils,Team_1_Gujarat Lions,Team_1_Kings XI Punjab,Team_1_Kochi Tuskers Kerala,Team_1_Kolkata Knight Riders,Team_1_Mumbai Indians,Team_1_Pune Warriors,...,Team_2_Kings XI Punjab,Team_2_Kochi Tuskers Kerala,Team_2_Kolkata Knight Riders,Team_2_Mumbai Indians,Team_2_Pune Warriors,Team_2_Rajasthan Royals,Team_2_Rising Pune Supergiant,Team_2_Rising Pune Supergiants,Team_2_Royal Challengers Bangalore,Team_2_Sunrisers Hyderabad
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [33]:
#group matches 
predictions = rf.predict(pred_set)
for i in range(fixtures.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 1:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    
    else:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print("")

Chennai Super Kings and Mumbai Indians
Winner: Mumbai Indians

Kings XI Punjab and Delhi Capitals
Winner: Delhi Capitals

Kolkata Knight Riders and Royal Challengers Bangalore
Winner: Royal Challengers Bangalore

Sun Risers Hyderabad and Kolkata Knight Riders
Winner: Kolkata Knight Riders

Rajasthan Royals and Chennai Super Kings
Winner: Chennai Super Kings

Kolkata Knight Riders and Delhi Capitals
Winner: Delhi Capitals

Sun Risers Hyderabad and Kings XI Punjab
Winner: Kings XI Punjab

Royal Challengers Bangalore and Mumbai Indians
Winner: Mumbai Indians

Rajasthan Royals and Delhi Capitals
Winner: Delhi Capitals

Kolkata Knight Riders and Chennai Super Kings
Winner: Chennai Super Kings

Sun Risers Hyderabad and Royal Challengers Bangalore
Winner: Royal Challengers Bangalore

Kings XI Punjab and Mumbai Indians
Winner: Mumbai Indians

Rajasthan Royals and Kolkata Knight Riders
Winner: Kolkata Knight Riders

Royal Challengers Bangalore and Delhi Capitals
Winner: Delhi Capitals

Kings XI

In [34]:
# List of tuples before 

semi = [('Kolkata Knight Riders', 'Chennai Super Kings'),
            ('Mumbai Indians', 'Rajasthan Royals')]

In [35]:
def clean_and_predict(matches, ranking, final, logreg):

    # Initialization of auxiliary list for data cleaning
    positions = []

    # Loop to retrieve each team's position according to ICC ranking
    for match in matches:
        positions.append(ranking.loc[ranking['Team'] == match[0],'Position'].iloc[0])
        positions.append(ranking.loc[ranking['Team'] == match[1],'Position'].iloc[0])
    
    # Creating the DataFrame for prediction
    pred_set = []

    # Initializing iterators for while loop
    i = 0
    j = 0

    # 'i' will be the iterator for the 'positions' list, and 'j' for the list of matches (list of tuples)
    while i < len(positions):
        dict1 = {}

        # If position of first team is better then this team will be the 'Team_1' team, and vice-versa
        if positions[i] < positions[i + 1]:
            dict1.update({'Team_1': matches[j][0], 'Team_2': matches[j][1]})
        else:
            dict1.update({'Team_1': matches[j][1], 'Team_2': matches[j][0]})

        # Append updated dictionary to the list, that will later be converted into a DataFrame
        pred_set.append(dict1)
        i += 2
        j += 1
        
        # Convert list into DataFrame
    pred_set = pd.DataFrame(pred_set)
    backup_pred_set = pred_set

    # Get dummy variables and drop winning_team column
    pred_set = pd.get_dummies(pred_set, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])

    # Add missing columns compared to the model's training dataset
    missing_cols2 = set(final.columns) - set(pred_set.columns)
    for c in missing_cols2:
        pred_set[c] = 0
    pred_set = pred_set[final.columns]

    pred_set = pred_set.drop(['winner'], axis=1)

    # Predict!
    predictions = logreg.predict(pred_set)
    for i in range(len(pred_set)):
        print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
        if predictions[i] == 1:
            print("Winner: " + backup_pred_set.iloc[i, 1])
        else:
            print("Winner: " + backup_pred_set.iloc[i, 0])
        print("")

In [36]:
clean_and_predict(semi, ranking, final, rf)

Kolkata Knight Riders and Chennai Super Kings
Winner: Chennai Super Kings

Rajasthan Royals and Mumbai Indians
Winner: Mumbai Indians



In [37]:
# Finals

finals = [('Chennai Super Kings', 'Mumbai Indians')]
clean_and_predict(finals, ranking, final, rf)

Chennai Super Kings and Mumbai Indians
Winner: Mumbai Indians

