# Importing the Libraries

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [48]:
data = pd.read_csv('IPL.csv')
data.head(4)

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222


In [49]:
data.shape

(76014, 15)

# Features of the dataset
- **mid**: Each match is given a unique number
- **date**: When the match happened
- **venue**: Stadium where match is being played
- **bat_team**: Batting team name
- **bowl_team**: Bowling team name
- **batsman**: Batsman name who faced that ball
- **bowler**: Bowler who bowled that ball
- **runs**: Total runs scored by team at that instance
- **wickets**: Total wickets fallen at that instance
- **overs**: Total overs bowled at that instance
- **runs_last_5**: Total runs scored in last 5 overs
- **wickets_last_5**: Total wickets that fell in last 5 overs
- **striker**: max(runs scored by striker, runs scored by non-striker)
- **non-striker**: min(runs scored by striker, runs scored by non-striker)
- **total**: Total runs scored by batting team after first innings

In [50]:
X = data.iloc[:,[7,8,9,10,11,12,13]].values
y = data.iloc[:,-1].values

## Splitting the datasets

In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)

# Feature Scaling

In [52]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Multiple Linear Regression

In [53]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(X_train,y_train)

LinearRegression()

In [54]:
y_pred = regression.predict(X_test)

In [55]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.5035600412765215

## Polynomial Regression

In [56]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X_train)
poly_regression = LinearRegression()
poly_regression.fit(X_poly,y_train)

LinearRegression()

In [57]:
y_pred = poly_regression.predict(poly_reg.transform(X_test))

In [58]:
r2_score(y_test,y_pred)

0.5357957250889966

## Support Vector Machine

In [59]:
from sklearn.svm import SVR
svm_model = SVR(kernel = 'rbf')
svm_model.fit(X_train,y_train)

SVR()

In [60]:
y_pred = svm_model.predict(X_test)

In [61]:
r2_score(y_test, y_pred)

0.5157844662936631

## Decision Tree

In [62]:
from sklearn.tree import DecisionTreeRegressor
Dt_model = DecisionTreeRegressor(random_state = 0)
Dt_model.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

In [63]:
y_pred = Dt_model.predict(X_test)
r2_score(y_test,y_pred)

0.5023012088109651

## Random Forest Regression

In [64]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators = 10, random_state = 0)
rf_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

In [65]:
y_pred = rf_model.predict(X_test)
r2_score(y_test, y_pred)

0.6686012605780673

Here, I have done feature scaling but you don't need feature scaling for Decision tree and Random Forest Regression.

| Model Selection| R2 Score|
| --- | --- |
|Multiple Linear Regression| 0.50|
|Polynomial Regression| 0.53|
|Support Vector Machine| 0.51|
|Decision Tree Regression| 0.50|
|Random Forest Regression| 0.66|