In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [93]:
# Raw inputs: historical ball-by-ball and match-result tables (2008-2022),
# plus the 2023 team and venue lists with their single column renamed to
# the lowercase names used in the rest of the notebook.
ball_by_ball = pd.read_csv('./Data/IPL_Ball_by_Ball_2008_2022.csv')
matches_result = pd.read_csv('./Data/IPL_Matches_Result_2008_2022.csv')

ipl_2023_teams = (
    pd.read_csv('./Data/Ipl_2023 _cricketers - Team name.csv')
    .rename(columns={'Teams': 'team'})
)
ipl_2023_venues = (
    pd.read_csv('./Data/Ipl_2023 _cricketers - Venue.csv')
    .rename(columns={'Venue': 'venue'})
)

In [94]:
def log(*args):
    """Print ``args`` space-separated on one line, prefixed with a 👉 marker."""
    marked = ('👉',) + args
    print(*marked)
    

In [95]:
def to_kebab_case(string):
    """Convert free text to a lowercase, hyphen-separated slug.

    Commas and full stops are deleted outright (they do not act as word
    separators), the remainder is split on runs of whitespace, and the
    pieces are joined with "-" before the whole result is lowercased.
    """
    without_punctuation = string.translate(str.maketrans('', '', ',.'))
    words = without_punctuation.split()
    return '-'.join(words).lower()

# Preprocessing 

- ## Change column names, drop unnecessary columns [in ball_by_ball, matches_result]

In [96]:
# Keep a handle on the frame exactly as loaded, then rebind `ball_by_ball`
# to a trimmed view: snake_case column names and only the columns used later.
# Run this cell once per load; re-running it makes `ball_by_ball_orig` point
# at the already-trimmed frame.
ball_by_ball_orig = ball_by_ball

ball_by_ball = (
    ball_by_ball_orig
    .rename(columns={
        'ID': 'match_id',
        'ballnumber': 'ball_number',
        'non-striker': 'non_striker',
        'BattingTeam': 'batting_team',
    })
    .loc[:, [
        'match_id',
        'innings',
        'batting_team',
        'overs',
        'ball_number',
        'batter',
        'bowler',
        'total_run',
    ]]
)

In [97]:
# Keep a handle on the frame exactly as loaded, then rebind `matches_result`
# to the four columns needed downstream, renamed to snake_case.
# Run this cell once per load; re-running it makes `matches_result_orig`
# point at the already-trimmed frame.
matches_result_orig = matches_result

matches_result = (
    matches_result_orig
    .rename(columns={
        'ID': 'match_id',
        'Team1': 'team_1',
        'Team2': 'team_2',
        'Venue': 'venue',
    })
    .loc[:, [
        'match_id',
        'team_1',
        'team_2',
        'venue',
    ]]
)

In [98]:
# Shape and first rows of the ball-by-ball frame as loaded (all original columns).
print(ball_by_ball_orig.shape)
ball_by_ball_orig.head()

(225954, 17)


Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non-striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
0,1312200,1,0,1,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
1,1312200,1,0,2,YBK Jaiswal,Mohammed Shami,JC Buttler,legbyes,0,1,1,0,0,,,,Rajasthan Royals
2,1312200,1,0,3,JC Buttler,Mohammed Shami,YBK Jaiswal,,1,0,1,0,0,,,,Rajasthan Royals
3,1312200,1,0,4,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
4,1312200,1,0,5,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals


In [99]:
# Shape and first rows of the match-result frame as loaded (all original columns).
print(matches_result_orig.shape)
matches_result_orig.head()

(950, 20)


Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,1312200,Ahmedabad,2022-05-29,2022,Final,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,bat,N,Gujarat Titans,Wickets,7.0,,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",CB Gaffaney,Nitin Menon
1,1312199,Ahmedabad,2022-05-27,2022,Qualifier 2,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,field,N,Rajasthan Royals,Wickets,7.0,,JC Buttler,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...",CB Gaffaney,Nitin Menon
2,1312198,Kolkata,2022-05-25,2022,Eliminator,Royal Challengers Bangalore,Lucknow Super Giants,"Eden Gardens, Kolkata",Lucknow Super Giants,field,N,Royal Challengers Bangalore,Runs,14.0,,RM Patidar,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['Q de Kock', 'KL Rahul', 'M Vohra', 'DJ Hooda...",J Madanagopal,MA Gough
3,1312197,Kolkata,2022-05-24,2022,Qualifier 1,Rajasthan Royals,Gujarat Titans,"Eden Gardens, Kolkata",Gujarat Titans,field,N,Gujarat Titans,Wickets,7.0,,DA Miller,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",BNJ Oxenford,VK Sharma
4,1304116,Mumbai,2022-05-22,2022,70,Sunrisers Hyderabad,Punjab Kings,"Wankhede Stadium, Mumbai",Sunrisers Hyderabad,bat,N,Punjab Kings,Wickets,5.0,,Harpreet Brar,"['PK Garg', 'Abhishek Sharma', 'RA Tripathi', ...","['JM Bairstow', 'S Dhawan', 'M Shahrukh Khan',...",AK Chaudhary,NA Patwardhan


In [100]:
# Same check on the trimmed ball-by-ball frame: row count unchanged, 8 columns kept.
print(ball_by_ball.shape)
ball_by_ball.head()

(225954, 8)


Unnamed: 0,match_id,innings,batting_team,overs,ball_number,batter,bowler,total_run
0,1312200,1,Rajasthan Royals,0,1,YBK Jaiswal,Mohammed Shami,0
1,1312200,1,Rajasthan Royals,0,2,YBK Jaiswal,Mohammed Shami,1
2,1312200,1,Rajasthan Royals,0,3,JC Buttler,Mohammed Shami,1
3,1312200,1,Rajasthan Royals,0,4,YBK Jaiswal,Mohammed Shami,0
4,1312200,1,Rajasthan Royals,0,5,YBK Jaiswal,Mohammed Shami,0


In [101]:
# Same check on the trimmed match-result frame: row count unchanged, 4 columns kept.
print(matches_result.shape)
matches_result.head()

(950, 4)


Unnamed: 0,match_id,team_1,team_2,venue
0,1312200,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad"
1,1312199,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad"
2,1312198,Royal Challengers Bangalore,Lucknow Super Giants,"Eden Gardens, Kolkata"
3,1312197,Rajasthan Royals,Gujarat Titans,"Eden Gardens, Kolkata"
4,1304116,Sunrisers Hyderabad,Punjab Kings,"Wankhede Stadium, Mumbai"


In [102]:
# Cardinality checks on the trimmed ball-by-ball frame: number of matches and
# batting teams, number of distinct names appearing as batter or bowler, and
# which innings / over values occur.
log('match_id.nunique:', ball_by_ball.match_id.nunique())
log('batting_team.nunique:', ball_by_ball.batting_team.nunique())
log('union1d(batter, bowler).shape:', np.union1d(
    ball_by_ball.batter.unique(), ball_by_ball.bowler.unique()
).shape)
log('innings.unique:', ball_by_ball.innings.unique())
log('overs.unique:', ball_by_ball.overs.unique())

👉 match_id.nunique: 950
👉 batting_team.nunique: 18
👉 union1d(batter, bowler).shape: (652,)
👉 innings.unique: [1 2 3 4 5 6]
👉 overs.unique: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [103]:
# Cardinality checks on the trimmed match-result frame: number of matches,
# distinct venue spellings, and distinct team names across team_1 and team_2.
log('match_id.nunique:', matches_result.match_id.nunique())
log('venue.nunique:', matches_result.venue.nunique())
log('union1d(team_1, team_2).shape:', np.union1d(
    matches_result.team_1.unique(), matches_result.team_2.unique()
).shape)

👉 match_id.nunique: 950
👉 venue.nunique: 49
👉 union1d(team_1, team_2).shape: (18,)


- ## Get Venues Mapping

In [104]:
# List every (City, Venue) spelling in the raw results with its match count, to
# spot aliases of the same ground; dropna=False keeps rows whose City is missing.
matches_result_orig.groupby(['City', 'Venue'], dropna=False)['Venue'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,unique,top,freq
City,Venue,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abu Dhabi,Sheikh Zayed Stadium,29,1,Sheikh Zayed Stadium,29
Abu Dhabi,"Zayed Cricket Stadium, Abu Dhabi",8,1,"Zayed Cricket Stadium, Abu Dhabi",8
Ahmedabad,"Narendra Modi Stadium, Ahmedabad",7,1,"Narendra Modi Stadium, Ahmedabad",7
Ahmedabad,"Sardar Patel Stadium, Motera",12,1,"Sardar Patel Stadium, Motera",12
Bangalore,M Chinnaswamy Stadium,65,1,M Chinnaswamy Stadium,65
Bengaluru,M.Chinnaswamy Stadium,15,1,M.Chinnaswamy Stadium,15
Bloemfontein,OUTsurance Oval,2,1,OUTsurance Oval,2
Cape Town,Newlands,7,1,Newlands,7
Centurion,SuperSport Park,12,1,SuperSport Park,12
Chandigarh,Punjab Cricket Association IS Bindra Stadium,10,1,Punjab Cricket Association IS Bindra Stadium,10


👇 Reference for the venue names used in the mappings below (official IPL schedule): https://www.iplt20.com/matches/schedule/men

In [105]:
# Maps each venue spelling found in the data to one canonical venue name.
# Every canonical name is also a key mapping to itself, so applying the
# mapping twice gives the same result as applying it once.
venue_mapping_normal = {
    "Arun Jaitley Stadium": "Arun Jaitley Stadium",
    "Arun Jaitley Stadium, Delhi": "Arun Jaitley Stadium",
    "Feroz Shah Kotla": "Arun Jaitley Stadium",

    "Barsapara Cricket Stadium": "Barsapara Cricket Stadium",
    "Barsapara Cricket Stadium, Guwahati": "Barsapara Cricket Stadium",

    "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
    "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",

    "Eden Gardens": "Eden Gardens",
    "Eden Gardens, Kolkata": "Eden Gardens",

    "Himachal Pradesh Cricket Association Stadium": "Himachal Pradesh Cricket Association Stadium",
    "Himachal Pradesh Cricket Association Stadium, Dharamsala": "Himachal Pradesh Cricket Association Stadium",

    "M Chinnaswamy Stadium": "M Chinnaswamy Stadium",
    "M Chinnaswamy Stadium, Bengaluru": "M Chinnaswamy Stadium",
    "M Chinnaswamy Stadium, Bangalore": "M Chinnaswamy Stadium",
    "M.Chinnaswamy Stadium": "M Chinnaswamy Stadium",
    "M.Chinnaswamy Stadium, Bengaluru": "M Chinnaswamy Stadium",
    "M.Chinnaswamy Stadium, Bangalore": "M Chinnaswamy Stadium",

    "MA Chidambaram Stadium": "MA Chidambaram Stadium",
    "MA Chidambaram Stadium, Chennai": "MA Chidambaram Stadium",
    "MA Chidambaram Stadium, Chepauk": "MA Chidambaram Stadium",
    "MA Chidambaram Stadium, Chepauk, Chennai": "MA Chidambaram Stadium",

    "Narendra Modi Stadium": "Narendra Modi Stadium",
    "Narendra Modi Stadium, Ahmedabad": "Narendra Modi Stadium",

    "Punjab Cricket Association IS Bindra Stadium": "Punjab Cricket Association IS Bindra Stadium",
    "Punjab Cricket Association IS Bindra Stadium, Mohali": "Punjab Cricket Association IS Bindra Stadium",
    "Punjab Cricket Association Stadium, Mohali": "Punjab Cricket Association IS Bindra Stadium",

    "Rajiv Gandhi International Stadium": "Rajiv Gandhi International Stadium",
    "Rajiv Gandhi International Stadium, Hyderabad": "Rajiv Gandhi International Stadium",
    "Rajiv Gandhi International Stadium, Uppal": "Rajiv Gandhi International Stadium",

    "Sawai Mansingh Stadium": "Sawai Mansingh Stadium",
    "Sawai Mansingh Stadium, Jaipur": "Sawai Mansingh Stadium",

    "Wankhede Stadium": "Wankhede Stadium",
    "Wankhede Stadium, Mumbai": "Wankhede Stadium",
}

In [106]:
# Slug-keyed version of `venue_mapping_normal`: the same canonical venue names,
# looked up by the kebab-case form of each spelling (see `to_kebab_case`, which
# deletes "," and "." and joins the lowercased words with "-").
# Derived rather than typed out by hand so the two mappings cannot drift apart;
# this yields exactly the 33 key/value pairs previously written here, in the
# same order.
venue_mapping_kebab = {
    to_kebab_case(venue_spelling): canonical_venue
    for venue_spelling, canonical_venue in venue_mapping_normal.items()
}

In [107]:
# Venue spellings present in the data that are not keys of venue_mapping_normal;
# these become NaN when the mapping is applied with Series.map.
np.setdiff1d(matches_result.venue.unique(), list(venue_mapping_normal.keys()))

array(['Barabati Stadium', 'Brabourne Stadium',
       'Brabourne Stadium, Mumbai', 'Buffalo Park',
       'De Beers Diamond Oval', 'Dr DY Patil Sports Academy',
       'Dr DY Patil Sports Academy, Mumbai',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Dubai International Cricket Stadium', 'Green Park',
       'Holkar Cricket Stadium', 'JSCA International Stadium Complex',
       'Kingsmead', 'Maharashtra Cricket Association Stadium',
       'Maharashtra Cricket Association Stadium, Pune', 'Nehru Stadium',
       'New Wanderers Stadium', 'Newlands', 'OUTsurance Oval',
       'Sardar Patel Stadium, Motera',
       'Saurashtra Cricket Association Stadium',
       'Shaheed Veer Narayan Singh International Stadium',
       'Sharjah Cricket Stadium', 'Sheikh Zayed Stadium',
       "St George's Park", 'Subrata Roy Sahara Stadium',
       'SuperSport Park', 'Vidarbha Cricket Association Stadium, Jamtha',
       'Zayed Cricket Stadium, Abu Dhabi'], dtype=object)

- ## Get Teams Mapping

In [108]:
# Chained comparison: True only if team_1, team_2 and batting_team all contain
# exactly the same set of team names.
set(matches_result['team_1'].unique()) == set(matches_result['team_2'].unique()) == set(ball_by_ball['batting_team'].unique())

True

In [109]:
# Maps each team name found in the data to one of 10 canonical team names.
# Teams that do not appear as a key here are turned into NaN when the
# mapping is applied with Series.map.
team_mapping = {
    # Names used unchanged throughout the data.
    'Rajasthan Royals': 'Rajasthan Royals',
    'Gujarat Titans': 'Gujarat Titans',
    'Royal Challengers Bangalore': 'Royal Challengers Bangalore',
    'Lucknow Super Giants': 'Lucknow Super Giants',
    'Sunrisers Hyderabad': 'Sunrisers Hyderabad',
    'Mumbai Indians': 'Mumbai Indians',
    'Chennai Super Kings': 'Chennai Super Kings',
    'Kolkata Knight Riders': 'Kolkata Knight Riders',

    # Punjab Kings [Kings XI Punjab]: both names map to the newer one.
    'Kings XI Punjab': 'Punjab Kings',
    'Punjab Kings': 'Punjab Kings',

    # Delhi Capitals [Delhi Daredevils]: both names map to the newer one.
    'Delhi Daredevils': 'Delhi Capitals',
    'Delhi Capitals': 'Delhi Capitals',
}

In [110]:
# First print: mapping keys that never occur as team_1 (expected to be empty).
print(np.setdiff1d(
   list(team_mapping.keys()), matches_result['team_1'].unique()
))

# Second print: team_1 names with no mapping entry; these become NaN once the
# mapping is applied.
print(np.setdiff1d(
    matches_result['team_1'].unique(), list(team_mapping.keys())
))

[]
['Deccan Chargers' 'Gujarat Lions' 'Kochi Tuskers Kerala' 'Pune Warriors'
 'Rising Pune Supergiant' 'Rising Pune Supergiants']


- ## Apply Venues/Teams Mapping [in matches_result, ball_by_ball]

In [111]:
# Canonicalise venue and team names. Series.map returns NaN for any value that
# is not a key of the mapping, which is how unwanted venues/teams get flagged.
matches_result['venue'] = matches_result['venue'].map(venue_mapping_normal)

for team_column in ('team_1', 'team_2'):
    matches_result[team_column] = matches_result[team_column].map(team_mapping)

ball_by_ball['batting_team'] = ball_by_ball['batting_team'].map(team_mapping)

In [116]:
# Number of matches whose venue had no mapping entry (now NaN).
print(matches_result.loc[matches_result.venue.isnull()].shape)

(359, 4)


In [117]:
# Number of matches whose team_1 / team_2 had no mapping entry (now NaN).
print(matches_result.loc[matches_result.team_1.isnull()].shape)
print(matches_result.loc[matches_result.team_2.isnull()].shape)

(99, 4)
(96, 4)


In [118]:
# Preview only: how many matches would remain if every row with any NaN were dropped.
print(matches_result.shape)
print(matches_result.dropna().shape)

(950, 4)
(499, 4)


In [120]:
# Preview only: how many deliveries would remain if every row with any NaN were dropped.
print(ball_by_ball.shape)
print(ball_by_ball.dropna().shape)

(225954, 8)
(202849, 8)


In [27]:

# Number of deliveries whose batting team had no mapping entry (now NaN).
ball_by_ball.loc[ball_by_ball.batting_team.isnull()].shape

(23105, 8)

- ## Remove rows with unmapped Teams [in ball_by_ball, matches_result] and unmapped Venues [in matches_result]

In [28]:
# Keep only matches where both teams and the venue survived the mapping step
# (a NaN in any of the three columns drops the row).
matches_result = matches_result.dropna(subset=['team_1', 'team_2', 'venue'])

# Row counts before (raw frame) and after filtering.
print(matches_result_orig.shape)
print(matches_result.shape)

(950, 20)
(279, 4)


In [29]:
# Keep only deliveries whose batting team survived the mapping step.
ball_by_ball = ball_by_ball.dropna(subset=['batting_team'])

# Row counts before (raw frame) and after filtering.
print(ball_by_ball_orig.shape)
print(ball_by_ball.shape)

(225954, 17)
(202849, 8)


- ## Select first 6 overs, Select innings 1 & 2, Map innings (1,2) to (0,1) [in ball_by_ball]

In [30]:
# Innings values present before filtering.
ball_by_ball.innings.unique()

array([1, 2, 3, 4, 5, 6], dtype=int64)

In [31]:
# Over values present before filtering (0-indexed).
ball_by_ball.overs.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int64)

In [32]:
# Overs are 0-indexed, so `overs <= 5` keeps the first six overs; `innings <= 2`
# keeps only the two regular innings and discards innings 3-6.
# `.copy()` makes the filtered result an independent frame, so the column
# assignment below writes to it directly instead of triggering pandas'
# SettingWithCopyWarning on a slice of the previous frame.
ball_by_ball = ball_by_ball.loc[
    (ball_by_ball['overs'] <= 5) & (ball_by_ball['innings'] <= 2)
].copy()

# Re-code innings 1/2 as 0/1.
# NOTE: not idempotent. Re-running this cell on the already re-coded frame
# would map the second innings (now 1) onto 0 as well; re-run from the load
# cell instead.
ball_by_ball['innings'] = ball_by_ball['innings'].replace({1: 0, 2: 1})
ball_by_ball.shape

(63652, 8)

In [33]:
# Innings values after filtering and re-coding (expected: 0 and 1 only).
ball_by_ball.innings.unique()

array([0, 1], dtype=int64)

In [34]:
# Over values after filtering (expected: 0 through 5).
ball_by_ball.overs.unique()

array([0, 1, 2, 3, 4, 5], dtype=int64)

- ## Grouping 

In [35]:
# One group per (match, innings, batting team) over the filtered deliveries.
ball_by_ball_gb = ball_by_ball.groupby(['match_id', 'innings', 'batting_team'])
# Runs scored in the group (sum of total_run over its deliveries).
total_runs = ball_by_ball_gb['total_run'].sum()
# Arrays of the distinct batter / bowler names seen in the group.
batsmen = ball_by_ball_gb['batter'].unique()
bowlers = ball_by_ball_gb['bowler'].unique()

In [36]:
# Turn each grouped Series into a flat DataFrame: the group keys become
# ordinary columns and the values land in a column with the given name.
total_runs = total_runs.reset_index(name='total_runs')
batsmen = batsmen.reset_index(name='batsmen')
bowlers = bowlers.reset_index(name='bowlers')

In [37]:
# Combine the three per-innings frames on their shared group keys.
# Both joins are right joins, as in the original chained form: batsmen is
# joined onto bowlers first, then total_runs onto that result.
data = pd.merge(
    total_runs,
    pd.merge(batsmen, bowlers, how='right', on=['match_id', 'innings', 'batting_team']),
    how='right',
    on=['match_id', 'innings', 'batting_team'],
)

In [38]:
# Attach team_1 / team_2 / venue to every innings row. The default inner join
# drops innings whose match was removed from matches_result earlier.
data = pd.merge(data, matches_result, on='match_id')

In [39]:
# The bowling side is whichever of team_1 / team_2 is not batting:
# team_2 where team_1 is batting, team_1 on every other row.
mask = data['batting_team'] == data['team_1']
data['bowling_team'] = data['team_2'].where(mask, data['team_1'])

In [40]:
# Two matches are excluded entirely:
#   829763 - data for one innings is missing
#   829813 - total_runs for one innings is 2 (probably a data-entry mistake)
data = data.drop(index=data.loc[data['match_id'].isin([829763, 829813])].index)

In [41]:
# Number of distinct batters / bowlers used in each innings row
# (length of the per-group array of unique names).
data['count_batsmen'] = data['batsmen'].apply(len)
data['count_bowlers'] = data['bowlers'].apply(len)

In [42]:
# Keep only the model inputs plus the target, with the target (total_runs) last.
# Selecting by name already discards match_id, batsmen, bowlers, team_1 and
# team_2, so no separate drop is needed. The earlier explicit drop raised
# KeyError whenever this cell was re-run on the already-reduced frame.
data = data[['venue', 'innings', 'batting_team', 'bowling_team', 'count_batsmen', 'count_bowlers', 'total_runs']]

In [43]:
# Display the assembled modelling table (pandas truncates the middle rows).
data

Unnamed: 0,venue,innings,batting_team,bowling_team,count_batsmen,count_bowlers,total_runs
0,"{'aliases': ['M Chinnaswamy Stadium, Bengaluru...",0,Kolkata Knight Riders,Royal Challengers Bangalore,3,3,61
1,"{'aliases': ['M Chinnaswamy Stadium, Bengaluru...",1,Royal Challengers Bangalore,Kolkata Knight Riders,6,3,26
2,"{'aliases': ['Wankhede Stadium, Mumbai'], 'tag...",0,Mumbai Indians,Royal Challengers Bangalore,5,3,47
3,"{'aliases': ['Wankhede Stadium, Mumbai'], 'tag...",1,Royal Challengers Bangalore,Mumbai Indians,3,3,40
4,"{'aliases': ['Sawai Mansingh Stadium, Jaipur']...",0,Punjab Kings,Rajasthan Royals,3,3,54
...,...,...,...,...,...,...,...
552,"{'aliases': ['Wankhede Stadium, Mumbai'], 'tag...",1,Mumbai Indians,Kolkata Knight Riders,2,4,46
553,"{'aliases': ['MA Chidambaram Stadium, Chennai'...",0,Chennai Super Kings,Mumbai Indians,4,5,32
554,"{'aliases': ['MA Chidambaram Stadium, Chennai'...",1,Mumbai Indians,Chennai Super Kings,4,2,44
555,{'aliases': ['Rajiv Gandhi International Stadi...,0,Mumbai Indians,Chennai Super Kings,4,3,45


- ## Encoding of categorical inputs and feature scaling

In [44]:
# Target is total_runs, which is the last column of `data`; every other
# column is a feature.
y = data['total_runs']
X = data.drop(columns=['total_runs'])

In [45]:
# One-hot encode the three categorical columns (dropping the first level of
# each) and pass the remaining numeric columns through unchanged.
# NOTE(review): the TypeError recorded under this cell says the encoder saw
# dict values. venue_mapping_normal as defined in this notebook maps to plain
# strings, so that output presumably comes from an earlier session; confirm
# with Restart & Run All.
ct = ColumnTransformer(transformers = [
    ('ohe', OneHotEncoder(categories = "auto", drop='first', sparse_output=False), ['venue', 'batting_team', 'bowling_team'])
], remainder = 'passthrough')

scaler = StandardScaler()

# NOTE(review): both transformers are fitted on every row, i.e. before the
# train/test split below, so test rows influence the encoding and scaling.
X_ohe = pd.DataFrame(ct.fit_transform(X))
X_std = scaler.fit_transform(X_ohe)

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['dict']

- ## Train-test split

In [None]:
# 80/20 hold-out split. `train_test_split` is already imported in the
# notebook's first cell, so it is not re-imported here.
# A fixed random_state makes the split, and therefore every metric reported
# below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, test_size=0.2, random_state=42
)

In [None]:
def evaluate(regressor):
    """Fit ``regressor`` on the training split and print its test-set errors.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Predictions are rounded to two decimals before the errors
    are computed. Prints RMSE and MAE, each formatted to two decimals,
    and returns nothing.
    """
    regressor.fit(X_train, y_train)
    predictions = np.round(regressor.predict(X_test), 2)
    residuals = y_test - predictions
    rmse = np.sqrt((residuals ** 2).mean())
    mae = np.abs(residuals).mean()
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")

- ## Models

In [None]:
# from sklearn.metrics import r2_score
# AdaBoostRegressor(learning_rate=0.15, loss='exponential', n_estimators=20,
#                   random_state=2154)

In [None]:
# Baseline: ordinary least squares with default settings.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
evaluate(regressor)

In [None]:
# Single decision tree with default hyper-parameters.
# A fixed random_state pins the tree's internal randomness, so repeated runs
# report the same metrics.
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=42)
evaluate(regressor)

In [None]:
# Random forest with default hyper-parameters.
# A fixed random_state pins the forest's internal randomness, so repeated
# runs report the same metrics.
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(random_state=42)
evaluate(regressor)

In [None]:
# k-nearest-neighbours regression with default settings.
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor()
evaluate(regressor)

In [None]:
# Support-vector regression with default settings.
from sklearn.svm import SVR
regressor = SVR()
evaluate(regressor)

In [None]:
# Gradient-boosted trees (XGBoost) with default settings; requires the
# third-party xgboost package.
import xgboost as xgb
regressor = xgb.XGBRegressor()
evaluate(regressor)

In [None]:
# import tensorflow as tf
# from tensorflow.keras import layers, models

# # Define the model architecture
# model = models.Sequential([
#     layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
#     layers.Dense(128, activation='relu'),
#     layers.Dense(1)
# ])

# # Compile the model
# model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])

# # Fit the model to the training data
# history = model.fit(X_train, y_train, epochs=200, batch_size=128, verbose=False)

# # Evaluate the model on the test set
# test_loss = model.evaluate(X_test, y_test)

# # Print the test loss
# print('Test loss:', test_loss)

In [None]:
# import tensorflow as tf
# from tensorflow.keras import layers, models

# # Define a matrix of hyperparameters to test
# params = {
#     'batch_size': [16, 32],
#     'epochs': [50, 100],
#     'learning_rate': [0.001, 0.01]
# }

# # Define the model architecture
# def build_model(learning_rate=0.001):
#     model = models.Sequential([
#         layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
#         layers.Dense(32, activation='relu'),
#         layers.Dense(1)
#     ])
#     optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
#     model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
#     return model

# # Loop through the hyperparameter matrix and fit the model for each combination
# for batch_size in params['batch_size']:
#     for epochs in params['epochs']:
#         for learning_rate in params['learning_rate']:
#             print(f"Fitting model with batch_size={batch_size}, epochs={epochs}, learning_rate={learning_rate}")
#             model = build_model(learning_rate=learning_rate)
#             history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=0)
#             test_loss, test_mae = model.evaluate(X_test, y_test)
#             print(f"Test loss: {test_loss}, Test MAE: {test_mae}")
