In [1]:
import pandas as pd
import numpy as np
import datetime

import math
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [2]:
results = pd.read_csv('results_info.csv')
events = pd.read_csv('events_info.csv')
ranks = pd.read_csv('rank_info.csv')

### Create a copy & preprocess results data

In [3]:
df = results
df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,event,map,team_1,team_2
0,0,0.0,24/10/22,CCT North Europe Series 1,['Ancient'],Entropiq (14),1WIN (16)
1,1,1.0,24/10/22,CCT Central Europe Series 3,['Vertigo'],ECLOT (16),FTW (12)


In [4]:
# drop index col
df = df.drop(columns=['Unnamed: 0'])
events = events.drop(columns=['Unnamed: 0'])
ranks = ranks.drop(columns=['Unnamed: 0'])

In [5]:
# split date column into relevant time features
df.date=pd.to_datetime(df.date, format="%d/%m/%y")

df['day'] = df.date.dt.day
df['week'] = df.date.dt.week
df['month'] = df.date.dt.month
df['year'] = df.date.dt.year

# # drop date column
# df = df.drop(columns=['date'])

  df['week'] = df.date.dt.week


In [6]:
# clean map column
df['map'] = df['map'].str.strip("[]'")

In [7]:
# split team 1 and team 2 columns
df[['team1_name', 'team1_score']] = df['team_1'].str.split('(', 1, expand=True)
df['team1_name'] = df['team1_name'].str.strip(" ")
df['team1_score'] = df['team1_score'].str.strip(")")
df[['team2_name', 'team2_score']] = df['team_2'].str.split('(', 1, expand=True)
df['team2_name'] = df['team2_name'].str.strip(" ")
df['team2_score'] = df['team2_score'].str.strip(")")

# drop raw columns
df = df.drop(columns=['team_1', 'team_2'])

In [8]:
# change datatype of column after splitting
df['day'] = df['day'].astype(int)
df['month'] = df['month'].astype(int)
df['year'] = df['year'].astype(int)
df['team1_score'] = df['team1_score'].astype(int)
df['team2_score'] = df['team2_score'].astype(int)

In [9]:
# function to determine winner
def winner(row):
    if row['team1_score'] > row['team2_score']:
        val = row['team1_name']
    else:
        val = row['team2_name']
    return val

In [10]:
# create feature 'winner'
df['winner'] = df.apply(winner, axis=1)
df.head(3)

Unnamed: 0,Unnamed: 0.1,date,event,map,day,week,month,year,team1_name,team1_score,team2_name,team2_score,winner
0,0.0,2022-10-24,CCT North Europe Series 1,Ancient,24,43,10,2022,Entropiq,14,1WIN,16,1WIN
1,1.0,2022-10-24,CCT Central Europe Series 3,Vertigo,24,43,10,2022,ECLOT,16,FTW,12,ECLOT
2,2.0,2022-10-24,CCT Central Europe Series 3,Inferno,24,43,10,2022,FTW,16,ECLOT,14,FTW


In [11]:
# ## Discontinued due to poor performance

# # create feature days since
# today = datetime.date.today()

# def ds(row):
#     val = today - row['date'].date()
#     return val

# df['days_since'] = df.apply(ds, axis=1)
# df['days_since'] = df['days_since'].astype(str)
# df['days_since'] = df['days_since'].map(lambda x: x.rstrip(' days'))

In [12]:
# drop duplicates
df.drop_duplicates(inplace=True)

### Merge with event details

In [13]:
# add event details
df = df.merge(events, how='left', on='event')

In [14]:
# drop unwanted columns - undo if want duration of tourney
df = df.drop(columns=['start_end'])

In [15]:
df.head(3)

Unnamed: 0,Unnamed: 0.1,date,event,map,day,week,month,year,team1_name,team1_score,team2_name,team2_score,winner,num_teams,prize,type,region
0,0.0,2022-10-24,CCT North Europe Series 1,Ancient,24,43,10,2022,Entropiq,14,1WIN,16,1WIN,24,"$50,000",Online,Europe (Online) |
1,0.0,2022-10-24,CCT North Europe Series 1,Ancient,24,43,10,2022,Entropiq,14,1WIN,16,1WIN,24,"$50,000",Online,Europe (Online) |
2,1.0,2022-10-24,CCT Central Europe Series 3,Vertigo,24,43,10,2022,ECLOT,16,FTW,12,ECLOT,24,"$50,000",Online,Europe (Online) |


In [16]:
### convert prize column appropriately
# reflect qualification as prize
df['prize_qual'] = df['prize']

def qual(row):
    if row['prize_qual'] == 'Other':
        val = 1
    else:
        val = 0
    return val

df['prize_qual'] = df.apply(qual, axis=1)

In [17]:
# reflect money as prize
df['prize'] = df['prize'].str.strip('$')
df['prize'] = df['prize'].str.replace(',', '')

def prize(row):
    if row['prize'] == 'Other':
        val = 0
    else:
        val = row['prize']
    return val

df['prize'] = df.apply(prize, axis=1)

df['prize'] = df['prize'].astype(float)

In [18]:
# create feature premium to represent tournament
keywords = ['IEM', 'ESL', 'BLAST']

# function
def prem(row):
    if any(word in row['event'] for word in keywords):
        val = 1
    else:
        val = 0
    return val
    
df['premium'] = df.apply(prem, axis=1)    

In [19]:
# understanding the different classification types
temp = df['type'].unique()
temp

array(['Online', 'Local LAN', 'Reg. LAN', nan, 'Intl. LAN', 'Major'],
      dtype=object)

In [20]:
# function to rename tournament types, use try because of ongoing events
def tournament_type(row):
    if row['type'] != 'Online':
        val = 1
    else:
        val = 0
    return val

df['lan'] = df.apply(tournament_type, axis=1)

# drop initial column
df = df.drop(columns=['type'])

In [21]:
# clean num_teams col
df['num_teams'] = df['num_teams'].str.replace('+', '')

  df['num_teams'] = df['num_teams'].str.replace('+', '')


In [22]:
# drop final unwanted columns
df = df.drop(columns=['region'])

### Cleaning ranks data

In [23]:
ranks.head()

Unnamed: 0,Unnamed: 0.1,team,points,players,date_start
0,0.0,['Liquid'],['(707 points)'],"['nitr0', 'NAF', 'EliGE', 'oSee', 'YEKINDAR']",24/10/2022
1,1.0,['Nouns'],['(39 points)'],"['cynic', 'Bwills', 'cxzi', 'nosraC', 'jeorges...",24/10/2022
2,2.0,['EG White'],['(30 points)'],"['djay', 'Jonji', 'ben1337', 'PwnAlone', 'viz']",24/10/2022
3,3.0,['EG Black'],['(23 points)'],"['RUSH', 'stanislaw', 'Walco', 'wiz', 'chop']",24/10/2022
4,4.0,['Iron Blood'],['(21 points)'],"['droid', 'shane', 'dare', 'snav', 'intra']",24/10/2022


In [24]:
# strip clean valuable columns
ranks['team'] = ranks['team'].str.strip("[]'")
ranks['points'] = ranks['points'].str.strip("[(points)]' ")
# change dtype since its a str
ranks['points'] = ranks['points'].astype(int)

In [25]:
# create week column to match to main df
ranks.date_start=pd.to_datetime(ranks.date_start, format="%d/%m/%Y")
ranks['week'] = ranks.date_start.dt.week
ranks['year'] = ranks.date_start.dt.year

  ranks['week'] = ranks.date_start.dt.week


In [26]:
ranks.dtypes

Unnamed: 0.1           float64
team                    object
points                   int32
players                 object
date_start      datetime64[ns]
week                     int64
year                     int64
dtype: object

In [27]:
# function to pull ranking points
def opp_points(row):
    opponent = row['opp']
    year = row['year']
    week = row['week']
    try:
        val = ranks.loc[(ranks['week'] == week) & (ranks['year'] == year) & (ranks['team'] == opponent)]['points'].values[0]
        return val
    except:
        return None
    
def points_diff(row):
    year = row['year']
    week = row['week']
    try:
        val = ranks.loc[(ranks['week'] == week) & (ranks['year'] == year) & (ranks['team'] == team)]['points'].values[0] - row['opp_points']
        return val
    except:
        return None    

In [28]:
# change dtypes
# df['days_since'] = df['days_since'].astype(int)
df['num_teams'] = df['num_teams'].astype(float)

### Pulling a df for a specific team

In [29]:
# create a function to find all relevant entries
def pred(team):
    df2 = df
    df2 = df2.drop(columns=['date', 'event','day','month'])
    df2['opp'] = None
#     df2['diff'] = None
#     df2['opp_points'] = None
    for i in range(len(df)):
        if df2.loc[i]['team1_name'] == team:
            df2.at[df2.index[i], 'opp'] = df2.loc[i]['team2_name']
            df2.at[df2.index[i], 'diff'] = df2.loc[i]['team1_score'] - df2.loc[i]['team2_score']
        elif df2.loc[i]['team2_name'] == team:
            df2.at[df2.index[i], 'opp'] = df2.loc[i]['team1_name']
            df2.at[df2.index[i], 'diff'] = df2.loc[i]['team2_score'] - df2.loc[i]['team1_score']
            
    # retain only relevant rows
    df2 = df2.dropna()
    df2 = df2.reset_index(drop=True)
    
    # drop null values for final df
    df2['opp_points'] = df2.apply(opp_points, axis=1)
    df2 = df2.dropna()
    
    # drop columns for final df
    df2 = df2.drop(columns=['Unnamed: 0.1', 'team1_name','team1_score','team2_name',
                           'team2_score', 'winner', 'opp'])
    
    # dummies for map feature
    df2 = pd.get_dummies(df2, columns=['map'],  drop_first=True)
    
    return df2    

In [30]:
# test on first entry, entropiq
team = 'Entropiq'
dfx = pred(team)
dfx['points_diff'] = dfx.apply(points_diff, axis=1)
dfx = dfx.drop(columns=['week', 'year'])

In [31]:
dfx

Unnamed: 0,num_teams,prize,prize_qual,premium,lan,diff,opp_points,map_Dust2,map_Mirage,map_Nuke,map_Overpass,map_Vertigo,points_diff
0,24.0,50000.0,0,0,0,-2.0,82.0,0,0,0,0,0,-24.0
1,24.0,50000.0,0,0,0,-2.0,82.0,0,0,0,0,0,-24.0
2,24.0,50000.0,0,0,0,4.0,31.0,0,1,0,0,0,27.0
3,24.0,50000.0,0,0,0,4.0,31.0,0,1,0,0,0,27.0
4,24.0,50000.0,0,0,0,2.0,31.0,1,0,0,0,0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,26.0,120000.0,0,0,0,2.0,94.0,0,0,1,0,0,75.0
297,8.0,0.0,1,0,0,-4.0,104.0,0,0,1,0,0,65.0
298,26.0,120000.0,0,0,0,5.0,94.0,0,0,0,0,0,75.0
299,8.0,0.0,1,0,0,-5.0,104.0,0,0,0,1,0,65.0


In [32]:
dfx.corr()['diff'].sort_values()

opp_points     -0.198094
map_Vertigo    -0.055113
map_Dust2      -0.055113
prize          -0.052838
map_Nuke       -0.025223
lan            -0.013674
map_Overpass    0.001129
premium         0.013735
prize_qual      0.014342
num_teams       0.069483
map_Mirage      0.131536
points_diff     0.209521
diff            1.000000
Name: diff, dtype: float64