In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score

In [2]:
# Outcome labels for the training matches; later cells read its 'matchId' and 'winner' columns.
y_train = pd.read_csv('../data/train_winners.csv')

In [3]:
# Load a single timeline file (match 1091) into `data`.
# The context manager closes the file handle as soon as the JSON is parsed.
with open('../data/train_timelines/train_timelines/timeline_{matchIds}.json'.format(matchIds = 1091)) as f:
    data = json.load(f)

**Looking into gold and XP**

In [4]:
# Accumulator: one dict of per-match features is appended for each timeline file.
df = []

In [5]:
# Start from an empty accumulator so re-running this cell never duplicates rows.
df = []

for matchId in range(0, 8000):
    # `with` closes each timeline file once parsed, so 8000 handles are not left open.
    with open('../data/train_timelines/train_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Per-participant stats from the last frame in the timeline.
    # Participant ids '1'-'5' are summed as blue, '6'-'10' as red.
    lastFrame = data['frames'][-1]['participantFrames']
    df.append({
        'matchId' : matchId,
        'blueGold' : sum(lastFrame[str(p)]['totalGold'] for p in range(1, 6)),
        'blueXP' : sum(lastFrame[str(p)]['xp'] for p in range(1, 6)),
        'redGold' : sum(lastFrame[str(p)]['totalGold'] for p in range(6, 11)),
        'redXP' : sum(lastFrame[str(p)]['xp'] for p in range(6, 11))
    })

In [6]:
# One row per training match: matchId plus the four team totals collected in `df`.
X_train = pd.DataFrame(df)

**Not using difference in gold/xp, and using team totals.**

In [7]:
# Column groups used by the next three fits: gold only, xp only, and both together.
gold = ['blueGold', 'redGold']
xp = ['blueXP', 'redXP']
both = gold + xp

**Just gold**

In [8]:
# Fit on the columns listed in `gold`; the target is True when team 100 won.
logreg = LogisticRegression().fit(X_train[gold], y_train['winner'].eq(100))

# Accuracy on the same rows the model was fitted on (training accuracy).
logreg.score(X_train[gold], y_train['winner'].eq(100))

0.705375

**Just xp**

In [9]:
# Fit on the columns listed in `xp`; the target is True when team 100 won.
logreg = LogisticRegression().fit(X_train[xp], y_train['winner'].eq(100))

# Accuracy on the same rows the model was fitted on (training accuracy).
logreg.score(X_train[xp], y_train['winner'].eq(100))

0.672

**Gold/xp - submissionV4**

In [10]:
# Fit on the columns listed in `both`; this model produces submissionV4 further down.
logreg = LogisticRegression().fit(X_train[both], y_train['winner'].eq(100))

# Accuracy on the same rows the model was fitted on (training accuracy).
logreg.score(X_train[both], y_train['winner'].eq(100))

0.70975

In [11]:
df = []

for matchId in range(8000, 10000):
    # `with` closes each timeline file once parsed, so 2000 handles are not left open.
    with open('../data/test_timelines/test_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Per-participant stats from the last frame in the timeline.
    # Participant ids '1'-'5' are summed as blue, '6'-'10' as red.
    lastFrame = data['frames'][-1]['participantFrames']
    df.append({
        'matchId' : matchId,
        'blueGold' : sum(lastFrame[str(p)]['totalGold'] for p in range(1, 6)),
        'blueXP' : sum(lastFrame[str(p)]['xp'] for p in range(1, 6)),
        'redGold' : sum(lastFrame[str(p)]['totalGold'] for p in range(6, 11)),
        'redXP' : sum(lastFrame[str(p)]['xp'] for p in range(6, 11))
    })

# Same columns as the training frame, built from the test timelines.
X_test = pd.DataFrame(df)

In [12]:
# Map the boolean prediction (True = team 100 wins) to the team id 100 / 200.
submissionV4 = (
    X_test[['matchId']]
    .assign(winner = np.where(logreg.predict(X_test[both]), 100, 200))
)

In [13]:
#submissionV4.to_csv('../submissions/submissionV4.csv', index = False)

**Using difference in gold/xp, and using team totals**

In [14]:
# Blue-minus-red differences of the team totals.
X_train['goldDifference'] = X_train['blueGold'].sub(X_train['redGold'])
X_train['xpDifference'] = X_train['blueXP'].sub(X_train['redXP'])

# Re-point the column groups at the difference features for the fits that follow.
gold = ['goldDifference']
xp = ['xpDifference']
both = gold + xp

**Just Gold**

In [15]:
# Fit on the columns currently listed in `gold`; the target is True when team 100 won.
logreg = LogisticRegression().fit(X_train[gold], y_train['winner'].eq(100))

# Training accuracy.
logreg.score(X_train[gold], y_train['winner'].eq(100))

0.7055

**Just xp**

In [16]:
# Fit on the columns currently listed in `xp`; the target is True when team 100 won.
logreg = LogisticRegression().fit(X_train[xp], y_train['winner'].eq(100))

# Training accuracy.
logreg.score(X_train[xp], y_train['winner'].eq(100))

0.6715

**Gold/xp**

In [17]:
# Fit on the columns currently listed in `both`; the target is True when team 100 won.
logreg = LogisticRegression().fit(X_train[both], y_train['winner'].eq(100))

# Training accuracy.
logreg.score(X_train[both], y_train['winner'].eq(100))

0.70825

**Adding kills to gold/xp totals**

In [18]:
df = []

for matchId in range(0, 8000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/train_timelines/train_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # One indicator row per CHAMPION_KILL event; rows are summed per match below,
    # so the order in which frames are visited does not affect the result.
    for frame in data['frames']:
        for event in frame['events']:
            if event['type'] != 'CHAMPION_KILL':
                continue
            # A killerId below 6 is credited to blue, anything else to red.
            # NOTE(review): a killerId of 0, if the data contains any, is counted
            # as a blue kill by this test - confirm that is intended.
            blueKill = event['killerId'] < 6
            df.append({
                'matchId' : matchId,
                'blueKills' : 1 if blueKill else 0,
                'redKills' : 0 if blueKill else 1
            })

# Matches with no kill events produce no rows here; the later left merge fills them with 0.
killsTrain = pd.DataFrame(df).groupby('matchId').sum().reset_index()

In [19]:
# Left merge keeps every match; matches absent from killsTrain get 0 kills via fillna.
X_train = pd.merge(X_train, killsTrain, how = 'left', on = ['matchId']).fillna(0)

kills = [
    'blueGold', 'redGold',
    'blueXP', 'redXP',
    'blueKills', 'redKills'
]

**Not using difference in gold/xp/kills, and using team totals.**

In [20]:
# Fit on the gold/xp/kills team totals; the target is True when team 100 won.
logreg = LogisticRegression().fit(X_train[kills], y_train['winner'].eq(100))

# Training accuracy.
logreg.score(X_train[kills], y_train['winner'].eq(100))

0.70975

**Adding dragons to gold/xp/kills totals**

In [21]:
df = []

for matchId in range(0, 8000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/train_timelines/train_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # One indicator row per dragon kill; rows are summed per match below,
    # so the order in which frames are visited does not affect the result.
    for frame in data['frames']:
        for event in frame['events']:
            if event['type'] != 'ELITE_MONSTER_KILL':
                continue
            if event['monsterType'] != 'DRAGON':
                continue
            # killerTeamId 100 is credited to blue, any other value to red.
            blueDragon = event['killerTeamId'] == 100
            df.append({
                'matchId' : matchId,
                'blueDragons' : 1 if blueDragon else 0,
                'redDragons' : 0 if blueDragon else 1
            })

# Matches with no dragon kills produce no rows here; the later left merge fills them with 0.
dragonsTrain = pd.DataFrame(df).groupby('matchId').sum().reset_index()

In [22]:
# Left merge keeps every match; matches absent from dragonsTrain get 0 dragons via fillna.
X_train = pd.merge(X_train, dragonsTrain, how = 'left', on = ['matchId']).fillna(0)

dragons = [
    'blueGold', 'redGold',
    'blueXP', 'redXP',
    'blueKills', 'redKills',
    'blueDragons', 'redDragons'
]

**Not using difference in gold/xp/kills/dragons, and using team totals.**

In [23]:
# Fit on the gold/xp/kills/dragons team totals; the target is True when team 100 won.
logreg = LogisticRegression().fit(X_train[dragons], y_train['winner'].eq(100))

# Training accuracy.
logreg.score(X_train[dragons], y_train['winner'].eq(100))

0.70975

**Trying ridge (L2-regularized) logistic regression using gold/xp/kills/dragons - submissionV5**

In [24]:
# Pairwise interaction terms -> drop constant columns -> standardize -> logistic regression.
pipe = Pipeline([
    ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
    ('vt', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter = 10000)),
])

pipe.fit(X_train[dragons], y_train['winner'].eq(100))

Pipeline(steps=[('pf',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
                ('vt', VarianceThreshold()), ('scaler', StandardScaler()),
                ('logistic', LogisticRegression(max_iter=10000))])

In [25]:
# Training accuracy of the fitted pipeline.
pipe.score(X_train[dragons], y_train['winner'].eq(100))

0.71925

In [26]:
df = []

for matchId in range(8000, 10000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/test_timelines/test_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # One indicator row per CHAMPION_KILL event; rows are summed per match below,
    # so the order in which frames are visited does not affect the result.
    for frame in data['frames']:
        for event in frame['events']:
            if event['type'] != 'CHAMPION_KILL':
                continue
            # A killerId below 6 is credited to blue, anything else to red.
            # NOTE(review): a killerId of 0, if the data contains any, is counted
            # as a blue kill by this test - confirm that is intended.
            blueKill = event['killerId'] < 6
            df.append({
                'matchId' : matchId,
                'blueKills' : 1 if blueKill else 0,
                'redKills' : 0 if blueKill else 1
            })

# Matches with no kill events produce no rows here; the later left merge fills them with 0.
killsTest = pd.DataFrame(df).groupby('matchId').sum().reset_index()

In [27]:
# Left merge keeps every test match; matches absent from killsTest get 0 kills via fillna.
X_test = pd.merge(X_test, killsTest, how = 'left', on = ['matchId']).fillna(0)

In [28]:
df = []

for matchId in range(8000, 10000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/test_timelines/test_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # One indicator row per dragon kill; rows are summed per match below,
    # so the order in which frames are visited does not affect the result.
    for frame in data['frames']:
        for event in frame['events']:
            if event['type'] != 'ELITE_MONSTER_KILL':
                continue
            if event['monsterType'] != 'DRAGON':
                continue
            # killerTeamId 100 is credited to blue, any other value to red.
            blueDragon = event['killerTeamId'] == 100
            df.append({
                'matchId' : matchId,
                'blueDragons' : 1 if blueDragon else 0,
                'redDragons' : 0 if blueDragon else 1
            })

# Matches with no dragon kills produce no rows here; the later left merge fills them with 0.
dragonsTest = pd.DataFrame(df).groupby('matchId').sum().reset_index()

In [29]:
# Left merge keeps every test match; matches absent from dragonsTest get 0 dragons via fillna.
X_test = pd.merge(X_test, dragonsTest, how = 'left', on = ['matchId']).fillna(0)

In [30]:
# Map the pipeline's boolean prediction (True = team 100 wins) to the team id 100 / 200.
submissionV5 = (
    X_test[['matchId']]
    .assign(winner = np.where(pipe.predict(X_test[dragons]), 100, 200))
)

In [31]:
#submissionV5.to_csv('../submissions/submissionV5.csv', index = False)

**Adding champion points to gold/xp/kills/dragons**

In [32]:
# Mastery table: joined below on ('summonerId', 'championId') to obtain 'championPoints'.
champMastery = pd.read_csv('../data/champion_mastery.csv')
# Participant tables for the train and test matches (matchId, teamId, summonerId, championId, summonerLevel are read later).
participantsTrain = pd.read_csv('../data/participants_train.csv')
participantsTest = pd.read_csv('../data/participants_test.csv')

In [33]:
# Total champion mastery points per (match, team). Participants without a
# mastery row come out of the left merge as NaN and are counted as 0.
champPoints_df = (
    participantsTrain[['matchId', 'teamId', 'summonerId', 'championId']]
    .merge(
        champMastery[['summonerId', 'championId', 'championPoints']],
        on = ['summonerId', 'championId'],
        how = 'left'
    )
    .fillna(0)
    .loc[:, ['matchId', 'teamId', 'championPoints']]
    .groupby(['matchId', 'teamId'])
    .sum()
    .reset_index()
)

# Split into one column per side (teamId 100 -> blue, 200 -> red).
blueChampPoints = (
    champPoints_df
    .loc[champPoints_df['teamId'].eq(100), ['matchId', 'championPoints']]
    .rename(columns = {'championPoints' : 'blueChampPoints'})
)

redChampPoints = (
    champPoints_df
    .loc[champPoints_df['teamId'].eq(200), ['matchId', 'championPoints']]
    .rename(columns = {'championPoints' : 'redChampPoints'})
)

# Attach both columns to the training features, keeping every match.
X_train = pd.merge(X_train, blueChampPoints, how = 'left', on = 'matchId')
X_train = pd.merge(X_train, redChampPoints, how = 'left', on = 'matchId')

**Trying normal/ridge regression using gold/xp/kills/dragons/champion points - submissionV7**

In [34]:
# Feature set: team totals plus champion mastery points.
champPoints = [
    'blueGold',
    'redGold',
    'blueXP',
    'redXP',
    'blueKills',
    'redKills',
    'blueDragons',
    'redDragons',
    'blueChampPoints',
    'redChampPoints',
]

In [35]:
# Plain logistic regression on the champPoints feature set (no scaling step).
logreg = LogisticRegression().fit(X_train[champPoints], y_train['winner'].eq(100))

# Training accuracy.
logreg.score(X_train[champPoints], y_train['winner'].eq(100))

0.707375

In [36]:
# Pairwise interaction terms -> drop constant columns -> standardize -> logistic regression.
pipe = Pipeline([
    ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
    ('vt', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter = 10000)),
])

pipe.fit(X_train[champPoints], y_train['winner'].eq(100))

# Training accuracy of the fitted pipeline.
pipe.score(X_train[champPoints], y_train['winner'].eq(100))

0.72125

In [37]:
# Total champion mastery points per (match, team) for the test matches.
# Participants without a mastery row come out of the left merge as NaN and are counted as 0.
champPoints_df = (
    participantsTest[['matchId', 'teamId', 'summonerId', 'championId']]
    .merge(
        champMastery[['summonerId', 'championId', 'championPoints']],
        on = ['summonerId', 'championId'],
        how = 'left'
    )
    .fillna(0)
    .loc[:, ['matchId', 'teamId', 'championPoints']]
    .groupby(['matchId', 'teamId'])
    .sum()
    .reset_index()
)

# Split into one column per side (teamId 100 -> blue, 200 -> red).
blueChampPoints = (
    champPoints_df
    .loc[champPoints_df['teamId'].eq(100), ['matchId', 'championPoints']]
    .rename(columns = {'championPoints' : 'blueChampPoints'})
)

redChampPoints = (
    champPoints_df
    .loc[champPoints_df['teamId'].eq(200), ['matchId', 'championPoints']]
    .rename(columns = {'championPoints' : 'redChampPoints'})
)

# Attach both columns to the test features, keeping every match.
X_test = pd.merge(X_test, blueChampPoints, how = 'left', on = 'matchId')
X_test = pd.merge(X_test, redChampPoints, how = 'left', on = 'matchId')

In [38]:
# Map the pipeline's boolean prediction (True = team 100 wins) to the team id 100 / 200.
submissionV7 = (
    X_test[['matchId']]
    .assign(winner = np.where(pipe.predict(X_test[champPoints]), 100, 200))
)

In [39]:
#submissionV7.to_csv('../submissions/submissionV7.csv', index = False)

**Adding summoner level to gold/xp/kills/dragons/champion points**

In [40]:
# Sum of summoner levels per (match, team).
sumLevel_df = (
    participantsTrain[['matchId', 'teamId', 'summonerLevel']]
    .groupby(['matchId', 'teamId'])
    .sum()
    .reset_index()
)

# Split into one column per side (teamId 100 -> blue, 200 -> red).
blueSumLevel = (
    sumLevel_df
    .loc[sumLevel_df['teamId'].eq(100), ['matchId', 'summonerLevel']]
    .rename(columns = {'summonerLevel' : 'blueSumLevel'})
)

redSumLevel = (
    sumLevel_df
    .loc[sumLevel_df['teamId'].eq(200), ['matchId', 'summonerLevel']]
    .rename(columns = {'summonerLevel' : 'redSumLevel'})
)

# Attach both columns to the training features, keeping every match.
X_train = pd.merge(X_train, blueSumLevel, how = 'left', on = 'matchId')
X_train = pd.merge(X_train, redSumLevel, how = 'left', on = 'matchId')

**Trying normal/ridge regression using gold/xp/kills/dragons/champion points/summoner level**

In [41]:
# Feature set: team totals, champion mastery points and summoner levels.
sumLevel = [
    'blueGold',
    'redGold',
    'blueXP',
    'redXP',
    'blueKills',
    'redKills',
    'blueDragons',
    'redDragons',
    'blueChampPoints',
    'redChampPoints',
    'blueSumLevel',
    'redSumLevel',
]

In [42]:
# Plain logistic regression on the sumLevel feature set (no scaling step).
logreg = LogisticRegression().fit(X_train[sumLevel], y_train['winner'].eq(100))

# Training accuracy.
logreg.score(X_train[sumLevel], y_train['winner'].eq(100))

0.70775

In [43]:
# Pairwise interaction terms -> drop constant columns -> standardize -> logistic regression.
pipe = Pipeline([
    ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
    ('vt', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter = 10000)),
])

pipe.fit(X_train[sumLevel], y_train['winner'].eq(100))

# Training accuracy of the fitted pipeline.
pipe.score(X_train[sumLevel], y_train['winner'].eq(100))

0.718

**The best model seems to be using ridge regression with gold/xp/kills/dragons/champion points as predictors. Now to see if adding interaction terms can help improve the model.**

In [44]:
# Hand-built blue x red product for each stat.
X_train['goldInteraction'] = X_train['blueGold'].mul(X_train['redGold'])
X_train['xpInteraction'] = X_train['blueXP'].mul(X_train['redXP'])
X_train['killsInteraction'] = X_train['blueKills'].mul(X_train['redKills'])
X_train['dragonsInteraction'] = X_train['blueDragons'].mul(X_train['redDragons'])
X_train['champPointsInteraction'] = X_train['blueChampPoints'].mul(X_train['redChampPoints'])

**Trying normal/ridge regression using gold/xp/kills/dragons/champion points plus interaction terms**

In [45]:
# Feature set: each blue/red pair followed by its product column.
interactions = [
    'blueGold',
    'redGold',
    'goldInteraction',
    'blueXP',
    'redXP',
    'xpInteraction',
    'blueKills',
    'redKills',
    'killsInteraction',
    'blueDragons',
    'redDragons',
    'dragonsInteraction',
    'blueChampPoints',
    'redChampPoints',
    'champPointsInteraction',
]

In [46]:
# Plain logistic regression on the raw, unscaled interaction feature set.
# NOTE(review): the product columns are far larger in magnitude than the other
# features, which may be why this fit scores close to chance - confirm.
logreg = LogisticRegression().fit(X_train[interactions], y_train['winner'].eq(100))

# Training accuracy.
logreg.score(X_train[interactions], y_train['winner'].eq(100))

0.491125

In [47]:
# Pairwise interaction terms -> drop constant columns -> standardize -> logistic regression.
pipe = Pipeline([
    ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
    ('vt', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter = 10000)),
])

pipe.fit(X_train[interactions], y_train['winner'].eq(100))

# Training accuracy of the fitted pipeline.
pipe.score(X_train[interactions], y_train['winner'].eq(100))

0.721125

**Trying normal/ridge regression using gold/xp/kills/dragons plus interaction terms**

In [48]:
# Feature set: as `interactions`, without champion points and without the dragons product.
interactionsV2 = [
    'blueGold',
    'redGold',
    'goldInteraction',
    'blueXP',
    'redXP',
    'xpInteraction',
    'blueKills',
    'redKills',
    'killsInteraction',
    'blueDragons',
    'redDragons',
]

In [49]:
# Plain logistic regression on the raw, unscaled interactionsV2 feature set.
logreg = LogisticRegression().fit(X_train[interactionsV2], y_train['winner'].eq(100))

# Training accuracy.
logreg.score(X_train[interactionsV2], y_train['winner'].eq(100))

0.508875

In [50]:
# Pairwise interaction terms -> drop constant columns -> standardize -> logistic regression.
pipe = Pipeline([
    ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
    ('vt', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter = 10000)),
])

pipe.fit(X_train[interactionsV2], y_train['winner'].eq(100))

# Training accuracy of the fitted pipeline.
pipe.score(X_train[interactionsV2], y_train['winner'].eq(100))

0.721625

**Seeing if a simpler model may be better, using lasso (L1-regularized) logistic regression**

In [51]:
# Same preprocessing as the earlier pipelines, but the final step uses an L1 penalty
# (saga solver, C = 0.5).
pipe = Pipeline([
    ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
    ('vt', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(C = .5, penalty = 'l1', solver = 'saga', max_iter = 10000)),
])

pipe.fit(X_train[interactions], y_train['winner'].eq(100))

Pipeline(steps=[('pf',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
                ('vt', VarianceThreshold()), ('scaler', StandardScaler()),
                ('logistic',
                 LogisticRegression(C=0.5, max_iter=10000, penalty='l1',
                                    solver='saga'))])

In [52]:
from sklearn.model_selection import GridSearchCV

In [53]:
# gs = GridSearchCV(estimator = pipe, 
#                  param_grid = {'logistic__C': [1, 0.5, 0.1, 0.05, 0.01]},
#                  scoring = 'accuracy')

In [54]:
#gs.fit(X_train[interactions], y_train['winner'] == 100)

In [55]:
#gs.best_params_

#best param is C = 0.5

In [56]:
# Training accuracy of the L1-penalised pipeline.
pipe.score(X_train[interactions], y_train['winner'].eq(100))

0.720625

**Accounting for games that ended prior to the 10 minute mark using gold/xp/kills/dragons as predictors and ridge regression - submissionV8**

In [57]:
df = []

for matchId in range(0, 8000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/train_timelines/train_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Timelines without exactly 11 frames are set aside; their winner is read
    # from the 'winningTeam' field of the last event in the last frame.
    if len(data['frames']) != 11:
        df.append({
            'matchId': matchId,
            'winner' : data['frames'][-1]['events'][-1]['winningTeam']
        })

underTen_df = pd.DataFrame(df)

In [58]:
# Ids of the matches whose winner was read directly from the timeline.
underTen = underTen_df['matchId']

# Keep only the remaining matches for model fitting (features and labels alike).
X_trainOverTen = X_train[~X_train['matchId'].isin(underTen)]

y_trainOverTen = y_train[~y_train['matchId'].isin(underTen)]

In [59]:
# Pairwise interaction terms -> drop constant columns -> standardize -> logistic regression,
# fitted only on the matches that were not set aside in underTen.
pipe = Pipeline([
    ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
    ('vt', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter = 10000)),
])

pipe.fit(X_trainOverTen[dragons], y_trainOverTen['winner'].eq(100))

Pipeline(steps=[('pf',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
                ('vt', VarianceThreshold()), ('scaler', StandardScaler()),
                ('logistic', LogisticRegression(max_iter=10000))])

In [60]:
# Model predictions for the fitted-on matches, mapped to the team id 100 / 200.
predicitions = (
    X_trainOverTen[['matchId']]
    .assign(winner = np.where(pipe.predict(X_trainOverTen[dragons]), 100, 200))
)

In [61]:
# Training accuracy of model predictions combined with the timeline-derived winners.
# NOTE(review): the two arguments are compared positionally, so this assumes
# y_train is already ordered by matchId - confirm.
accuracy_score(
    y_train['winner'].eq(100),
    pd.concat([predicitions, underTen_df])
    .sort_values(by = 'matchId')['winner']
    .eq(100)
)

0.722125

In [62]:
df = []

for matchId in range(8000, 10000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/test_timelines/test_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Timelines without exactly 11 frames are set aside; their winner is read
    # from the 'winningTeam' field of the last event in the last frame.
    if len(data['frames']) != 11:
        df.append({
            'matchId': matchId,
            'winner' : data['frames'][-1]['events'][-1]['winningTeam']
        })

# Overwrites the training-set underTen_df with the test-set one.
underTen_df = pd.DataFrame(df)

In [63]:
# Test matches that still need a model prediction.
X_testOverTen = X_test[~X_test['matchId'].isin(underTen_df['matchId'])]

In [64]:
# Pipeline predictions for the remaining test matches, mapped to the team id 100 / 200.
predicitions = (
    X_testOverTen[['matchId']]
    .assign(winner = np.where(pipe.predict(X_testOverTen[dragons]), 100, 200))
)

In [65]:
# Stack model predictions and timeline-derived winners, ordered by matchId.
submissionV8 = pd.concat([predicitions, underTen_df]).sort_values('matchId')

In [66]:
#submissionV8.to_csv('../submissions/submissionV8.csv', index = False)

**submissionV8 seemed to be the best model. Now seeing if splitting it out by lane has any impact on the model.**

# Random Forest

In [74]:
df = []

for matchId in range(0, 8000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/train_timelines/train_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Timelines without exactly 11 frames are set aside; their winner is read
    # from the 'winningTeam' field of the last event in the last frame.
    if len(data['frames']) != 11:
        df.append({
            'matchId': matchId,
            'winner' : data['frames'][-1]['events'][-1]['winningTeam']
        })

# Rebuilt from the training timelines (the previous section left the test-set version in this name).
underTen_df = pd.DataFrame(df)

In [67]:
from sklearn.ensemble import RandomForestClassifier

In [76]:
# Random forest with trees capped at depth 4; random_state fixed so reruns give the same forest.
clf = RandomForestClassifier(max_depth=4, random_state=0)

In [77]:
# Fit on the matches that were not set aside; the target is True when team 100 won.
clf.fit(X_trainOverTen[dragons], y_trainOverTen['winner'].eq(100))

RandomForestClassifier(max_depth=4, random_state=0)

In [78]:
# Random forest predictions for the fitted-on matches, mapped to the team id 100 / 200.
pred_clf = (
    X_trainOverTen[['matchId']]
    .assign(winner = np.where(clf.predict(X_trainOverTen[dragons]), 100, 200))
)

In [79]:
# Training accuracy of forest predictions combined with the timeline-derived winners.
# NOTE(review): the two arguments are compared positionally, so this assumes
# y_train is already ordered by matchId - confirm.
accuracy_score(
    y_train['winner'].eq(100),
    pd.concat([pred_clf, underTen_df])
    .sort_values(by = 'matchId')['winner']
    .eq(100)
)

0.7265

### Random Forest Submission

In [80]:
df = []

for matchId in range(8000, 10000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/test_timelines/test_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Timelines without exactly 11 frames are set aside; their winner is read
    # from the 'winningTeam' field of the last event in the last frame.
    if len(data['frames']) != 11:
        df.append({
            'matchId': matchId,
            'winner' : data['frames'][-1]['events'][-1]['winningTeam']
        })

# Overwrites the training-set underTen_df with the test-set one.
underTen_df = pd.DataFrame(df)

# Test matches that still need a model prediction.
X_testOverTen = X_test.loc[~X_test['matchId'].isin(underTen_df['matchId'])]

# Random forest predictions, mapped to the team id 100 / 200.
predicitions = X_testOverTen[['matchId']].copy()
predicitions['win'] = clf.predict(X_testOverTen[dragons])
predicitions['winner'] = np.where(predicitions['win'] == True, 100, 200)
predicitions = predicitions[['matchId', 'winner']]

# Stack model predictions and timeline-derived winners, ordered by matchId.
submission_v1 = pd.concat([predicitions, underTen_df]).sort_values(by = 'matchId')

In [82]:
#submission_v1.to_csv('../submissions/submission_v1.csv', index = False)

### random forest model overtrained, need to tweak other hyperparameters

# Neural Network

In [83]:
from sklearn.neural_network import MLPClassifier

In [88]:
df = []

for matchId in range(0, 8000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/train_timelines/train_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Timelines without exactly 11 frames are set aside; their winner is read
    # from the 'winningTeam' field of the last event in the last frame.
    if len(data['frames']) != 11:
        df.append({
            'matchId': matchId,
            'winner' : data['frames'][-1]['events'][-1]['winningTeam']
        })

# Rebuilt from the training timelines (the previous section left the test-set version in this name).
underTen_df = pd.DataFrame(df)

In [106]:
# Neural network classifier with one hidden layer of 100 ReLU units.
# random_state is fixed (as for the other classifiers in this notebook) so that
# weight initialisation is reproducible and reruns give the same score.
clf_1 = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', random_state=0)

# Fit on the matches that were not set aside; the target is True when team 100 won.
clf_1.fit(X_trainOverTen[dragons], y_trainOverTen['winner'] == 100)

# Predict on the same training rows (this measures training accuracy, not new data)
# and map the boolean prediction to the team id 100 / 200.
pred_clf_1 = X_trainOverTen[['matchId']].copy()
pred_clf_1['win'] = clf_1.predict(X_trainOverTen[dragons])
pred_clf_1['winner'] = np.where(pred_clf_1['win'] == True, 100, 200)
pred_clf_1 = pred_clf_1[['matchId', 'winner']]

In [107]:
# Training accuracy of network predictions combined with the timeline-derived winners.
# NOTE(review): the two arguments are compared positionally, so this assumes
# y_train is already ordered by matchId - confirm.
accuracy_score(
    y_train['winner'].eq(100),
    pd.concat([pred_clf_1, underTen_df])
    .sort_values(by = 'matchId')['winner']
    .eq(100)
)

0.708

# Gradient Boosting

In [130]:
from sklearn.ensemble import GradientBoostingClassifier

In [131]:
df = []

for matchId in range(0, 8000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/train_timelines/train_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Timelines without exactly 11 frames are set aside; their winner is read
    # from the 'winningTeam' field of the last event in the last frame.
    if len(data['frames']) != 11:
        df.append({
            'matchId': matchId,
            'winner' : data['frames'][-1]['events'][-1]['winningTeam']
        })

underTen_df = pd.DataFrame(df)

In [132]:
# Gradient boosting with 100 depth-1 trees and learning rate 1.0; seed fixed for reproducibility.
clf_2 = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0
)

# Fit on the matches that were not set aside; the target is True when team 100 won.
clf_2.fit(X_trainOverTen[dragons], y_trainOverTen['winner'].eq(100))

# Predictions on the same training rows, mapped to the team id 100 / 200.
pred_clf_2 = (
    X_trainOverTen[['matchId']]
    .assign(winner = np.where(clf_2.predict(X_trainOverTen[dragons]), 100, 200))
)

In [133]:
# Training accuracy of boosting predictions combined with the timeline-derived winners.
# NOTE(review): the two arguments are compared positionally, so this assumes
# y_train is already ordered by matchId - confirm.
accuracy_score(
    y_train['winner'].eq(100),
    pd.concat([pred_clf_2, underTen_df])
    .sort_values(by = 'matchId')['winner']
    .eq(100)
)

0.728125

### Gradient Boosting Submission

In [134]:
df = []

for matchId in range(8000, 10000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/test_timelines/test_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Timelines without exactly 11 frames are set aside; their winner is read
    # from the 'winningTeam' field of the last event in the last frame.
    if len(data['frames']) != 11:
        df.append({
            'matchId': matchId,
            'winner' : data['frames'][-1]['events'][-1]['winningTeam']
        })

# Kept under its own name so the training-set underTen_df is not overwritten.
underTenTest_df = pd.DataFrame(df)

In [135]:
# Test matches that still need a model prediction.
X_testOverTen = X_test[~X_test['matchId'].isin(underTenTest_df['matchId'])]

In [136]:
# Gradient boosting predictions for the remaining test matches, mapped to 100 / 200.
predicitions = (
    X_testOverTen[['matchId']]
    .assign(winner = np.where(clf_2.predict(X_testOverTen[dragons]), 100, 200))
)

In [137]:
# Stack model predictions and timeline-derived winners, ordered by matchId.
# NOTE(review): this reuses the name submissionV8 from the logistic-regression section.
submissionV8 = pd.concat([predicitions, underTenTest_df]).sort_values('matchId')

In [139]:
# Display (rows, columns) of the combined submission frame.
submissionV8.shape

(2000, 2)

In [140]:
#submissionV8.to_csv('../submissions/submissionV8.csv', index = False)

# K-Nearest Neighbors

In [145]:
from sklearn.neighbors import KNeighborsClassifier

In [146]:
df = []

for matchId in range(0, 8000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/train_timelines/train_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Timelines without exactly 11 frames are set aside; their winner is read
    # from the 'winningTeam' field of the last event in the last frame.
    if len(data['frames']) != 11:
        df.append({
            'matchId': matchId,
            'winner' : data['frames'][-1]['events'][-1]['winningTeam']
        })

underTen_df = pd.DataFrame(df)

In [148]:
# K-nearest-neighbours classifier voting over the 5 closest training rows.
# NOTE(review): the features are passed unscaled, so distances are likely
# dominated by the largest-magnitude columns - confirm this is intended.
clf_3 = KNeighborsClassifier(n_neighbors=5)

# Fit on the matches that were not set aside; the target is True when team 100 won.
clf_3.fit(X_trainOverTen[dragons], y_trainOverTen['winner'].eq(100))

# Predictions on the same training rows, mapped to the team id 100 / 200.
# Each row is among its own neighbours here, so the resulting accuracy is optimistic.
pred_clf_3 = (
    X_trainOverTen[['matchId']]
    .assign(winner = np.where(clf_3.predict(X_trainOverTen[dragons]), 100, 200))
)

In [149]:
# Training accuracy of KNN predictions combined with the timeline-derived winners.
# NOTE(review): the two arguments are compared positionally, so this assumes
# y_train is already ordered by matchId - confirm.
accuracy_score(
    y_train['winner'].eq(100),
    pd.concat([pred_clf_3, underTen_df])
    .sort_values(by = 'matchId')['winner']
    .eq(100)
)

0.77275

### KNN submission

In [150]:
df = []

for matchId in range(8000, 10000):
    # `with` closes each timeline file once parsed instead of leaking the handle.
    with open('../data/test_timelines/test_timelines/timeline_{matchIds}.json'.format(matchIds = matchId)) as f:
        data = json.load(f)

    # Timelines without exactly 11 frames are set aside; their winner is read
    # from the 'winningTeam' field of the last event in the last frame.
    if len(data['frames']) != 11:
        df.append({
            'matchId': matchId,
            'winner' : data['frames'][-1]['events'][-1]['winningTeam']
        })

# Kept under its own name so the training-set underTen_df is not overwritten.
underTenTest_df = pd.DataFrame(df)

In [151]:
# Test matches that still need a model prediction.
X_testOverTen = X_test[~X_test['matchId'].isin(underTenTest_df['matchId'])]

In [152]:
# KNN predictions for the remaining test matches, mapped to the team id 100 / 200.
predicitions = (
    X_testOverTen[['matchId']]
    .assign(winner = np.where(clf_3.predict(X_testOverTen[dragons]), 100, 200))
)

In [153]:
# Stack model predictions and timeline-derived winners, ordered by matchId.
submissionV9 = pd.concat([predicitions, underTenTest_df]).sort_values('matchId')

In [154]:
# Display (rows, columns) of the combined submission frame.
submissionV9.shape

(2000, 2)

In [155]:
#submissionV9.to_csv('../submissions/submissionV9.csv', index = False)

# Web Scrape from Metasrc

In [166]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url_base = 'https://www.metasrc.com/5v5/na/{}/stats?ranks=diamond'

patches = ['13.1', '13.2', '13.4', '13.5']
dfs = []

for patch in patches:
    url = url_base.format(patch)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(url, timeout = 30)
    if not response.ok:
        # Report HTTP failures separately from a page that loaded without the table.
        print('Request failed for patch {} (HTTP {})'.format(patch, response.status_code))
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # The stats live in the <table> whose class is 'stats-table'.
    table = soup.find('table', {'class': 'stats-table'})
    if not table:
        print('Table not found for patch {}'.format(patch))
        continue

    # Column headers come from the <th> cells.
    headers = [th.text.strip() for th in table.find_all('th')]

    # One list of cell texts per <tr>; rows without any <td> (e.g. the header row) are skipped.
    rows = []
    for tr in table.find_all('tr'):
        row = [td.text.strip() for td in tr.find_all('td')]
        if row:
            rows.append(row)

    df = pd.DataFrame(rows, columns=headers)
    df['patch'] = patch
    dfs.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame if nothing was scraped.
metasrc = pd.concat(dfs) if dfs else pd.DataFrame()

In [167]:
# Preview the first rows of the scraped table.
metasrc.head()

Unnamed: 0,Name,Role,Tier,Score,Trend,Win %,Role %,Pick %,Ban %,KDA,patch
0,AatroxAatrox,TOP,Strong / S,57.89,-28.16,49.40%,91.11%,6.69%,10.95%,1.9,13.1
1,AhriAhri,MID,Strong / S,54.77,1.21,50.23%,94.34%,5.19%,1.06%,2.88,13.1
2,AkaliAkali,MID,Good / A,54.64,-3.7,47.22%,79.69%,6.64%,7.10%,2.49,13.1
3,AkaliAkali,TOP,Weak / C,40.62,3.48,45.87%,20.20%,1.87%,7.10%,2.07,13.1
4,AkshanAkshan,MID,Good / A,50.65,-0.02,52.45%,70.39%,3.00%,4.05%,2.35,13.1


In [168]:
#metasrc.to_csv('../data/metasrc.csv', index = False)

In [164]:
import requests
from bs4 import BeautifulSoup

# Fetch one stats page and list the CSS classes of every <table> on it,
# to find out which class identifies the stats table.
url = 'https://www.metasrc.com/5v5/na/12.23/stats?ranks=diamond'
# A timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(url, timeout = 30)

soup = BeautifulSoup(response.text, 'html.parser')

tables = soup.find_all('table')
for table in tables:
    print(table.get('class'))


['stats-table', '_fizomx']
