In [13]:
#Read in Libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os, json
import glob
from pandas.io.json import json_normalize
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [14]:
#Read in Datasets
X_train = pd.read_csv('../kaggle_lol-koopas/participants_train.csv')
X_test = pd.read_csv('../kaggle_lol-koopas/participants_test.csv')
y_train = pd.read_csv('../kaggle_lol-koopas/train_winners.csv')
champ_mastery = pd.read_csv('../kaggle_lol-koopas/champion_mastery.csv')

In [15]:
#Checkout the X_train set
X_train.head()

Unnamed: 0,matchId,teamId,participantId,summonerId,summonerLevel,championName,championId
0,0,100,1,0,303,Mordekaiser,82
1,0,100,2,1,616,Sylas,517
2,0,100,3,2,667,Lissandra,127
3,0,100,4,3,860,Caitlyn,51
4,0,100,5,4,325,Morgana,25


In [16]:
#merge with the champion_mastery data
X_train = X_train.merge(
    right = champ_mastery[['summonerId', 'championId', 'championPoints']], 
    how = 'left', 
    on = ['summonerId', 'championId'])

In [17]:
#Checkout the X_train dataset
X_train

Unnamed: 0,matchId,teamId,participantId,summonerId,summonerLevel,championName,championId,championPoints
0,0,100,1,0,303,Mordekaiser,82,61890.0
1,0,100,2,1,616,Sylas,517,37664.0
2,0,100,3,2,667,Lissandra,127,40303.0
3,0,100,4,3,860,Caitlyn,51,96304.0
4,0,100,5,4,325,Morgana,25,608721.0
...,...,...,...,...,...,...,...,...
79995,7999,200,6,13979,595,Yorick,83,48037.0
79996,7999,200,7,39643,38,Volibear,106,135.0
79997,7999,200,8,5570,498,Anivia,34,49027.0
79998,7999,200,9,10228,733,Twitch,29,174089.0


In [18]:
#Fill in Null values with 0
X_train['championPoints'] = X_train['championPoints'].fillna(0)

In [19]:
#drop duplicates in the matchId column and list the unique values in a container
matchIds = X_train['matchId'].drop_duplicates().reset_index(drop = True).values.tolist()

In [20]:
#Run a for loop that finds the difference in points 
df = []

for matchId in tqdm(matchIds):
    game = X_train.loc[X_train['matchId'] == matchId].reset_index(drop = True)
    
    df.append({
        'matchId' : game['matchId'][0],
#         'top_sumLevel_diff' : (
#             game.loc[game['participantId'] == 1]['summonerLevel'].reset_index(drop = True)[0] - 
#             game.loc[game['participantId'] == 6]['summonerLevel'].reset_index(drop = True)[0]
#         ),
        'top_champPoints_diff' : (
            game.loc[game['participantId'] == 1]['championPoints'].reset_index(drop = True)[0] - 
            game.loc[game['participantId'] == 6]['championPoints'].reset_index(drop = True)[0]
        ),
#         'jungle_sumLevel_diff' : (
#             game.loc[game['participantId'] == 2]['summonerLevel'].reset_index(drop = True)[0] - 
#             game.loc[game['participantId'] == 7]['summonerLevel'].reset_index(drop = True)[0]
#         ),
        'jungle_champPoints_diff' : (
            game.loc[game['participantId'] == 2]['championPoints'].reset_index(drop = True)[0] - 
            game.loc[game['participantId'] == 7]['championPoints'].reset_index(drop = True)[0]
        ),
#         'mid_sumLevel_diff' : (
#             game.loc[game['participantId'] == 3]['summonerLevel'].reset_index(drop = True)[0] - 
#             game.loc[game['participantId'] == 8]['summonerLevel'].reset_index(drop = True)[0]
#         ),
        'mid_champPoints_diff' : (
            game.loc[game['participantId'] == 3]['championPoints'].reset_index(drop = True)[0] - 
            game.loc[game['participantId'] == 8]['championPoints'].reset_index(drop = True)[0]
        ),
#         'bot_sumLevel_diff' : (
#             game.loc[game['participantId'] == 4]['summonerLevel'].reset_index(drop = True)[0] - 
#             game.loc[game['participantId'] == 9]['summonerLevel'].reset_index(drop = True)[0]
#         ),
        'bot_champPoints_diff' : (
            game.loc[game['participantId'] == 4]['championPoints'].reset_index(drop = True)[0] - 
            game.loc[game['participantId'] == 9]['championPoints'].reset_index(drop = True)[0]
        ),
#         'support_sumLevel_diff' : (
#             game.loc[game['participantId'] == 5]['summonerLevel'].reset_index(drop = True)[0] - 
#             game.loc[game['participantId'] == 10]['summonerLevel'].reset_index(drop = True)[0]
#         ),
        'support_champPoints_diff' : (
            game.loc[game['participantId'] == 5]['championPoints'].reset_index(drop = True)[0] - 
            game.loc[game['participantId'] == 10]['championPoints'].reset_index(drop = True)[0]
        )
    })

100%|█████████████████████████████████████████████████████████████████████████████| 8000/8000 [00:35<00:00, 228.45it/s]


In [21]:
#checkout the dataframe
df

[{'matchId': 0,
  'top_champPoints_diff': -1118162.0,
  'jungle_champPoints_diff': -7551.0,
  'mid_champPoints_diff': -78528.0,
  'bot_champPoints_diff': 79673.0,
  'support_champPoints_diff': 519140.0},
 {'matchId': 1,
  'top_champPoints_diff': -372300.0,
  'jungle_champPoints_diff': 466745.0,
  'mid_champPoints_diff': 588441.0,
  'bot_champPoints_diff': 136655.0,
  'support_champPoints_diff': 536486.0},
 {'matchId': 2,
  'top_champPoints_diff': -430585.0,
  'jungle_champPoints_diff': -131802.0,
  'mid_champPoints_diff': 26703.0,
  'bot_champPoints_diff': 262803.0,
  'support_champPoints_diff': -1030.0},
 {'matchId': 3,
  'top_champPoints_diff': -136459.0,
  'jungle_champPoints_diff': -129684.0,
  'mid_champPoints_diff': -502548.0,
  'bot_champPoints_diff': 44732.0,
  'support_champPoints_diff': 436292.0},
 {'matchId': 4,
  'top_champPoints_diff': 179921.0,
  'jungle_champPoints_diff': 45568.0,
  'mid_champPoints_diff': 9711.0,
  'bot_champPoints_diff': 130362.0,
  'support_champPoint

In [22]:
#read df into dataframe and call 'X_train'
X_train = pd.DataFrame(df)

In [23]:
#convert '100' to winner
y_train = y_train['winner'] == 100

In [24]:
#read in multiple files from JSON folder
from tqdm.notebook import tqdm

directory = 'E:/Data Science/NSS_Projects/kaggle_lol-koopas/train_timelines/train_timelines'

df_list = []
participants = range(1,11)
total_gold = []

for filename in tqdm(os.listdir(directory)):
    if filename.endswith('.json'):
        file_path = os.path.join(directory, filename)
        with open(file_path) as json_file:
            data = json.load(json_file)
        for participant in participants:
            total_gold.append((data['matchId'],
                              data['frames'][-1].get('participantFrames')[str(participant)].get('participantId'), 
                              data['frames'][-1].get('participantFrames')[str(participant)].get('totalGold')))

  0%|          | 0/8000 [00:00<?, ?it/s]

In [25]:
#rename column headers and apply Team labels
total_gold = pd.DataFrame(total_gold, columns = ['matchId', 'participantId', 'totalGold'])
total_gold['teamId'] = total_gold['participantId'].apply(lambda x: '100' if x >= 1 and x <= 5 else '200')

In [26]:
#sum totalGold, reset index, and sort columns by matchId and totalGold
team_totals_by_match = total_gold.groupby(['matchId', 'teamId']).sum('totalGold')
team_totals = team_totals_by_match.reset_index()
team_totals = team_totals.sort_values(['matchId', 'totalGold'], ascending=[True, False])

In [27]:
#get minimum and maximum totalGold by match and reset index
match_gold = team_totals.groupby('matchId')['totalGold'].agg(['max', 'min']).reset_index()

In [28]:
#merge the two dataframes together
team_gold_diff = pd.merge(team_totals, match_gold, on='matchId', how='left')

In [29]:
#get the difference between totalGold at the 10th minute mark
team_gold_diff['teamGoldDiff'] = np.where(team_gold_diff['totalGold'] == team_gold_diff['max'], 
                                          team_gold_diff['max'] - team_gold_diff['min'], 
                                          team_gold_diff['min'] - team_gold_diff['max'])

team_gold_diff

Unnamed: 0,matchId,teamId,participantId,totalGold,max,min,teamGoldDiff
0,0,200,40,17539,17539,14146,3393
1,0,100,15,14146,17539,14146,-3393
2,1,100,15,17662,17662,14816,2846
3,1,200,40,14816,17662,14816,-2846
4,2,200,40,17347,17347,16374,973
...,...,...,...,...,...,...,...
15995,7997,200,40,16571,17745,16571,-1174
15996,7998,200,40,18311,18311,15543,2768
15997,7998,100,15,15543,18311,15543,-2768
15998,7999,100,15,17058,17058,14892,2166


In [30]:
#convert the TeamId column to integers, since we set this up as strings
team_gold_diff['teamId'] = team_gold_diff['teamId'].astype('int64')
team_gold_diff

Unnamed: 0,matchId,teamId,participantId,totalGold,max,min,teamGoldDiff
0,0,200,40,17539,17539,14146,3393
1,0,100,15,14146,17539,14146,-3393
2,1,100,15,17662,17662,14816,2846
3,1,200,40,14816,17662,14816,-2846
4,2,200,40,17347,17347,16374,973
...,...,...,...,...,...,...,...
15995,7997,200,40,16571,17745,16571,-1174
15996,7998,200,40,18311,18311,15543,2768
15997,7998,100,15,15543,18311,15543,-2768
15998,7999,100,15,17058,17058,14892,2166


In [31]:
#locate all non-zero team gold differentials
positive_differential = team_gold_diff[team_gold_diff['teamGoldDiff'] >= 0].sort_values('teamGoldDiff', ascending = False)

In [32]:
#reset index
positive_differential = positive_differential.reset_index()
positive_differential

Unnamed: 0,index,matchId,teamId,participantId,totalGold,max,min,teamGoldDiff
0,4910,2455,200,40,24324,24324,10880,13444
1,4554,2277,200,40,25228,25228,15026,10202
2,5154,2577,100,15,23923,23923,14056,9867
3,486,243,100,15,22995,22995,13563,9432
4,10514,5257,100,15,22136,22136,13091,9045
...,...,...,...,...,...,...,...,...
8010,11740,5870,100,15,2660,2660,2660,0
8011,2734,1367,100,15,2500,2500,2500,0
8012,10437,5218,200,40,17027,17027,17027,0
8013,6789,3394,200,40,2590,2590,2590,0


In [33]:
#grab the first 8000 rows
positive_differential = positive_differential[:8000]

In [34]:
#look at current X_train to read in proper headers for merge
X_train.head()

Unnamed: 0,matchId,top_champPoints_diff,jungle_champPoints_diff,mid_champPoints_diff,bot_champPoints_diff,support_champPoints_diff
0,0,-1118162.0,-7551.0,-78528.0,79673.0,519140.0
1,1,-372300.0,466745.0,588441.0,136655.0,536486.0
2,2,-430585.0,-131802.0,26703.0,262803.0,-1030.0
3,3,-136459.0,-129684.0,-502548.0,44732.0,436292.0
4,4,179921.0,45568.0,9711.0,130362.0,138106.0


In [35]:
#merge the teamGoldDifferential to the previously setup X_train dataset, that contains other point differentials
X_train = pd.merge(X_train[['matchId', 'top_champPoints_diff', 'jungle_champPoints_diff', 'mid_champPoints_diff', 'bot_champPoints_diff', 'support_champPoints_diff']], positive_differential[['matchId', 'teamGoldDiff']], on = ['matchId'], how = 'left')

In [36]:
#checking out duplicate values in the code
duplicates = X_train[X_train.duplicated(['matchId'], keep = False)]
duplicates

Unnamed: 0,matchId,top_champPoints_diff,jungle_champPoints_diff,mid_champPoints_diff,bot_champPoints_diff,support_champPoints_diff,teamGoldDiff
416,416,-1470211.0,-41598.0,556957.0,27623.0,-145983.0,0.0
417,416,-1470211.0,-41598.0,556957.0,27623.0,-145983.0,0.0
2775,2774,18026.0,-318562.0,-4114.0,277203.0,-80615.0,0.0
2776,2774,18026.0,-318562.0,-4114.0,277203.0,-80615.0,0.0
5738,5736,-480111.0,-79710.0,-22534.0,62561.0,-32228.0,0.0
5739,5736,-480111.0,-79710.0,-22534.0,62561.0,-32228.0,0.0
6817,6814,17333.0,-726936.0,-810472.0,-1836789.0,-56965.0,0.0
6818,6814,17333.0,-726936.0,-810472.0,-1836789.0,-56965.0,0.0


In [37]:
#removing duplicates and resetting the index
X_train = X_train.drop_duplicates()
X_train = X_train.reset_index()

In [38]:
#dropping the 'index' column
X_train = X_train.drop(columns = ['index'])

In [39]:
#locate the Nan Values
nan_locs = X_train.isna()
print(nan_locs)

      matchId  top_champPoints_diff  jungle_champPoints_diff  \
0       False                 False                    False   
1       False                 False                    False   
2       False                 False                    False   
3       False                 False                    False   
4       False                 False                    False   
...       ...                   ...                      ...   
7995    False                 False                    False   
7996    False                 False                    False   
7997    False                 False                    False   
7998    False                 False                    False   
7999    False                 False                    False   

      mid_champPoints_diff  bot_champPoints_diff  support_champPoints_diff  \
0                    False                 False                     False   
1                    False                 False                     False 

In [40]:
#fill duplicates with 0
X_train = X_train.fillna(0)

In [41]:
#Run Logistic Regression to fit the Train data against the y_train data
logreg = LogisticRegression().fit(X_train.drop(columns = ['matchId']), y_train)

In [42]:
accuracy_score(y_train, logreg.predict(X_train.drop(columns = ['matchId'])))

0.52575

In [50]:
logreg.coef_

array([[6.50905928e-08, 3.85082580e-08, 1.01039633e-07, 3.76286580e-08,
        2.43689222e-08, 1.70562774e-05]])

In [52]:
X_test = X_test.merge(
    right = champ_mastery[['summonerId', 'championId', 'championPoints']], 
    how = 'left', 
    on = ['summonerId', 'championId'])

In [53]:
X_test['championPoints'] = X_test['championPoints'].fillna(0)

In [54]:
matchIds = X_test['matchId'].drop_duplicates().reset_index(drop = True).values.tolist()

In [55]:
df = []

for matchId in tqdm(matchIds):
    game = X_test.loc[X_test['matchId'] == matchId].reset_index(drop = True)
    
    df.append({
        'matchId' : game['matchId'][0],
#         'top_sumLevel_diff' : (
#             game.loc[game['participantId'] == 1]['summonerLevel'].reset_index(drop = True)[0] - 
#             game.loc[game['participantId'] == 6]['summonerLevel'].reset_index(drop = True)[0]
#         ),
        'top_champPoints_diff' : (
            game.loc[game['participantId'] == 1]['championPoints'].reset_index(drop = True)[0] - 
            game.loc[game['participantId'] == 6]['championPoints'].reset_index(drop = True)[0]
        ),
#         'jungle_sumLevel_diff' : (
#             game.loc[game['participantId'] == 2]['summonerLevel'].reset_index(drop = True)[0] - 
#             game.loc[game['participantId'] == 7]['summonerLevel'].reset_index(drop = True)[0]
#         ),
        'jungle_champPoints_diff' : (
            game.loc[game['participantId'] == 2]['championPoints'].reset_index(drop = True)[0] - 
            game.loc[game['participantId'] == 7]['championPoints'].reset_index(drop = True)[0]
        ),
#         'mid_sumLevel_diff' : (
#             game.loc[game['participantId'] == 3]['summonerLevel'].reset_index(drop = True)[0] - 
#             game.loc[game['participantId'] == 8]['summonerLevel'].reset_index(drop = True)[0]
#         ),
        'mid_champPoints_diff' : (
            game.loc[game['participantId'] == 3]['championPoints'].reset_index(drop = True)[0] - 
            game.loc[game['participantId'] == 8]['championPoints'].reset_index(drop = True)[0]
        ),
#         'bot_sumLevel_diff' : (
#             game.loc[game['participantId'] == 4]['summonerLevel'].reset_index(drop = True)[0] - 
#             game.loc[game['participantId'] == 9]['summonerLevel'].reset_index(drop = True)[0]
#         ),
        'bot_champPoints_diff' : (
            game.loc[game['participantId'] == 4]['championPoints'].reset_index(drop = True)[0] - 
            game.loc[game['participantId'] == 9]['championPoints'].reset_index(drop = True)[0]
        ),
#         'support_sumLevel_diff' : (
#             game.loc[game['participantId'] == 5]['summonerLevel'].reset_index(drop = True)[0] - 
#             game.loc[game['participantId'] == 10]['summonerLevel'].reset_index(drop = True)[0]
#         ),
        'support_champPoints_diff' : (
            game.loc[game['participantId'] == 5]['championPoints'].reset_index(drop = True)[0] - 
            game.loc[game['participantId'] == 10]['championPoints'].reset_index(drop = True)[0]
        )
    })

  0%|          | 0/2000 [00:00<?, ?it/s]

In [56]:
X_test = pd.DataFrame(df)

In [58]:
#read in multiple files from JSON folder
from tqdm.notebook import tqdm

directory = 'E:/Data Science/NSS_Projects/kaggle_lol-koopas/test_timelines/test_timelines'

df_list = []
participants = range(1,11)
total_gold = []

for filename in tqdm(os.listdir(directory)):
    if filename.endswith('.json'):
        file_path = os.path.join(directory, filename)
        with open(file_path) as json_file:
            data = json.load(json_file)
        for participant in participants:
            total_gold.append((data['matchId'],
                              data['frames'][-1].get('participantFrames')[str(participant)].get('participantId'), 
                              data['frames'][-1].get('participantFrames')[str(participant)].get('totalGold')))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [59]:
total_gold_test = pd.DataFrame(total_gold, columns = ['matchId', 'participantId', 'totalGold'])
total_gold_test['teamId'] = total_gold_test['participantId'].apply(lambda x: '100' if x >= 1 and x <= 5 else '200')

In [60]:
#sum totalGold, reset index, and sort columns by matchId and totalGold
team_totals_by_match_test = total_gold_test.groupby(['matchId', 'teamId']).sum('totalGold')
team_totals_test = team_totals_by_match_test.reset_index()
team_totals_test = team_totals_test.sort_values(['matchId', 'totalGold'], ascending=[True, False])

In [61]:
#get minimum and maximum totalGold by match and reset index
match_gold_test = team_totals_test.groupby('matchId')['totalGold'].agg(['max', 'min']).reset_index()

In [63]:
#merge the two dataframes together
team_gold_diff_test = pd.merge(team_totals_test, match_gold_test, on='matchId', how='left')

In [64]:
#get the difference between totalGold at the 10th minute mark
team_gold_diff_test['teamGoldDiff'] = np.where(team_gold_diff_test['totalGold'] == team_gold_diff_test['max'], 
                                          team_gold_diff_test['max'] - team_gold_diff_test['min'], 
                                          team_gold_diff_test['min'] - team_gold_diff_test['max'])

team_gold_diff_test

Unnamed: 0,matchId,teamId,participantId,totalGold,max,min,teamGoldDiff
0,8000,100,15,17313,17313,16933,380
1,8000,200,40,16933,17313,16933,-380
2,8001,200,40,17533,17533,17492,41
3,8001,100,15,17492,17533,17492,-41
4,8002,200,40,17480,17480,16206,1274
...,...,...,...,...,...,...,...
3995,9997,100,15,14719,17129,14719,-2410
3996,9998,100,15,18353,18353,15134,3219
3997,9998,200,40,15134,18353,15134,-3219
3998,9999,100,15,14561,14561,13631,930


In [65]:
#convert the TeamId column to integers, since we set this up as strings
team_gold_diff_test['teamId'] = team_gold_diff_test['teamId'].astype('int64')
team_gold_diff_test

Unnamed: 0,matchId,teamId,participantId,totalGold,max,min,teamGoldDiff
0,8000,100,15,17313,17313,16933,380
1,8000,200,40,16933,17313,16933,-380
2,8001,200,40,17533,17533,17492,41
3,8001,100,15,17492,17533,17492,-41
4,8002,200,40,17480,17480,16206,1274
...,...,...,...,...,...,...,...
3995,9997,100,15,14719,17129,14719,-2410
3996,9998,100,15,18353,18353,15134,3219
3997,9998,200,40,15134,18353,15134,-3219
3998,9999,100,15,14561,14561,13631,930


In [66]:
#locate all non-zero team gold differentials
positive_differential_test = team_gold_diff_test[team_gold_diff_test['teamGoldDiff'] >= 0].sort_values('teamGoldDiff', ascending = False)

In [67]:
#reset index
positive_differential_test = positive_differential_test.reset_index()
positive_differential_test

Unnamed: 0,index,matchId,teamId,participantId,totalGold,max,min,teamGoldDiff
0,3724,9862,100,15,24229,24229,11237,12992
1,1134,8567,100,15,23649,23649,14093,9556
2,158,8079,100,15,20536,20536,11808,8728
3,490,8245,100,15,24461,24461,16128,8333
4,654,8327,200,40,22477,22477,14216,8261
...,...,...,...,...,...,...,...,...
1999,2462,9231,100,15,2500,2500,2500,0
2000,218,8109,100,15,2590,2590,2590,0
2001,219,8109,200,40,2590,2590,2590,0
2002,356,8178,100,15,2520,2520,2520,0


In [69]:
#grab the first 2000 rows
positive_differential_test = positive_differential_test[:2000]

In [70]:
#merge the teamGoldDifferential to the previously setup X_train dataset, that contains other point differentials
X_test = pd.merge(X_test[['matchId', 'top_champPoints_diff', 'jungle_champPoints_diff', 'mid_champPoints_diff', 'bot_champPoints_diff', 'support_champPoints_diff']], positive_differential_test[['matchId', 'teamGoldDiff']], on = ['matchId'], how = 'left')

In [72]:
#removing duplicates and resetting the index
X_test = X_test.drop_duplicates()
X_test = X_test.reset_index()
X_test = X_test.drop(columns = ['index'])

In [73]:
X_test = X_test.fillna(0)

In [74]:
logreg.predict(X_test.drop(columns = ['matchId']))

array([ True, False,  True, ..., False,  True,  True])

In [75]:
submissionV5 = X_test[['matchId']]

In [76]:
submissionV5 = submissionV5.copy()

In [77]:
submissionV5['win'] = logreg.predict(X_test.drop(columns = ['matchId']))

In [78]:
submissionV5['winner'] = np.where(submissionV5['win'] == True, 100, 200)

In [79]:
submissionV5 = submissionV5[['matchId', 'winner']]

In [80]:
submissionV5.to_csv('submissionV5.csv', index = False)