# Detect the best variables for each role so that we have variables to compare performance between a random player and our dataset

In [1]:
from datetime import datetime, timedelta
from functools import reduce

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

import sklearn.linear_model as linear
import sklearn.tree as tree
import sklearn.ensemble as rf
import sklearn.svm as svm
import sklearn.neural_network as neural

import sklearn.feature_selection as feat
import sklearn.metrics as metrics

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


# Prediction using isolated player ingame statistics

In [2]:
match_info = pd.read_csv("../data/match_info.csv")
laning = pd.read_csv("../data/player_laning_stats.csv")
combat = pd.read_csv("../data/player_combat_stats.csv")
flair = pd.read_csv("../data/player_flair_stats.csv")
objectives = pd.read_csv("../data/player_objective_stats.csv")

**Define a function that handles the cleaning and prediction for us:**

In [3]:
def get_prediction(data: pd.DataFrame, lane, model, key_features=[], train=0.9, random_seed=12, feature_selection=0):

    # selecting the lanes and dropping non useful columns
    data = data.loc[laning["lane"] == lane]
    data = data.drop(columns=["match_id","account_id", "region", "champion", "lane"])
    try:
        data = data.drop(columns=["patch", "date_created"])
    except:
        print("sorting columns not found.")
    
    # defining our target and variables
    target = data["won"]

    if len(key_features) > 0:
        variables = data[key_features]
    else:
        variables = data.loc[:, data.columns != "won"]

    # creating a list of columns so that we can return the top features
    columns = variables.columns.to_list()

    # standarazing our variables
    scale = StandardScaler()
    scale.fit(variables)
    variables = scale.transform(variables)
    del(scale)

    # splitting our test and train data
    variables_train, variables_test, target_train, target_test = train_test_split(variables, target, train_size=train, random_state=random_seed)

    # training the model
    model = model()
    model.fit(variables_train, target_train);

    # implementing feature selection if needed

    try:
        if feature_selection > 0:

            # recursive feature selection
            rfe = feat.RFE(model, n_features_to_select=feature_selection);
            rfe.fit(variables_train, target_train);
    except: 
        feature_selection = 0
    
    # returning multiple variables

    results = {
            "accuracy": round(model.score(variables_test, target_test), 3),
            #"balanced_accuracy":  round(metrics.balanced_accuracy_score(target_test, model.predict(variables_test)), 3),
            #"precision": round(metrics.precision_score(target_test, model.predict(variables_test)), 3),
            #"avg_precision": round(metrics.average_precision_score(target_train, model.predict(variables_train)), 3),
            "key_features": [columns[index] for index, ranking in enumerate(rfe.ranking_) if ranking < 4] if feature_selection > 0 else "No feature selection",
            }

    return results

## 1. Laning stats

In [4]:
get_prediction(laning, "TOP", tree.DecisionTreeClassifier)

{'accuracy': 0.528,
 'balanced_accuracy': 0.528,
 'precision': 0.522,
 'avg_precision': 1.0,
 'key_features': 'No feature selection'}

## 2. Combat stats

In [5]:
get_prediction(combat, "TOP", tree.DecisionTreeClassifier)

{'accuracy': 0.619,
 'balanced_accuracy': 0.619,
 'precision': 0.611,
 'avg_precision': 1.0,
 'key_features': 'No feature selection'}

## 3. Objective stats

In [6]:
get_prediction(objectives, "TOP", tree.DecisionTreeClassifier)

{'accuracy': 0.697,
 'balanced_accuracy': 0.697,
 'precision': 0.708,
 'avg_precision': 1.0,
 'key_features': 'No feature selection'}

## 4. Flair stats

In [7]:
get_prediction(flair, "TOP", tree.DecisionTreeClassifier)

{'accuracy': 0.597,
 'balanced_accuracy': 0.596,
 'precision': 0.617,
 'avg_precision': 0.69,
 'key_features': 'No feature selection'}

**From the above examples we see that we cannot have an accurate prediction using isolated statistics, we need more data, lets then combine all the player's ingame statistics**

# Prediction with merged player ingame statistics

## 1. Merge the stats and make a prediction for each role

In [7]:
shared = ["match_id", "account_id", "region", "champion", "lane", "won"]

complete_df = (pd.merge(laning, combat, on=shared, how="left")
                .merge(objectives, on=shared, how="left")
                .merge(flair, on=shared, how="left")
                .fillna(0))

In [9]:
for x in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "SUPPORT"]:
    print(f"{x}: {get_prediction(complete_df, x, rf.RandomForestClassifier, random_seed=12)}")
    del(x)

TOP: {'accuracy': 0.787, 'balanced_accuracy': 0.787, 'precision': 0.779, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
JUNGLE: {'accuracy': 0.835, 'balanced_accuracy': 0.836, 'precision': 0.819, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
MIDDLE: {'accuracy': 0.832, 'balanced_accuracy': 0.833, 'precision': 0.818, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
BOTTOM: {'accuracy': 0.871, 'balanced_accuracy': 0.871, 'precision': 0.858, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
SUPPORT: {'accuracy': 0.85, 'balanced_accuracy': 0.85, 'precision': 0.851, 'avg_precision': 1.0, 'key_features': 'No feature selection'}


## 2. Lets find the best features for TOP lane

In [10]:
key_features = get_prediction(complete_df, "TOP", rf.RandomForestClassifier, random_seed=12, feature_selection=9)["key_features"]

In [11]:
key_features

['goldpm_10',
 'dmg_total',
 'healing_total',
 'damage_mitigated',
 'crowd_control',
 'dmg_taken',
 'dmg_to_objectives',
 'dmg_to_turrets',
 'total_cs',
 'jungle_invaded',
 'longest_time_alive']

In [12]:
get_prediction(complete_df, "TOP", rf.RandomForestClassifier, random_seed=12, key_features=key_features)

{'accuracy': 0.784,
 'balanced_accuracy': 0.784,
 'precision': 0.776,
 'avg_precision': 1.0,
 'key_features': 'No feature selection'}

In [13]:
print(f"{len(key_features)} features selected out of {len(complete_df.drop(columns=['account_id', 'region', 'champion', 'lane']).columns)}")

11 features selected out of 27


# Using time and patch variables to see if that increases our accuracy

In [8]:
match_info["patch"] = pd.to_numeric(match_info["patch"], errors="coerce")
match_info.head()

Unnamed: 0,match_id,region,date_created,match_duration,patch,winner
0,4671787510,EUW1,2020-06-21,1557,10.12,Red
1,4671790912,EUW1,2020-06-21,1716,10.12,Blue
2,4704868250,EUW1,2020-07-13,1235,10.14,Red
3,4718171384,EUW1,2020-07-21,2227,10.14,Red
4,4718095374,EUW1,2020-07-21,1822,10.14,Blue


In [9]:
sorted_df = complete_df.merge(match_info[["match_id", "patch", "date_created"]], on="match_id", how="left").dropna()
sorted_df.head()

Unnamed: 0,match_id,account_id,region,champion,lane,xppm_10,cspm_10,goldpm_10,dmg_takenpm_10,won,dmg_total,healing_total,units_healed,damage_mitigated,crowd_control,dmg_taken,first_blood,first_blood_assist,dmg_to_objectives,dmg_to_turrets,total_cs,jungle_cs,jungle_invaded,wards_placed,wards_killed,killing_sprees,longest_time_alive,double_kills,triple_kills,quadra_kills,penta_kills,patch,date_created
0,4671787510,7UTpYZvoj06Si113SIlBe-jyteHrh-XRaYzuYfXWentoKm...,EUW1,62,JUNGLE,267.3,0.5,261.4,654.3,0,8575,6486,1,14108,72,22451,0,0,4718.0,119.0,32.0,57.0,4.0,6.0,6.0,1,495,0,0,0,0,10.12,2020-06-21
1,4671787510,fG8JDk5zVxKmoAMUqBIE8nbgMqzn8zuJrDJFPslxAg,EUW1,222,BOTTOM,342.6,6.9,345.3,243.3,0,15109,365,1,7401,92,16096,0,0,4964.0,4964.0,190.0,13.0,0.0,6.0,2.0,2,475,0,0,0,0,10.12,2020-06-21
2,4671787510,Kc97-m0MqgpSk3DFoY17uq39_Roh9Qvi-xtoEFXPsMhEWPY,EUW1,412,SUPPORT,274.1,1.1,289.8,311.0,0,7913,774,5,16784,125,16743,0,0,342.0,342.0,23.0,0.0,0.0,31.0,2.0,1,807,0,0,0,0,10.12,2020-06-21
3,4671787510,WmwA8a6PWVm1SkA3JWpID3CDFAbqxjrsU9f345u3_qR12c...,EUW1,555,MIDDLE,359.4,4.3,341.8,618.7,0,15786,4251,1,11752,152,21388,0,0,1646.0,1264.0,88.0,0.0,0.0,4.0,3.0,2,358,0,0,0,0,10.12,2020-06-21
4,4671787510,4NI6_UJFRWXe6swXbvsN9dQl6ORPjOfU1EA7ybGJjuc,EUW1,54,TOP,384.4,5.6,207.2,335.3,0,12793,979,1,34204,505,21185,0,0,1143.0,1143.0,150.0,0.0,0.0,9.0,2.0,1,726,0,0,0,0,10.12,2020-06-21


## 1. By Patch

In [10]:
last_patch = sorted_df.loc[sorted_df["patch"] == 10.14]
patches = sorted_df.loc[sorted_df["patch"] > 10.12]
patches_3 = sorted_df.loc[sorted_df["patch"] >= 10.12]

**Last patch**

In [23]:
for x in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "SUPPORT"]:
    print(f"{x}: {get_prediction(last_patch, x, rf.RandomForestClassifier, random_seed=12, sorted=True)}")
    del(x)

TOP: {'accuracy': 0.769, 'balanced_accuracy': 0.77, 'precision': 0.744, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
JUNGLE: {'accuracy': 0.839, 'balanced_accuracy': 0.839, 'precision': 0.834, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
MIDDLE: {'accuracy': 0.821, 'balanced_accuracy': 0.821, 'precision': 0.806, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
BOTTOM: {'accuracy': 0.852, 'balanced_accuracy': 0.852, 'precision': 0.843, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
SUPPORT: {'accuracy': 0.849, 'balanced_accuracy': 0.849, 'precision': 0.854, 'avg_precision': 1.0, 'key_features': 'No feature selection'}


**Last 2 patches**

In [25]:
for x in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "SUPPORT"]:
    print(f"{x}: {get_prediction(patches, x, rf.RandomForestClassifier, random_seed=12, sorted=True)}")
    del(x)

TOP: {'accuracy': 0.797, 'balanced_accuracy': 0.797, 'precision': 0.782, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
JUNGLE: {'accuracy': 0.834, 'balanced_accuracy': 0.834, 'precision': 0.821, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
MIDDLE: {'accuracy': 0.827, 'balanced_accuracy': 0.827, 'precision': 0.824, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
BOTTOM: {'accuracy': 0.867, 'balanced_accuracy': 0.867, 'precision': 0.86, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
SUPPORT: {'accuracy': 0.859, 'balanced_accuracy': 0.859, 'precision': 0.865, 'avg_precision': 1.0, 'key_features': 'No feature selection'}


**Last 3 patches**

In [29]:
for x in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "SUPPORT"]:
    print(f"{x}: {get_prediction(patches_3, x, rf.RandomForestClassifier, random_seed=12, sorted=True)}")
    del(x)

TOP: {'accuracy': 0.785, 'balanced_accuracy': 0.785, 'precision': 0.776, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
JUNGLE: {'accuracy': 0.839, 'balanced_accuracy': 0.839, 'precision': 0.83, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
MIDDLE: {'accuracy': 0.831, 'balanced_accuracy': 0.831, 'precision': 0.82, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
BOTTOM: {'accuracy': 0.867, 'balanced_accuracy': 0.867, 'precision': 0.857, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
SUPPORT: {'accuracy': 0.858, 'balanced_accuracy': 0.858, 'precision': 0.86, 'avg_precision': 1.0, 'key_features': 'No feature selection'}


## 2. By date

In [11]:
def get_date(days: int) -> str:
    since =  pd.to_datetime(sorted_df["date_created"].max()).date() - timedelta(days=days)
    if since.day < 10:
        day = f"0{since.day}"
    else:
        day = since.day
    if since.month < 10:
        month = f"0{since.month}"
    else:
        month = since.month

    since = f"{since.year}-{month}-{day}"
    return since

In [12]:
last_month = sorted_df.loc[sorted_df["date_created"] > get_date(30)]
two_weeks = sorted_df.loc[sorted_df["date_created"] > get_date(14)]
one_week = sorted_df.loc[sorted_df["date_created"] > get_date(7)]

**Last month**

In [80]:
for x in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "SUPPORT"]:
    print(f"{x}: {get_prediction(last_month, x, rf.RandomForestClassifier, random_seed=12, sorted=True)}")
    del(x)

TOP: {'accuracy': 0.786, 'balanced_accuracy': 0.786, 'precision': 0.773, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
JUNGLE: {'accuracy': 0.837, 'balanced_accuracy': 0.837, 'precision': 0.83, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
MIDDLE: {'accuracy': 0.834, 'balanced_accuracy': 0.834, 'precision': 0.821, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
BOTTOM: {'accuracy': 0.869, 'balanced_accuracy': 0.869, 'precision': 0.864, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
SUPPORT: {'accuracy': 0.865, 'balanced_accuracy': 0.865, 'precision': 0.866, 'avg_precision': 1.0, 'key_features': 'No feature selection'}


**Last two weeks**

In [81]:
for x in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "SUPPORT"]:
    print(f"{x}: {get_prediction(two_weeks, x, rf.RandomForestClassifier, random_seed=12, sorted=True)}")
    del(x)

TOP: {'accuracy': 0.791, 'balanced_accuracy': 0.791, 'precision': 0.784, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
JUNGLE: {'accuracy': 0.837, 'balanced_accuracy': 0.837, 'precision': 0.824, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
MIDDLE: {'accuracy': 0.82, 'balanced_accuracy': 0.82, 'precision': 0.814, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
BOTTOM: {'accuracy': 0.859, 'balanced_accuracy': 0.859, 'precision': 0.857, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
SUPPORT: {'accuracy': 0.848, 'balanced_accuracy': 0.848, 'precision': 0.845, 'avg_precision': 1.0, 'key_features': 'No feature selection'}


**Last week**

In [82]:
for x in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "SUPPORT"]:
    print(f"{x}: {get_prediction(one_week, x, rf.RandomForestClassifier, random_seed=12, sorted=True)}")
    del(x)

TOP: {'accuracy': 0.781, 'balanced_accuracy': 0.781, 'precision': 0.79, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
JUNGLE: {'accuracy': 0.804, 'balanced_accuracy': 0.803, 'precision': 0.807, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
MIDDLE: {'accuracy': 0.826, 'balanced_accuracy': 0.827, 'precision': 0.798, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
BOTTOM: {'accuracy': 0.86, 'balanced_accuracy': 0.861, 'precision': 0.851, 'avg_precision': 1.0, 'key_features': 'No feature selection'}
SUPPORT: {'accuracy': 0.851, 'balanced_accuracy': 0.851, 'precision': 0.853, 'avg_precision': 1.0, 'key_features': 'No feature selection'}


# Run multiple algorithms with and without features and store the results so that we can make a better analysis

In [54]:
def get_model_accuracy():

    # store results on a dictionary for future analysis
    model_accuracy = {
        "model": ["rf_classifier", "linear_ridge", "linear_logistic", "linear_svc", "linear_stochastic", "decision_tree", "neural_network", "support_vc"],
        "accuracy_avg": [],
    }

    # define the models to use
    models = {
        "rf_classifier": rf.RandomForestClassifier,
        "linear_ridge": linear.RidgeClassifier,
        "linear_logistic" : linear.LogisticRegression,
        "linear_svc": svm.LinearSVC,
        "linear_stochastic": linear.SGDClassifier,
        "decision_tree": tree.DecisionTreeClassifier,
        "neural_network": neural.MLPClassifier,
        "support_vc": svm.SVC,
    }

    # define the lanes
    lanes = ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "SUPPORT"]

    # make predictions without features
    for i, model in enumerate(models):
        results = []
        # return mean avg score without features
        for lane in lanes:
            prediction = get_prediction(last_month, lane, models[model], sorted=True)
            results.append(prediction["accuracy"])

        # append mean prediction result to model_accuracy
        model_accuracy["accuracy_avg"].append(float(format(np.mean(results), ".2f")))
        print(f"Done at {i}")

    print("Done without features")

    return model_accuracy

In [55]:
model_accuracy = get_model_accuracy()

Done at 0
Done at 1
Done at 2
Done at 3
Done at 4
Done at 5
Done at 6
Done at 7
Done without features


In [60]:
model_accuracy = pd.DataFrame(model_accuracy)
model_accuracy

Unnamed: 0,model,accuracy_avg
0,rf_classifier,0.84
1,linear_ridge,0.82
2,linear_logistic,0.83
3,linear_svc,0.83
4,linear_stochastic,0.83
5,decision_tree,0.77
6,neural_network,0.84
7,support_vc,0.85


**From the accuracy average I determined that RandomForestClassifier was the best approach, since it is not as high cost as support vector classification or Neural Networks**

In [58]:
model_accuracy.to_pickle("../data/model_accuracy.pkl", protocol=4)

## Feature selection and period accuracy

In [4]:
def get_model_acc_period():

    model_by_period = {
        "period": ["complete", "last_patch", "last_2_patches", "last_3_patches", "last_month", "last_two_weeks", "last_week"],
        "TOP": [],
        "JUNGLE": [],
        "MIDDLE": [],
        "BOTTOM": [],
        "SUPPORT": [],
    } 

    lane_features = {
        "TOP": [],
        "JUNGLE": [],
        "MIDDLE": [],
        "BOTTOM": [],
        "SUPPORT": [],
    }

    # define the iterations
    periods = {
        "complete": complete_df,
        "last_patch": last_patch,
        "last_2_patches": patches,
        "last_3_patches": patches_3,
        "last_month": last_month, 
        "last_two_weeks": two_weeks, 
        "last_week": one_week
        }

    # define the lanes
    lanes = ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "SUPPORT"]

    # without features
    for period in periods:
        for lane in lanes:
            prediction = get_prediction(periods[period], lane, rf.RandomForestClassifier)
            model_by_period[lane].append(prediction["accuracy"])

    for lane in lane_features:
        prediction = get_prediction(last_month, lane, rf.RandomForestClassifier, feature_selection=7)
        lane_features[lane].append(prediction["key_features"])

    return [model_by_period, lane_features]

In [13]:
results = get_model_acc_period()

sorting columns not found.
sorting columns not found.
sorting columns not found.
sorting columns not found.
sorting columns not found.


In [14]:
results[0]

{'period': ['complete',
  'last_patch',
  'last_2_patches',
  'last_3_patches',
  'last_month',
  'last_two_weeks',
  'last_week'],
 'TOP': [0.785, 0.773, 0.796, 0.785, 0.786, 0.786, 0.793],
 'JUNGLE': [0.834, 0.837, 0.837, 0.838, 0.838, 0.836, 0.816],
 'MIDDLE': [0.833, 0.824, 0.827, 0.832, 0.833, 0.827, 0.823],
 'BOTTOM': [0.871, 0.849, 0.867, 0.866, 0.868, 0.856, 0.86],
 'SUPPORT': [0.85, 0.85, 0.86, 0.86, 0.863, 0.849, 0.856]}

In [15]:
pd.DataFrame(results[0]).to_pickle("../data/model_by_period.pkl", protocol=4)

In [16]:
pd.DataFrame(results[1]).to_pickle("../data/lane_features.pkl", protocol=4)