# The Data Legend (my capstone)

## The main notebook

#### Author: Hussain AlAttas (RUH)

---
**In this notebook, I explore the data, clean it, engineer features from it, and compute a baseline accuracy.**

In [1]:
# Importing libraries
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import seaborn as sns
%matplotlib inline

In [2]:
# Loading the data
# Match table: rows containing any null value are discarded right after reading
LOL = pd.read_csv('./LeagueofLegends.csv')
LOL = LOL.dropna()
# The DataFrame that explains the columns in the first set
Dictionary_df = pd.read_csv('./_columns.csv')

In [3]:
# Creating a dataframe for only the players' usernames for later usage
# (blue side first, then red side; one username column per role)
player_columns = [side + role
                  for side in ('blue', 'red')
                  for role in ('Top', 'Jungle', 'Middle', 'ADC', 'Support')]
Players_LOL_df = LOL[player_columns]

# Columns that are not needed for the analysis
cols_to_remove = [
    'Address', 'blueTeamTag', 'redTeamTag', 'golddiff', 'goldblue',
    'goldred', 'goldblueTop', 'goldblueJungle', 'goldblueMiddle',
    'goldblueADC', 'goldblueSupport', 'goldredTop', 'goldredMiddle',
    'goldredSupport', 'bInhibs', 'bDragons', 'bBarons', 'bHeralds',
    'rKills', 'rTowers', 'rInhibs', 'rDragons', 'rBarons', 'rHeralds',
    'blueBans', 'redBans', 'goldredJungle', 'goldredADC', 'bTowers',
    'bKills', 'rResult',
]
# Removing columns that are unnecessary
LOL = LOL.drop(columns=cols_to_remove)

In [4]:
# Removing the same columns from the dictionary dataframe.
# set_index() returns a new frame, so an inplace drop on that result is thrown
# away and Dictionary_df would stay unchanged; the result has to be assigned back.
# After this cell Dictionary_df is indexed by 'ColumnName'.
Dictionary_df = Dictionary_df.set_index('ColumnName').drop(cols_to_remove)

In [5]:
# Writing a function for doing the basic EDA checks
def Basic_eda_check(data):
    """Print and display a quick exploratory overview of a DataFrame.

    Shows, in order: the shape, the head and tail, the total number of null
    cells, the transposed ``describe()`` table, ``info()``, and finally the
    ``value_counts`` (nulls included) of every column.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to inspect. It is not modified.

    Returns
    -------
    None
        Everything is written to the cell output.
    """
    gap = '\n\n'

    print('\n')
    print('This is the shape of the data', data.shape)
    for preview in (data.head(), data.tail()):
        display(preview)
    print(gap)

    null_total = data.isnull().sum().sum()  # sum per column, then across columns
    print('There is {} null values'.format(null_total))
    print(gap)

    display(data.describe().T)
    print(gap)

    data.info()
    print(gap)

    # Frequency table of every column, keeping NaN as its own category
    for column in data.columns:
        print(column)
        print(data[column].value_counts(dropna=False))
        print('\n\n\n\n')

In [6]:
Basic_eda_check(LOL) # Checking the set



This is the shape of the data (7582, 26)


Unnamed: 0,League,Year,Season,Type,bResult,gamelength,blueTop,blueTopChamp,blueJungle,blueJungleChamp,...,redTop,redTopChamp,redJungle,redJungleChamp,redMiddle,redMiddleChamp,redADC,redADCChamp,redSupport,redSupportChamp
0,NALCS,2015,Spring,Season,1,40,Dyrus,Irelia,Santorin,RekSai,...,Balls,Gnar,Meteos,Elise,Hai,Fizz,Sneaky,Sivir,LemonNation,Thresh
1,NALCS,2015,Spring,Season,0,38,Cris,Gnar,Impaler,Rengar,...,Gamsu,Irelia,Crumbzz,JarvanIV,Shiphtur,Azir,CoreJJ,Corki,KiWiKiD,Annie
2,NALCS,2015,Spring,Season,1,40,Flaresz,Renekton,ShorterACE,Rengar,...,Hauntzer,Sion,Saintvicious,LeeSin,Keane,Azir,Cop,Corki,BunnyFuFuu,Janna
3,NALCS,2015,Spring,Season,0,41,Rhux,Irelia,Rush,JarvanIV,...,Quas,Gnar,IWDominate,Nunu,Fenix,Lulu,KEITH,KogMaw,Xpecial,Janna
4,NALCS,2015,Spring,Season,1,35,Benny,Gnar,Xmithie,JarvanIV,...,CaliTrlolz8,Sion,Porpoise8,RekSai,Slooshi8,Lulu,Maplestreet8,Corki,Dodo8,Annie


Unnamed: 0,League,Year,Season,Type,bResult,gamelength,blueTop,blueTopChamp,blueJungle,blueJungleChamp,...,redTop,redTopChamp,redJungle,redJungleChamp,redMiddle,redMiddleChamp,redADC,redADCChamp,redSupport,redSupportChamp
7615,TCL,2018,Spring,Season,0,34,Elwind,Camille,Mojito,JarvanIV,...,fabFabulous,Chogath,Stomaged,XinZhao,GBM,Veigar,Zeitnot,Xayah,SnowFlower,Rakan
7616,TCL,2018,Spring,Season,0,39,Rare,Nasus,Viking,Khazix,...,Marshall,Chogath,KaKAO,Ivern,Lucete,Ryze,Ruvelius,Caitlyn,Japone,Braum
7617,OPL,2018,Spring,Season,0,24,Praedyth,Camille,Juves,JarvanIV,...,Chippys,Gangplank,Praelus,Evelynn,Triple,Malzahar,k1ng,Ezreal,Cupcake,Taric
7618,OPL,2018,Spring,Season,1,35,Ceres,Ornn,Sybol,Khazix,...,Papryze,Gangplank,Swathe,JarvanIV,Shok,Veigar,Low,Sivir,Tilting,Janna
7619,OPL,2018,Spring,Season,0,42,Papryze,Gangplank,Swathe,Zac,...,Ceres,Ornn,Sybol,JarvanIV,Claire,Malzahar,Raid,Vayne,Decoy,Braum





There is 0 null values





Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,7582.0,2016.282511,0.849968,2014.0,2016.0,2016.0,2017.0,2018.0
bResult,7582.0,0.544315,0.498065,0.0,0.0,1.0,1.0,1.0
gamelength,7582.0,37.009101,7.986554,17.0,31.0,36.0,41.0,95.0





<class 'pandas.core.frame.DataFrame'>
Int64Index: 7582 entries, 0 to 7619
Data columns (total 26 columns):
League              7582 non-null object
Year                7582 non-null int64
Season              7582 non-null object
Type                7582 non-null object
bResult             7582 non-null int64
gamelength          7582 non-null int64
blueTop             7582 non-null object
blueTopChamp        7582 non-null object
blueJungle          7582 non-null object
blueJungleChamp     7582 non-null object
blueMiddle          7582 non-null object
blueMiddleChamp     7582 non-null object
blueADC             7582 non-null object
blueADCChamp        7582 non-null object
blueSupport         7582 non-null object
blueSupportChamp    7582 non-null object
redTop              7582 non-null object
redTopChamp         7582 non-null object
redJungle           7582 non-null object
redJungleChamp      7582 non-null object
redMiddle           7582 non-null object
redMiddleChamp      7582 non-nul

Name: blueMiddle, Length: 322, dtype: int64





blueMiddleChamp
Orianna         643
Viktor          609
Syndra          526
Azir            481
Corki           446
Leblanc         417
Cassiopeia      391
Taliyah         388
Ryze            363
Ahri            334
Vladimir        309
Lulu            225
Lissandra       199
Zed             183
Jayce           180
Kassadin        167
Galio           154
Karma           153
TwistedFate     152
Malzahar        131
Varus           115
Ekko             97
Zilean           83
Lucian           77
Ezreal           69
Xerath           65
Anivia           54
Talon            52
Gangplank        44
Lux              43
               ... 
KogMaw           14
Urgot            13
Morgana          11
Veigar            9
Nautilus          8
JarvanIV          7
Quinn             7
Irelia            5
Jhin              4
Rumble            4
Graves            3
Nidalee           3
Pantheon          3
Kennen            3
MasterYi          2
Kayn           

In [16]:
# Removing duplicates: keep only rows that are not a repeat of an earlier row
Players_LOL_df = Players_LOL_df.loc[~Players_LOL_df.duplicated()]

# Getting a baseline accuracy using a K-Nearest Neighbours model

In [7]:
# Target: the bResult column
y = LOL['bResult']
# Features: every other column, one-hot encoded
X = pd.get_dummies(LOL.drop(columns='bResult'))

# Splitting the data for modelling (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Baseline classifier with scikit-learn's default settings
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [8]:
# Accuracy on the held-out split
holdout_accuracy = knn.score(X_test, y_test)
print(holdout_accuracy)
# Mean accuracy over 10-fold cross-validation on the full data
cv_accuracy = cross_val_score(knn, X, y, cv=10).mean()
print(cv_accuracy)

In [9]:
# Held-out accuracy for every odd number of neighbours from 1 to 29
neighbor_counts = list(range(1, 30, 2))
list_of_scores = []
for i in neighbor_counts:
    knn = KNeighborsClassifier(n_neighbors=i)  # knn ends up bound to the last model fitted
    knn.fit(X_train, y_train)
    list_of_scores.append(knn.score(X_test, y_test))

# One row per neighbour count, plotted as accuracy against N
Scores = pd.DataFrame({'N': neighbor_counts, 'score': list_of_scores})
Scores.plot(x='N',y="score")

# Sorting the champions and the usernames

### Method #1  (Didn't actually use this method eventually)

In [11]:
# One list of champion names per role and side, filled from LOL below
bTop_champs = []
bJungle_champs = []
bMiddle_champs = []
bADC_champs = []
bSupport_champs = []

rTop_champs = []
rJungle_champs = []
rMiddle_champs = []
rADC_champs = []
rSupport_champs = []

CHAMPS = [bTop_champs, bJungle_champs, bMiddle_champs, bADC_champs,
          bSupport_champs, rTop_champs, rJungle_champs, rMiddle_champs,
          rADC_champs, rSupport_champs]

# The ten champion columns followed by the ten username columns.
# Other cells use this list to drop all of these columns from LOL.
TEAMS = ['blueTopChamp', 'blueJungleChamp', 'blueMiddleChamp', 'blueADCChamp',
       'blueSupportChamp', 'redTopChamp', 'redJungleChamp', 'redMiddleChamp',
       'redADCChamp', 'redSupportChamp','blueTop',
       'blueJungle', 'blueMiddle', 'blueADC', 'blueSupport', 'redTop',
       'redJungle', 'redMiddle', 'redADC', 'redSupport']

# CHAMPS has 10 lists while TEAMS has 20 names, so indexing CHAMPS with every
# TEAMS position raised IndexError. zip() stops after the ten champion columns,
# which line up one-to-one with the lists in CHAMPS.
for role_champs, column in zip(CHAMPS, TEAMS):
    role_champs.extend(LOL[column])

# One entry per game holding the five blue-side champions (top, jungle, middle,
# ADC, support). Built once after the role lists are complete instead of being
# rebuilt and re-appended for every row.
# NOTE(review): presumably this per-game grouping is what the abandoned
# Method #1 was aiming for — confirm before relying on All_b_teams.
All_b_teams = [list(blue_team) for blue_team in zip(*CHAMPS[:5])]

### Method #2 (I used this method instead)

In [12]:
def Add_strings(x):
    """Concatenate the strings in ``x``, putting one space after each item.

    Parameters
    ----------
    x : iterable of str
        The pieces to merge (e.g. one row of champion names).

    Returns
    -------
    str
        All pieces in order, each followed by a single space, so a non-empty
        result always ends with a trailing space. An empty iterable gives ``''``.
    """
    return ''.join(piece + ' ' for piece in x)
# Merging the five champion columns of each side into one space-separated
# string per game, using Add_strings row by row.
blue_champ_cols = ['blueTopChamp', 'blueJungleChamp', 'blueMiddleChamp',
                   'blueADCChamp', 'blueSupportChamp']
red_champ_cols = ['redTopChamp', 'redJungleChamp', 'redMiddleChamp',
                  'redADCChamp', 'redSupportChamp']

LOL['BlueTeam'] = LOL[blue_champ_cols].apply(Add_strings, axis=1)
LOL['RedTeam'] = LOL[red_champ_cols].apply(Add_strings, axis=1)

# The individual username/champion columns (TEAMS) are deliberately NOT dropped
# here: the later "Zipping each player's username to their champion" cell still
# reads them, and the cell right after it performs the drop. Dropping them at
# this point makes a top-to-bottom run fail with a KeyError.

# Building scraping functions to scrape data

## 1- Scraping more features for already existing players/champions

### The maximum needed inputs for the functions:
- Usernames
- Region
- Champions
- League

#### Using the requests library.

In [13]:
# Importing the libraries
import requests
from bs4 import BeautifulSoup
import selenium

In [14]:
# Defining a function to scrape the data off the first website.

def _markup_fragment_to_int(fragment):
    """Strip the leftover tag characters from a sliced HTML fragment and parse the number."""
    for leftover in ('<', '/', 's', 'p', '>'):
        fragment = fragment.replace(leftover, '')
    return int(fragment)


def GetSummonerStats(username, region):
    """Scrape a summoner's game record from their op.gg profile page.

    Parameters
    ----------
    username : str
        Summoner name; spaces are replaced by ``+`` for the URL.
    region : str
        op.gg sub-domain to query (e.g. ``'euw'``).

    Returns
    -------
    tuple or str
        ``(total, wins, losses, WinRate)`` as integers, or the string
        ``'Not enough games to get stats'`` when the page has no
        ``WinRatioTitle`` block, when one of the three counters is missing
        from it, or when fewer than 10 games are listed.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched (including after the 30 s timeout).
    ValueError
        If a sliced fragment does not contain a parsable number.
    IndexError
        If the page has no ``div`` with class ``Text`` to read the win rate from.
    """
    not_enough = 'Not enough games to get stats'

    username = username.replace(' ', '+') # Preparing the username
    # A timeout keeps one unresponsive page from hanging the notebook forever
    response = requests.get('http://{}.op.gg/summoner/userName={}'.format(region, username),
                            timeout=30)
    Summoner = BeautifulSoup(response.text, 'html.parser') # Getting the html of the website

    # Extracting the wanted data
    Total_games = Summoner.findAll('div', attrs={'class':'WinRatioTitle'})
    if len(Total_games) == 0:
        return not_enough

    # Start unset: a page missing one of the counters used to crash with
    # UnboundLocalError at the parsing step below.
    total = wins = losses = None
    for child in Total_games[0]:
        markup = str(child)
        if markup.count('total') != 0:
            total = markup
        elif markup.count('win') != 0:
            wins = markup
        elif markup.count('lose') != 0:
            losses = markup
    if total is None or wins is None or losses is None:
        return not_enough

    # The slices skip the opening tag; whatever closing-tag characters remain
    # in the window are stripped before converting to int.
    total = _markup_fragment_to_int(total[20:24])
    wins = _markup_fragment_to_int(wins[18:23])
    losses = _markup_fragment_to_int(losses[18:23])
    if total < 10:
        return not_enough

    # The win rate is read from the last div with class "Text" on the page
    text_divs = [str(div) for div in Summoner.findAll('div', attrs={"class":"Text"})]
    WinRate = text_divs[-1][18:22]
    WinRate = int(WinRate.replace('%','').replace('<',''))

    return total,wins,losses,WinRate

In [15]:
GetSummonerStats('steyex', 'euw')
# total, wins, losses, win rate

#### Using Selenium

In [19]:
from selenium import webdriver # Importing selenium

# Defining 3 functions to scrape the data off the second website using selenium instead.

# A function to scrape the summoner's statistics
def _parse_pie_chart_text(text):
    """Turn one pie-chart label into an int, else a float without its last character, else None."""
    try:
        return int(text)
    except (TypeError, ValueError):
        pass
    try:
        # Labels such as '51.1%' carry a one-character suffix
        return float(text[:-1])
    except (TypeError, ValueError):
        return None


def _champion_slug(champion):
    """Lower-case a champion name and remove dots, apostrophes and spaces for use in a URL."""
    return champion.lower().replace('.','').replace("'",'').replace(' ','')


def _scrape_pie_chart_stats(url, no_sandbox=True):
    """Open ``url`` in headless Chrome and return the numeric labels of its first ten pie charts."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    if no_sandbox:
        chrome_options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(executable_path='./chromedriver/chromedriver',chrome_options=chrome_options,
                              service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
    try:
        driver.get(url)
        elem = driver.find_element_by_id("mainContent")
        # Query the DOM once and copy the texts out while the browser is still open
        labels = [chart.text for chart in elem.find_elements_by_class_name("pie-chart")[:10]]
    finally:
        # quit() ends the whole session even when the page lookup fails;
        # close() alone only closes the window and was skipped on errors.
        driver.quit()
    parsed = (_parse_pie_chart_text(label) for label in labels)
    return [value for value in parsed if value is not None]


# A function to scrape the summoner's statistics
def Get_S_Stats(username, region='euw'):
    """Scrape a summoner's pie-chart statistics from leagueofgraphs.com.

    Parameters
    ----------
    username : str
        Summoner name; spaces are replaced by ``+`` for the URL.
    region : str, default 'euw'
        Region segment of the URL.

    Returns
    -------
    list or str
        The numeric pie-chart values in page order (ints, or floats for labels
        with a one-character suffix), or ``'Not enough games'`` when no numeric
        value was found.
    """
    username = username.replace(' ','+')
    stats = _scrape_pie_chart_stats(f"https://www.leagueofgraphs.com/summoner/{region}/{username}")
    # NOTE(review): the original compared stats[0] with 9 but returned stats in
    # both branches; that behaviour is kept. Confirm whether fewer than 10 games
    # was meant to yield 'Not enough games' as well.
    return stats if stats else 'Not enough games'

# A function to scrape the summoner's statistics specific to one champion
def Get_SC_Stats(username, champion, region='euw'):
    """Scrape a summoner's pie-chart statistics for one champion from leagueofgraphs.com.

    Parameters
    ----------
    username : str
        Summoner name; spaces are replaced by ``+`` for the URL.
    champion : str
        Champion name; lower-cased with dots, apostrophes and spaces removed.
    region : str, default 'euw'
        Region segment of the URL.

    Returns
    -------
    list or str
        The numeric pie-chart values in page order, or ``'Not enough games'``
        when no numeric value was found.
    """
    username = username.replace(' ','+')
    champion = _champion_slug(champion)
    stats = _scrape_pie_chart_stats(
        f"https://www.leagueofgraphs.com/summoner/performance/{champion}/{region}/{username}")
    # Same kept behaviour as Get_S_Stats: any non-empty result is returned as is
    return stats if stats else 'Not enough games'

# A function to scrape the champion's statistics
def Get_C_Stats(champion):
    """Scrape a champion's pie-chart statistics from leagueofgraphs.com.

    Parameters
    ----------
    champion : str
        Champion name; lower-cased with dots, apostrophes and spaces removed.

    Returns
    -------
    list
        The numeric pie-chart values in page order; empty when none was found.
    """
    champion = _champion_slug(champion)
    # This function never passed --no-sandbox to Chrome; that is preserved
    return _scrape_pie_chart_stats(f"https://www.leagueofgraphs.com/champions/stats/{champion}",
                                   no_sandbox=False)


print(Get_SC_Stats('riv rodus', 'Lux'))  # [Games Played, Winrate]
print(Get_C_Stats('Lux'))                # [Popularity, Winrate, BanRate, Mained by]

Not enough games
[12.0, 51.1, 1.2, 1.8]


## 2- Scraping new players/champions

In [157]:
# Scraping new usernames from the website

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome(executable_path='./chromedriver/chromedriver',chrome_options=chrome_options,
service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
try:
    driver.get('https://www.leagueofgraphs.com/rankings/summoners/euw')
    elem = driver.find_element_by_id("mainContent")
    # First ranking table inside the first box of the first "large-16" container
    ranking_table = (elem.find_elements_by_class_name("large-16")[0]
                         .find_elements_by_class_name('box')[0]
                         .find_elements_by_class_name('data_table')[0])
    # Copy the names out as plain strings while the page is still open
    new_usernames = [cell.text for cell in ranking_table.find_elements_by_class_name('name')]
finally:
    # Always shut the ranking-page browser down, even if a lookup above fails
    driver.quit()

# Get_S_Stats starts its own browser per call, so it runs only after the
# ranking-page browser has been closed. Entries are either a stats list or the
# string 'Not enough games' returned by Get_S_Stats.
New_players = [Get_S_Stats(username) for username in new_usernames]

In [160]:
len(New_players)

69

In [164]:
New_players[-5:] # [Games Played, Winrate]

[[1160, 54.5], [384, 65.6], [861, 55.7], [1589, 54.1], [2196, 52.0]]

In [20]:
# Zipping each player's username to their champion

# For every side and role, add one column of (username, champion) tuples:
# 'redTop' + 'redTopChamp' -> 'RedTop', ..., 'blueSupport' + 'blueSupportChamp' -> 'BlueSupport'.
# A single loop replaces ten copy-pasted stanzas. One of those paired 'redSupport'
# with itself instead of 'redSupportChamp'. The stanzas also overwrote the
# modelling variables X and y with plain lists; this loop leaves them alone.
for side in ('red', 'blue'):
    for role in ('Top', 'Jungle', 'Middle', 'ADC', 'Support'):
        username_column = side + role
        champion_column = username_column + 'Champ'
        LOL[side.capitalize() + role] = list(zip(LOL[username_column], LOL[champion_column]))

In [21]:
# Dropping the individual columns for the usernames and champions.
# errors='ignore' makes the cell safe to re-run and tolerant of columns that an
# earlier cell has already removed (a plain drop raises KeyError in both cases).
LOL = LOL.drop(columns=TEAMS, errors='ignore')

In [23]:
# Champion stats for the first (username, champion) pair of the red top lane only
for red_top in LOL['RedTop'].iloc[:1]:
    print(Get_C_Stats(red_top[1]),red_top)

# [2.7, 46.3, 0.6, 0.2] ('Balls', 'Gnar')

[2.6, 46.1, 0.6, 0.1] ('Balls', 'Gnar')


### Data engineering the newly scraped data

In [153]:
from functools import reduce
# Arithmetic mean of every inner sequence, summed left to right.
# NOTE(review): New_players_list is not defined in the visible cells (only
# New_players is) — confirm where it comes from before re-running.
New_players_list2 = [
    reduce(lambda running, value: running + value, values) / len(values)
    for values in New_players_list
]

In [154]:
sorted(New_players_list2) # Checking the average 

[53.17999999999999,
 53.52,
 53.96,
 54.28000000000001,
 54.580000000000005,
 55.05999999999999,
 55.31999999999999,
 55.52,
 55.739999999999995,
 56.32000000000001,
 57.44,
 58.720000000000006]