# Web scraping to obtain League of Legends data
---
---

## Import relevant modules

In [1]:
import sys
import requests
import time
import random
import datetime
import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import chromedriver_binary

## Record date that data scraped
#### Last collected 2019-07-16 (FULL MOON)

In [2]:
# Get current date and time
now = datetime.datetime.now()
year_scraped  = str(now.year)
month_scraped = str(now.month)
day_scraped   = str(now.day)

# Add leading zeroes to single-digit months and days
if len(month_scraped) == 1:
    month_scraped = '0' + month_scraped
if len(day_scraped) == 1:
    day_scraped = '0' + day_scraped

# Construct date string
date_data = year_scraped + '-' + month_scraped + '-' + day_scraped

# Get champion names and release dates
---

In [3]:
champions_df = pd.read_html('https://leagueoflegends.fandom.com/wiki/List_of_champions')[1]
champions_df = champions_df[['Champion', 'Release Date']]
champions_df.columns = ['champion', 'release_date']
champions_df.head()

Unnamed: 0,champion,release_date
0,Aatrox the Darkin Blade,2013-06-13
1,Ahri the Nine-Tailed Fox,2011-12-14
2,Akali the Rogue Assassin,2010-05-11
3,Alistar the Minotaur,2009-02-21
4,Amumu the Sad Mummy,2009-06-26


### Get rid of champion titles

In [4]:
names = list(champions_df['champion'])
names = [s.split(',')[0] for s in names]
names = [s.split('\xa0the')[0] for s in names]
print(names[0:10])

['Aatrox', 'Ahri', 'Akali', 'Alistar', 'Amumu', 'Anivia', 'Annie', 'Ashe', 'Aurelion Sol', 'Azir']


# Get number of skins
---

In [5]:
# Set up selenium web driver
driver = webdriver.Chrome()

# Get number of skins
num_skins = []
for name in names:
    
    name = name.replace(' ', '_')
    skins_url = f'https://leagueoflegends.fandom.com/wiki/{name}/Skins'
    driver.get(skins_url)
    time.sleep(1)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    num_skins.append(len(soup.find_all('div', {'style':'display:inline-block; margin:5px; width:342px'})))

driver.close()
print(num_skins[:10])

[6, 11, 12, 13, 11, 9, 12, 11, 3, 5]


In [6]:
# Subtract 1 from number of skins for each champion because of base skin
for idx, skins in enumerate(num_skins):
    num_skins[idx] = skins - 1
print(num_skins[:10])

[5, 10, 11, 12, 10, 8, 11, 10, 2, 4]


# Get pick, win, and ban rates
---

### Set up selenium webdriver

In [7]:
champstats_url = 'https://na.op.gg/statistics/champion/'

driver = webdriver.Chrome()
driver.get(champstats_url)

### Select stats for current day, ranked queue, and for all ranks

In [8]:
# Ranked queue and all ranks are already default, but today isn't
today_button = driver.find_element_by_xpath('//*[@id="recent_today"]/span/span')
today_button.click()

### Get win rate data

In [9]:
winrate_button = driver.find_element_by_xpath('//*[@id="rate_win"]/span/span')
winrate_button.click()

In [10]:
winrate_df = pd.read_html(driver.page_source)[1]
winrate_df = winrate_df[['Champion.1', 'Win rate']]
winrate_df.columns = ['champion', 'winrate']
winrate_df.sort_values(by = 'champion', inplace = True)
winrate_df.head()

Unnamed: 0,champion,winrate
128,Aatrox,47.38%
11,Ahri,52.75%
144,Akali,42.30%
126,Alistar,47.45%
15,Amumu,52.60%


### Get ban rate data

In [11]:
banrate_button = driver.find_element_by_xpath('//*[@id="rate_ban"]/span/span')
banrate_button.click()

In [12]:
banrate_df = pd.read_html(driver.page_source)[1]
banrate_df = banrate_df[['Champion.1', 'Ban ratio per game']]
banrate_df.columns = ['champion', 'banrate']
banrate_df.sort_values(by = 'champion', inplace = True)
banrate_df.head()

Unnamed: 0,champion,banrate
10,Aatrox,23.69%
37,Ahri,7.55%
31,Akali,8.75%
95,Alistar,1.08%
63,Amumu,2.57%


### Get pick rate data

In [13]:
pickrate_button = driver.find_element_by_xpath('//*[@id="rate_pick"]/span/span')
pickrate_button.click()

In [14]:
pickrate_df = pd.read_html(driver.page_source)[1]
pickrate_df = pickrate_df[['Champion.1', 'Pick ratio per game']]
pickrate_df.columns = ['champion', 'pickrate']
pickrate_df.sort_values(by = 'champion', inplace = True)
pickrate_df.head()

Unnamed: 0,champion,pickrate
25,Aatrox,10.64%
21,Ahri,11.22%
44,Akali,8.29%
104,Alistar,3.62%
65,Amumu,6.32%


In [15]:
driver.close()

# Get patch when champion was last changed

In [16]:
# Set up selenium web driver
driver = webdriver.Chrome()

# Get patch when champion was last changed
last_patch = []
for name in names:
    
    name = name.replace(' ', '_')
    champ_url = f'https://lol.gamepedia.com/{name}#Patch_History'
    driver.get(champ_url)
    #time.sleep(1)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    history = [link for link in soup.find_all('a')
                 if '>v1.' in str(link) or 'Patch 1.' in str(link)
                 or '>v2.' in str(link) or 'Patch 2.' in str(link)
                 or '>v3.' in str(link) or 'Patch 3.' in str(link)
                 or '>v4.' in str(link) or 'Patch 4.' in str(link)
                 or '>v5.' in str(link) or 'Patch 5.' in str(link)
                 or '>v6.' in str(link) or 'Patch 6.' in str(link)
                 or '>v7.' in str(link) or 'Patch 7.' in str(link)
                 or '>v8.' in str(link) or 'Patch 8.' in str(link)
                 or '>v9.' in str(link) or 'Patch 9.' in str(link)]

    most_recent = history[0]
    most_recent = str(most_recent)[-8:-4]
    last_patch.append(most_recent)
    
driver.close()
print(last_patch[0:20])

['9.12', '8.20', '9.11', '8.21', '9.12', '8.24', '8.17', '9.12', ' 9.2', ' 9.7', '8.17', ' 9.9', '9.10', '8.18', '9.12', ' 9.8', ' 9.8', ' 9.7', '9.13', ' 9.9']


In [17]:
for idx, patch in enumerate(last_patch):
    last_patch[idx] = patch.replace('v','')
for idx, patch in enumerate(last_patch):
    last_patch[idx] = patch.replace(' ', '')
print(last_patch[:20])

['9.12', '8.20', '9.11', '8.21', '9.12', '8.24', '8.17', '9.12', '9.2', '9.7', '8.17', '9.9', '9.10', '8.18', '9.12', '9.8', '9.8', '9.7', '9.13', '9.9']


# Construct fully scraped data frame
---

In [18]:
# Create lists from columns of scraped data frames
release_date = list(champions_df['release_date'])
win_rate = list(winrate_df['winrate'])
ban_rate = list(banrate_df['banrate'])
pick_rate = list(pickrate_df['pickrate'])

In [19]:
# Create full data frame
data = list(zip(names, release_date, last_patch, num_skins, win_rate, ban_rate, pick_rate))
colnames = ['champion', 'release_date', 'last_patch', 'num_skins', 'win_rate', 'ban_rate', 'pick_rate']
scraped_df = pd.DataFrame(data, columns = colnames) 

In [20]:
scraped_df['date_data'] = date_data
scraped_df.head(20)

Unnamed: 0,champion,release_date,last_patch,num_skins,win_rate,ban_rate,pick_rate,date_data
0,Aatrox,2013-06-13,9.12,5,47.38%,23.69%,10.64%,2019-07-17
1,Ahri,2011-12-14,8.2,10,52.75%,7.55%,11.22%,2019-07-17
2,Akali,2010-05-11,9.11,11,42.30%,8.75%,8.29%,2019-07-17
3,Alistar,2009-02-21,8.21,12,47.45%,1.08%,3.62%,2019-07-17
4,Amumu,2009-06-26,9.12,10,52.60%,2.57%,6.32%,2019-07-17
5,Anivia,2009-07-10,8.24,8,50.65%,1.34%,3.31%,2019-07-17
6,Annie,2009-02-21,8.17,11,51.05%,1.41%,3.46%,2019-07-17
7,Ashe,2009-02-21,9.12,10,51.78%,2.17%,15.13%,2019-07-17
8,Aurelion Sol,2016-03-24,9.2,2,50.95%,0.30%,0.93%,2019-07-17
9,Azir,2014-09-16,9.7,4,46.96%,0.80%,3.52%,2019-07-17


### Write data frame to csv file

In [21]:
filename = f'scraped_data_{date_data}.csv'
filedir = f'/Users/jeremy_lehner/Documents/GitHub/metis_project2/data/raw/{filename}'

scraped_df.to_csv (filedir, index = None, header = True)