In [60]:
import os
import re
import sqlite3
import warnings
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
warnings.filterwarnings("ignore")

### Getting player ids of verified players from Dotabuff

```python
# Download the Dotabuff players page and collect the player ids it links to.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/'
    '537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
# A timeout keeps this cell from hanging forever on a stalled connection.
source = requests.get('https://www.dotabuff.com/players', headers=headers,
                      timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page into
# an empty id list that would turn every scraping loop into a silent no-op.
source.raise_for_status()
soup = BeautifulSoup(source.text, 'lxml')
# Each href is split on "/" and element 2 is kept as the player id
# (presumably hrefs look like /players/<id> -- confirm against the page).
pids = [link['href'].split("/")[2]
        for link in soup.select('td a.link-type-player')]
```

### Pick up where we left off

In [164]:
def pickup_impact(pid_list):
    """Return the player ids from ``pid_list`` not yet stored in ``Impact``.

    Reads the distinct ``PID`` values of the ``Impact`` table through the
    module-level ``conn`` connection.

    Parameters
    ----------
    pid_list : list
        Candidate player ids.

    Returns
    -------
    list
        The ids of ``pid_list`` that have no row in ``Impact``, in their
        original order. If the table cannot be read (for example because it
        does not exist yet), ``pid_list`` itself is returned unchanged.
    """
    try:
        stored = pd.read_sql('''SELECT DISTINCT PID FROM Impact''',
                             conn)['PID']
    except Exception:
        # Best-effort resume: an unreadable table means nothing was saved yet.
        return pid_list
    # Compare as strings so ids read back with a numeric dtype still match,
    # and use a set so each membership test is O(1).
    done = {str(pid) for pid in stored}
    # Filter the argument itself rather than a same-purpose global.
    return [pid for pid in pid_list if str(pid) not in done]

In [10]:
def pickup_economy(pid_list):
    """Return the player ids from ``pid_list`` not yet stored in ``Economy``.

    Reads the distinct ``PID`` values of the ``Economy`` table through the
    module-level ``conn`` connection.

    Parameters
    ----------
    pid_list : list
        Candidate player ids.

    Returns
    -------
    list
        The ids of ``pid_list`` that have no row in ``Economy``, in their
        original order. If the table cannot be read (for example because it
        does not exist yet), ``pid_list`` itself is returned unchanged.
    """
    try:
        stored = pd.read_sql('''SELECT DISTINCT PID FROM Economy''',
                             conn)['PID']
    except Exception:
        # Best-effort resume: an unreadable table means nothing was saved yet.
        return pid_list
    # Compare as strings so ids read back with a numeric dtype still match,
    # and use a set so each membership test is O(1).
    done = {str(pid) for pid in stored}
    # Filter the argument itself rather than a same-purpose global.
    return [pid for pid in pid_list if str(pid) not in done]

In [11]:
def pickup_played(pid_list):
    """Return the player ids from ``pid_list`` not yet stored in ``Played``.

    Reads the distinct ``PID`` values of the ``Played`` table through the
    module-level ``conn`` connection.

    Parameters
    ----------
    pid_list : list
        Candidate player ids.

    Returns
    -------
    list
        The ids of ``pid_list`` that have no row in ``Played``, in their
        original order. If the table cannot be read (for example because it
        does not exist yet), ``pid_list`` itself is returned unchanged.
    """
    try:
        stored = pd.read_sql('''SELECT DISTINCT PID FROM Played''',
                             conn)['PID']
    except Exception:
        # Best-effort resume: an unreadable table means nothing was saved yet.
        return pid_list
    # Compare as strings so ids read back with a numeric dtype still match,
    # and use a set so each membership test is O(1).
    done = {str(pid) for pid in stored}
    # Filter the argument itself rather than a same-purpose global.
    return [pid for pid in pid_list if str(pid) not in done]

In [12]:
def pickup_itemeco(pid_list):
    """Return the player ids from ``pid_list`` not yet stored in ``ItemEco``.

    Reads the distinct ``PID`` values of the ``ItemEco`` table through the
    module-level ``conn`` connection.

    Parameters
    ----------
    pid_list : list
        Candidate player ids.

    Returns
    -------
    list
        The ids of ``pid_list`` that have no row in ``ItemEco``, in their
        original order. If the table cannot be read (for example because it
        does not exist yet), ``pid_list`` itself is returned unchanged.
    """
    try:
        stored = pd.read_sql('''SELECT DISTINCT PID FROM ItemEco''',
                             conn)['PID']
    except Exception:
        # Best-effort resume: an unreadable table means nothing was saved yet.
        return pid_list
    # Compare as strings so ids read back with a numeric dtype still match,
    # and use a set so each membership test is O(1).
    done = {str(pid) for pid in stored}
    # Filter the argument itself rather than a same-purpose global.
    return [pid for pid in pid_list if str(pid) not in done]

In [163]:
def pickup_itemmain(pid_list):
    """Return the player ids from ``pid_list`` not yet stored in ``ItemMain``.

    Reads the distinct ``PID`` values of the ``ItemMain`` table through the
    module-level ``conn`` connection.

    Parameters
    ----------
    pid_list : list
        Candidate player ids.

    Returns
    -------
    list
        The ids of ``pid_list`` that have no row in ``ItemMain``, in their
        original order. If the table cannot be read (for example because it
        does not exist yet), ``pid_list`` itself is returned unchanged.
    """
    try:
        stored = pd.read_sql('''SELECT DISTINCT PID FROM ItemMain''',
                             conn)['PID']
    except Exception:
        # Best-effort resume: an unreadable table means nothing was saved yet.
        return pid_list
    # Compare as strings so ids read back with a numeric dtype still match,
    # and use a set so each membership test is O(1).
    done = {str(pid) for pid in stored}
    # Filter the argument itself rather than a same-purpose global.
    return [pid for pid in pid_list if str(pid) not in done]

## Scraping using Selenium

In [2]:
# Location of the chromedriver executable handed to Selenium. Set the
# CHROMEDRIVER_PATH environment variable to run on another machine; the
# original hard-coded location is kept as the fallback.
chrome_path = os.environ.get(
    'CHROMEDRIVER_PATH', r'C:\Users\jedda\Downloads\chromedriver.exe')
# SQLite database that receives every scraped table.
conn = sqlite3.connect('dota.db')

### Impact

```python
# Scrape the heroes table (metric=impact) of every player that has no rows in
# the Impact table yet, appending one batch of rows per player.
pids_impact = pickup_impact(pids)
# Column labels for the scraped rows; loop-invariant, so defined once.
header = ['Hero', 'Date', 'KDA Ratio', 'Kills', 'Death', 'Assists']
driver = webdriver.Chrome(chrome_path)
try:
    for p in pids_impact:
        driver.get(
            'https://www.dotabuff.com/players/'+p+'/heroes?metric=impact')
        query = driver.find_elements_by_xpath(
            '/html/body/div[1]/div[8]/div[3]/section/article/table/tbody/*')
        # Each matched element's text is split into one cell per line.
        rows = [tuple(i.text.split('\n')) for i in query]
        impactdb = pd.DataFrame(rows, columns=header).drop(columns='Date')
        impactdb['PID'] = p
        impactdb.to_sql('Impact', conn, if_exists='append')
        # Pause between page loads.
        sleep(2)
finally:
    # Always shut the browser down, even when a page fails to load or parse,
    # so an interrupted run does not leave a browser session behind.
    driver.quit()
```

### Economy

```python
# Scrape the heroes table (metric=economy) of every player that has no rows
# in the Economy table yet, appending one batch of rows per player.
pids_economy = pickup_economy(pids)
# Column labels for the scraped rows; loop-invariant, so defined once.
header = ['Hero', 'Date', 'GPM', 'XPM']
driver = webdriver.Chrome(chrome_path)
try:
    for p in pids_economy:
        driver.get(
            'https://www.dotabuff.com/players/'+p+'/heroes?metric=economy')
        query = driver.find_elements_by_xpath(
            '/html/body/div[1]/div[8]/div[3]/section/article/table/tbody/*')
        # Each matched element's text is split into one cell per line.
        rows = [tuple(i.text.split('\n')) for i in query]
        economydb = pd.DataFrame(rows, columns=header).drop(columns='Date')
        economydb['PID'] = p
        economydb.to_sql('Economy', conn, if_exists='append')
        # Pause between page loads.
        sleep(2)
finally:
    # Always shut the browser down, even when a page fails to load or parse,
    # so an interrupted run does not leave a browser session behind.
    driver.quit()
```

### Played

```python
# Scrape the heroes table (metric=played) of every player that has no rows in
# the Played table yet, appending one batch of rows per player.
pids_played = pickup_played(pids)
# Column labels for the scraped rows; loop-invariant, so defined once.
header = ['Hero', 'Date', 'Matches',
          'Win Rate', 'KDA Ratio', 'Core', 'Safe']
driver = webdriver.Chrome(chrome_path)
try:
    for p in pids_played:
        driver.get(
            'https://www.dotabuff.com/players/'+p+'/heroes?metric=played')
        query = driver.find_elements_by_xpath(
            '/html/body/div[1]/div[8]/div[3]/section/article/table/tbody/*')
        # Each matched element's text is split into one cell per line.
        rows = [tuple(i.text.split('\n')) for i in query]
        playeddb = pd.DataFrame(rows, columns=header).drop(
            columns=['Date', 'Core', 'Safe'])
        playeddb['PID'] = p
        playeddb.to_sql('Played', conn, if_exists='append')
        # Pause between page loads.
        sleep(2)
finally:
    # Always shut the browser down, even when a page fails to load or parse,
    # so an interrupted run does not leave a browser session behind.
    driver.quit()
```

### Item Economy

```python
# Scrape the items table (metric=economy) of every player that has no rows in
# the ItemEco table yet, appending one batch of rows per player.
pids_itemeco = pickup_itemeco(pids)
# Column labels for the parsed rows; loop-invariant, so defined once.
header = ['Item', 'GPM', 'XPM']
driver = webdriver.Chrome(chrome_path)
try:
    for p in pids_itemeco:
        driver.get(
            'https://www.dotabuff.com/players/'+p+'/items?metric=economy')
        query = driver.find_elements_by_xpath(
            '/html/body/div[1]/div[8]/div[3]/section/article/table/tbody/*')
        raw = [i.text.split('\n') for i in query]
        # The first text line carries the item name followed by a trailing
        # number; the regex splits it into (name, number) and the second
        # text line supplies the last column.
        rows = [[(i, j, r[1]) for i, j in re.findall(
            r'(.*)\s(.*?\d+)$', r[0])][0] for r in raw]
        itemecodb = pd.DataFrame(rows, columns=header)
        itemecodb['PID'] = p
        itemecodb.to_sql('ItemEco', conn, if_exists='append')
        # Pause between page loads.
        sleep(2)
finally:
    # Always shut the browser down, even when a page fails to load or parse,
    # so an interrupted run does not leave a browser session behind.
    driver.quit()
```

### Item Main

```python
# Scrape the items table (metric=used) of every player that has no rows in
# the ItemMain table yet, appending one batch of rows per player.
pids_itemmain = pickup_itemmain(pids)
# Column labels for the parsed rows; loop-invariant, so defined once.
header = ['Item', 'Matches', 'WinRate', 'KDA_Ratio']
driver = webdriver.Chrome(chrome_path)
try:
    for p in pids_itemmain:
        driver.get('https://www.dotabuff.com/players/'+p+'/items?metric=used')
        query = driver.find_elements_by_xpath(
            '/html/body/div[1]/div[8]/div[3]/section/article/table/tbody/*')
        # Only the first three text lines of each row are used.
        raw = [i.text.split('\n')[:3] for i in query]
        # The first text line carries the item name followed by a trailing
        # number; the regex splits it into (name, number) and a trailing
        # " (level N)" suffix is stripped from the name. The next two text
        # lines supply the remaining columns.
        rows = [[(re.sub(r'\s\(level \d+\)$', '', i), j, r[1], r[2])
                for i, j in re.findall(r'(.*)\s(.*?\d+)$', r[0])][0]
                for r in raw]
        itemmaindb = pd.DataFrame(rows, columns=header)
        itemmaindb['PID'] = p
        itemmaindb.to_sql('ItemMain', conn, if_exists='append')
        # Pause between page loads.
        sleep(2)
finally:
    # Always shut the browser down, even when a page fails to load or parse,
    # so an interrupted run does not leave a browser session behind.
    driver.quit()
```

### Cleaning up data (stripping commas and % signs, deleting removed heroes and unwanted items)

In [166]:
# Normalise the scraped text columns in place: strip thousands separators and
# percent signs, and delete rows for removed heroes and unwanted items.
# String literals use single quotes throughout; SQLite only accepts
# double-quoted strings through a legacy fallback.
conn.executescript('''
            UPDATE Impact
            SET Kills = REPLACE(Kills, ',', ''),
                Death = REPLACE(Death, ',', ''),
                Assists = REPLACE(Assists, ',', '');

            DELETE FROM Impact
            WHERE Hero LIKE 'Removed%';

            UPDATE Economy
            SET GPM = REPLACE(GPM, ',', ''),
                XPM = REPLACE(XPM, ',', '');

            DELETE FROM Economy
            WHERE Hero LIKE 'Removed%';

            UPDATE Played
            SET Matches = REPLACE(Matches, ',', ''),
                "Win Rate" = REPLACE("Win Rate", '%', '');

            DELETE FROM Played
            WHERE Hero LIKE 'Removed%';

            UPDATE ItemEco
            SET GPM = REPLACE(GPM, ',', ''),
                XPM = REPLACE(XPM, ',', '');

            DELETE FROM ItemEco
            WHERE Item LIKE 'Removed%' OR Item LIKE 'Recipe%'
            OR Item LIKE 'Refresher Shard%' OR Item LIKE 'River Vial%';

            -- ItemMain names its column WinRate (no space). Referring to it
            -- as "Win Rate" made SQLite fall back to the string literal
            -- 'Win Rate' and overwrite every value with that text.
            UPDATE ItemMain
            SET Matches = REPLACE(Matches, ',', ''),
                WinRate = REPLACE(WinRate, '%', '');

            DELETE FROM ItemMain
            WHERE Item LIKE 'Removed%' OR Item LIKE 'Recipe%'
            OR Item LIKE 'Refresher Shard%' OR Item LIKE 'River Vial%';
            ''')
conn.commit();

# ITAY: here are the tables to copy:
## Please double-check the connection names (`conn`, `connfinal`)

In [None]:
# Copy every scraped table from `conn` into `connfinal`, storing each one
# under its lowercase name. `connfinal` must already be an open connection.
for source_table, target_table in (
        ('Impact', 'impact'),
        ('Economy', 'economy'),
        ('Played', 'played'),
        ('ItemEco', 'itemeco'),
        ('ItemMain', 'itemmain')):
    table_df = pd.read_sql('SELECT * FROM ' + source_table, conn)
    table_df.to_sql(target_table, connfinal)