# Scraping and Populating Data Frame

In [3]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
from IPython.core.display import HTML

In [6]:
# Download the complete Pokedex listing and parse it for table extraction.
url = 'http://pokemondb.net/pokedex/all'
# A timeout keeps the cell from hanging indefinitely if the site is unreachable.
req = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
req.raise_for_status()
page = req.text
soup = BeautifulSoup(page, 'html.parser')

In [7]:
# HTML(str(soup.table))

AttributeError: 'HTML' object has no attribute 'head'

In [9]:
# Collect every <tr> of the first table on the page; element 0 is the header row.
rows = list(soup.find('table').find_all('tr'))
# Spot-check a single data row by position.
rows[800]

<tr>\n<td class="num cell-icon-string" data-sort-value="721"><i class="pki" data-sprite="pkiAll n721"></i> 721</td> <td class="cell-icon-string"><a class="ent-name" href="/pokedex/volcanion" title="View pokedex for #721 Volcanion">Volcanion</a></td><td class="cell-icon"><a class="type-icon type-fire" href="/type/fire">Fire</a><br><a class="type-icon type-water" href="/type/water">Water</a></br></td>\n<td class="num-total">600</td>\n<td class="num">80</td>\n<td class="num">110</td>\n<td class="num">120</td>\n<td class="num">130</td>\n<td class="num">90</td>\n<td class="num">70</td>\n</tr>

In [10]:
# Header labels come from the first row's <th> cells; blank labels are dropped.
header_texts = (th.get_text() for th in rows[0].find_all('th'))
columns = [text for text in header_texts if text != '']
columns

[u'#',
 u'Name',
 u'Type',
 u'Total',
 u'HP',
 u'Attack',
 u'Defense',
 u'Sp. Atk',
 u'Sp. Def',
 u'Speed']

In [11]:
# Zero-based row labels: one per table row, not counting the header row.
indexes = list(range(len(rows) - 1))

In [12]:
# Flatten the table: the text of each <td>, row after row, header row skipped.
values = []
for row in rows[1:]:
    values.extend(cell.get_text() for cell in row.find_all('td'))

In [14]:
# Regroup the flat cell list into one tuple per table row.  Slicing with a
# stride equal to the number of columns yields each column's values, and
# zip(*...) transposes those column slices back into rows.  The stride is
# derived from `columns` rather than hard-coded so the two cannot disagree.
n_columns = len(columns)
stacked_values = list(zip(*[values[i::n_columns] for i in range(n_columns)]))

In [15]:
# Assemble the per-row tuples into a frame labelled with the scraped headers.
df = pd.DataFrame(data=stacked_values, index=indexes, columns=columns)
df.head()

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,GrassPoison,318,45,49,49,65,65,45
1,2,Ivysaur,GrassPoison,405,60,62,63,80,80,60
2,3,Venusaur,GrassPoison,525,80,82,83,100,100,80
3,3,VenusaurMega Venusaur,GrassPoison,625,80,100,123,122,120,80
4,4,Charmander,Fire,309,39,52,43,60,50,65


# Data Cleaning

Time to clean the data we just scraped. For example, as you can see in the 4th row, the Name of the Pokemon is VenusaurMega Venusaur. Similarly, every row for a dual-type Pokemon has its type stored as Type1Type2.

We do not want that. Hence, we clean it!

In [16]:
# Replaces Word1Word2 Word3 with Word1 ( Word2 Word3 )
def clean_name(name):
    '''Separate a base name from a form name that was fused onto it.

    The scraped table concatenates a Pokemon's base name and its form name
    with no separator, e.g. 'VenusaurMega Venusaur'.  The split point is the
    first uppercase letter that directly follows a lowercase letter.  A
    capital preceded by a space, dot or hyphen (as in 'Mr. Mime' or
    'Porygon-Z') belongs to an ordinary name and is left alone.

    Parameters
    ----------
    name : str
        Name text as scraped.

    Returns
    -------
    str
        'Base ( Form )' when a fused form name is found, with runs of
        whitespace inside the form collapsed to single spaces; otherwise
        `name` unchanged.
    '''
    for i in range(1, len(name)):
        if name[i].isupper() and name[i - 1].islower():
            base, form = name[:i], name[i:]
            # split()/join normalises whitespace so the result only ever
            # contains single spaces between words.
            return base + ' ( ' + ' '.join(form.split()) + ' )'
    return name

# Splits Type1Type2.. into the list [Type1, Type2, ..]
def clean_types(types):
    '''Split a fused type string such as 'GrassPoison' into a list of types.

    A space is inserted in front of every uppercase character except the
    first character of the string, and the result is split on single spaces.

    Parameters
    ----------
    types : str
        Concatenated type names as scraped.

    Returns
    -------
    list of str
        The individual pieces, e.g. ['Grass', 'Poison'].
    '''
    head, tail = types[:1], types[1:]
    spaced = head + ''.join(' ' + ch if ch.isupper() else ch for ch in tail)
    return spaced.split(' ')


    

In [17]:
# Run each text column through its matching cleaner, overwriting it in place.
for column, cleaner in (('Name', clean_name), ('Type', clean_types)):
    df[column] = df[column].apply(cleaner)
df.head()

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,"[Grass, Poison]",318,45,49,49,65,65,45
1,2,Ivysaur,"[Grass, Poison]",405,60,62,63,80,80,60
2,3,Venusaur,"[Grass, Poison]",525,80,82,83,100,100,80
3,3,Venusaur ( Mega Venusaur ),"[Grass, Poison]",625,80,100,123,122,120,80
4,4,Charmander,[Fire],309,39,52,43,60,50,65


In [18]:
# Write the cleaned frame to PokemonData.json (path relative to the working
# directory).  The next cell reads this file back.
# NOTE(review): the round trip presumably also serves to turn the scraped
# numeric strings into numbers via read_json's type inference -- confirm.
df.to_json('PokemonData.json')

In [19]:
# Load the saved JSON back and use the '#' column as the index.
df = pd.read_json('PokemonData.json').set_index(['#'])

# Analysis

Time to analyse the data we just scraped and structured!

In [20]:
def max_stats(df, columns):
    '''Report, for each listed column, the Pokemon with the highest value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Name' column and the columns named in `columns`.
    columns : iterable of str
        Column names to summarise.

    Returns
    -------
    str
        One newline-terminated sentence per column.  When several rows share
        the maximum, the first such row in frame order is the one named.
    '''
    sentences = []
    for col in columns:
        highest = df[col].max()
        holders = df[df[col] == highest]['Name'].values
        sentences.append(
            holders[0] + ' has the greatest ' + col + ' of ' + str(highest) + '.\n'
        )
    return ''.join(sentences)

def min_stats(df, col_list):
    '''Report, for each listed column, the Pokemon with the lowest value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Name' column and the columns named in `col_list`.
    col_list : iterable of str
        Column names to summarise.

    Returns
    -------
    str
        One newline-terminated sentence per column.  When several rows share
        the minimum, the first such row in frame order is the one named.
    '''
    sentences = []
    for col in col_list:
        lowest = df[col].min()
        holders = df[df[col] == lowest]['Name'].values
        sentences.append(
            holders[0] + ' has the worst ' + col + ' of ' + str(lowest) + '.\n'
        )
    return ''.join(sentences)


In [21]:
# Columns to summarise with the extreme-value reports below.
stats = ['Attack', 'Defense', 'HP', 'Sp. Atk', 'Sp. Def', 'Speed', 'Total']

# A parenthesised single-argument print behaves the same on Python 2 and is
# valid on Python 3, where the bare print statement is a SyntaxError.
print(max_stats(df, stats))

print(min_stats(df, stats))

Mewtwo ( Mega  Mewtwo  X ) has the greatest Attack of 190.
Steelix ( Mega  Steelix ) has the greatest Defense of 230.
Blissey has the greatest HP of 255.
Mewtwo ( Mega  Mewtwo  Y ) has the greatest Sp. Atk of 194.
Shuckle has the greatest Sp. Def of 230.
Deoxys ( Speed  Forme ) has the greatest Speed of 180.
Mewtwo ( Mega  Mewtwo  X ) has the greatest Total of 780.

Chansey has the worst Attack of 5.
Chansey has the worst Defense of 5.
Shedinja has the worst HP of 1.
Shuckle has the worst Sp. Atk of 10.
Caterpie has the worst Sp. Def of 20.
Shuckle has the worst Speed of 5.
Sunkern has the worst Total of 180.

