# Pokemon

### Introduction:

This time you will create the data.



### Step 1. Import the necessary libraries

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
import pandas as pd

# Get the Spark session for this notebook (created under the app name "Pokemon" if none exists yet).
spark = (
    SparkSession.builder
    .appName("Pokemon")
    .getOrCreate()
)

### Step 2. Create a data dictionary that looks like the DataFrame below

In [5]:
# Column-oriented source data: one list per column, all four lists aligned by position.
raw_data = dict(
    name=['Bulbasaur', 'Charmander', 'Squirtle', 'Caterpie'],
    evolution=['Ivysaur', 'Charmeleon', 'Wartortle', 'Metapod'],
    type=['grass', 'fire', 'water', 'bug'],
    hp=[45, 39, 44, 45],
    pokedex=['yes', 'no', 'yes', 'no'],
)

### Step 3. Assign it to a variable called pokemon

In [6]:
# Go through pandas first: it turns the dict of columns into a table that Spark can ingest.
pokemon_pdf = pd.DataFrame(raw_data)
pokemon = spark.createDataFrame(pokemon_pdf)

In [7]:
# Print the frame as a text table (show()'s default limits spelled out).
pokemon.show(n=20, truncate=True)

+----------+----------+-----+---+-------+
|      name| evolution| type| hp|pokedex|
+----------+----------+-----+---+-------+
| Bulbasaur|   Ivysaur|grass| 45|    yes|
|Charmander|Charmeleon| fire| 39|     no|
|  Squirtle| Wartortle|water| 44|    yes|
|  Caterpie|   Metapod|  bug| 45|     no|
+----------+----------+-----+---+-------+



### Step 4. Oops... it seems the DataFrame columns are not in the order we want. Reorder the columns as name, type, hp, evolution, pokedex

In [9]:
# Target column order requested by the exercise.
column_order = ['name', 'type', 'hp', 'evolution', 'pokedex']
pokemon = pokemon.select(*column_order)

In [10]:
# Check the new column order (show()'s default limits spelled out).
pokemon.show(n=20, truncate=True)

+----------+-----+---+----------+-------+
|      name| type| hp| evolution|pokedex|
+----------+-----+---+----------+-------+
| Bulbasaur|grass| 45|   Ivysaur|    yes|
|Charmander| fire| 39|Charmeleon|     no|
|  Squirtle|water| 44| Wartortle|    yes|
|  Caterpie|  bug| 45|   Metapod|     no|
+----------+-----+---+----------+-------+



### Step 5. Add another column called place, and insert what you have in mind.

In [84]:
import requests
from bs4 import BeautifulSoup
import re

def get_russian_name(x, timeout=10):
    """Look up the Russian name of a Pokemon on pokemon.fandom.com.

    Downloads ``https://pokemon.fandom.com/wiki/<x>`` and scans the text of the
    page's first ``<script>`` element for a ``ru:<Cyrillic letters>`` token.
    NOTE(review): that token is presumably the Russian interlanguage link, and only
    the first ``<script>`` is inspected -- confirm against the live page markup.

    Args:
        x: Pokemon name, used verbatim as the wiki page title.
        timeout: Seconds to wait for the HTTP response before ``requests`` raises.
            Without a timeout a stalled connection would block the caller forever.

    Returns:
        The Cyrillic word that follows ``ru:``, or ``None`` when ``x`` is empty or
        null, the server does not answer with a success status, or no such token
        is found.

    Raises:
        Whatever ``requests.get`` raises on network failure or timeout; these are
        deliberately not swallowed so that outages are not mistaken for missing names.
    """
    if not x:
        # Null/empty names (Spark hands nulls over as None) have no page to fetch.
        return None

    r = requests.get(f'https://pokemon.fandom.com/wiki/{x}', timeout=timeout)
    if not r.ok:
        # Error pages (404, 5xx, ...) are not Pokemon pages; do not try to parse them.
        return None

    soup = BeautifulSoup(r.text, 'html.parser')
    # 'ё'/'Ё' lie outside the contiguous А-я range and must be listed explicitly,
    # otherwise a name containing them is cut short or missed entirely.
    match = re.search(r'ru:([А-Яа-яЁё]+)', str(soup.find('script')))
    return match.group(1) if match else None

# Wrap the scraper as a Spark UDF; StringType is also the return type F.udf uses by default.
rus_pok = F.udf(
    lambda pokemon_name: get_russian_name(pokemon_name),
    returnType=T.StringType(),
)

In [85]:
# Build the UDF inside this cell instead of calling the previous `rus_pok` binding: the cell
# rebinds `rus_pok` to the resulting DataFrame, so a second run would try to call a DataFrame.
russian_name_udf = F.udf(get_russian_name, T.StringType())
# Cache the result: without it every later action (show/collect) repeats the HTTP lookups.
rus_pok = pokemon.withColumn('russian_name', russian_name_udf(F.col('name'))).cache()

In [86]:
# Print the frame with the new russian_name column (show()'s default limits spelled out).
rus_pok.show(n=20, truncate=True)

+----------+-----+---+----------+-------+------------+
|      name| type| hp| evolution|pokedex|russian_name|
+----------+-----+---+----------+-------+------------+
| Bulbasaur|grass| 45|   Ivysaur|    yes|  Бульбазавр|
|Charmander| fire| 39|Charmeleon|     no|   Чармандер|
|  Squirtle|water| 44| Wartortle|    yes|        null|
|  Caterpie|  bug| 45|   Metapod|     no|        null|
+----------+-----+---+----------+-------+------------+



### Step 6. Present the type of each column

In [87]:
# Print each column's name, Spark data type and nullability as a tree.
rus_pok.printSchema()

root
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- hp: long (nullable = true)
 |-- evolution: string (nullable = true)
 |-- pokedex: string (nullable = true)
 |-- russian_name: string (nullable = true)



### BONUS: Create your own question and answer it.

In [88]:
# Python UDF mapping the 'yes'/'no' strings to booleans; anything other than 'yes'
# (including null) becomes False.
yes_no_to_bool = F.udf(lambda answer: answer == 'yes', returnType=T.BooleanType())

In [89]:
# Convert only while `pokedex` is still the original yes/no string column. This cell rebinds
# `rus_pok`, so an unguarded re-run would feed booleans to the UDF and turn every value False.
if dict(rus_pok.dtypes)['pokedex'] == 'string':
    rus_pok = rus_pok.withColumn('pokedex', yes_no_to_bool('pokedex'))
# The frame has only a handful of rows, so pulling all of them to the driver is fine here.
rus_pok.collect()

[Row(name='Bulbasaur', type='grass', hp=45, evolution='Ivysaur', pokedex=True, russian_name='Бульбазавр'),
 Row(name='Charmander', type='fire', hp=39, evolution='Charmeleon', pokedex=False, russian_name='Чармандер'),
 Row(name='Squirtle', type='water', hp=44, evolution='Wartortle', pokedex=True, russian_name=None),
 Row(name='Caterpie', type='bug', hp=45, evolution='Metapod', pokedex=False, russian_name=None)]

In [90]:
# Print the schema again to confirm that `pokedex` is now reported as boolean.
rus_pok.printSchema()

root
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- hp: long (nullable = true)
 |-- evolution: string (nullable = true)
 |-- pokedex: boolean (nullable = true)
 |-- russian_name: string (nullable = true)

