In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Obtain the SparkSession for this notebook: local master, fixed application name.
# The surrounding parentheses already allow line breaks, so no backslashes are needed.
spark = (
    SparkSession.builder
    .master('local')
    .appName('df_fifa_world_cup_3')
    .getOrCreate()
)

In [3]:
# Load the players CSV: the first row supplies the column names and the
# column types are inferred from the data rather than declared up front.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('arquivo/wc2018-players.csv')
)

In [4]:
# Print the column names and inferred types of the freshly loaded frame.
df.printSchema()

root
 |-- Team: string (nullable = true)
 |-- #: integer (nullable = true)
 |-- Pos.: string (nullable = true)
 |-- FIFA Popular Name: string (nullable = true)
 |-- Birth Date: string (nullable = true)
 |-- Shirt Name: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)



In [5]:
# Replace the original CSV headers with Portuguese, identifier-friendly names
# (no spaces, '#' or '.'), one rename per source column.
df = (
    df
    .withColumnRenamed('Team', 'Selecao')
    .withColumnRenamed('#', 'Numero')
    .withColumnRenamed('Pos.', 'Posicao')
    .withColumnRenamed('FIFA Popular Name', 'Nome_FIFA')
    .withColumnRenamed('Birth Date', 'Data_Nascimento')
    .withColumnRenamed('Shirt Name', 'Nome_Camiseta')
    .withColumnRenamed('Club', 'Time')
    .withColumnRenamed('Height', 'Altura')
    .withColumnRenamed('Weight', 'Peso')
)

In [6]:
# Preview the first five rows under the renamed columns.
df.show(5)

+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+
|  Selecao|Numero|Posicao|         Nome_FIFA|Data_Nascimento|Nome_Camiseta|                Time|Altura|Peso|
+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|     31.08.1992|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65|
|Argentina|    22|     MF|    PAVON Cristian|     21.01.1996|        PAVÓN|CA Boca Juniors (...|   169|  65|
|Argentina|    15|     MF|    LANZINI Manuel|     15.02.1993|      LANZINI|West Ham United F...|   167|  66|
|Argentina|    18|     DF|    SALVIO Eduardo|     13.07.1990|       SALVIO|    SL Benfica (POR)|   167|  69|
|Argentina|    10|     FW|      MESSI Lionel|     24.06.1987|        MESSI|  FC Barcelona (ESP)|   170|  72|
+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+
only showing top 5 rows

In [7]:
# Print the schema again to check the column names and types after the renames.
df.printSchema()

root
 |-- Selecao: string (nullable = true)
 |-- Numero: integer (nullable = true)
 |-- Posicao: string (nullable = true)
 |-- Nome_FIFA: string (nullable = true)
 |-- Data_Nascimento: string (nullable = true)
 |-- Nome_Camiseta: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Altura: integer (nullable = true)
 |-- Peso: integer (nullable = true)



In [9]:
# Data_Nascimento is a 'dd.MM.yyyy' string (e.g. 31.08.1992). Slice it into its
# parts using negative start positions, which count back from the end of the string.
df = (
    df
    .withColumn('Ano', substring('Data_Nascimento', -4, 4))    # last 4 chars: year
    .withColumn('Mes', substring('Data_Nascimento', -7, 2))    # 2 chars before '.yyyy': month
    .withColumn('Dia', substring('Data_Nascimento', -10, 2))   # first 2 chars of the 10: day
)

In [10]:
# Preview the first five rows, now including the Ano, Mes and Dia columns.
df.show(5)

+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+----+---+---+
|  Selecao|Numero|Posicao|         Nome_FIFA|Data_Nascimento|Nome_Camiseta|                Time|Altura|Peso| Ano|Mes|Dia|
+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+----+---+---+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|     31.08.1992|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65|1992| 08| 31|
|Argentina|    22|     MF|    PAVON Cristian|     21.01.1996|        PAVÓN|CA Boca Juniors (...|   169|  65|1996| 01| 21|
|Argentina|    15|     MF|    LANZINI Manuel|     15.02.1993|      LANZINI|West Ham United F...|   167|  66|1993| 02| 15|
|Argentina|    18|     DF|    SALVIO Eduardo|     13.07.1990|       SALVIO|    SL Benfica (POR)|   167|  69|1990| 07| 13|
|Argentina|    10|     FW|      MESSI Lionel|     24.06.1987|        MESSI|  FC Barcelona (ESP)|   170|  72|1987| 06| 24|
+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+----+---+---+

In [13]:
# Join year, month and day with '-' into a 'yyyy-MM-dd' string and cast it,
# so Nascimento is a real date column instead of text.
df = df.withColumn(
    'Nascimento',
    concat_ws('-', 'Ano', 'Mes', 'Dia').cast('date'),
)

In [19]:
# Preview the first five rows with the new Nascimento column.
# NOTE(review): the saved output of this cell has no Data_Nascimento column, yet that
# column is only dropped in a later cell - the notebook appears to have been executed
# out of order. Re-run top to bottom (Restart & Run All) to refresh the outputs.
df.show(5)
# A bare trailing expression displays the DataFrame repr (column names and types).
df

+---------+------+-------+------------------+-------------+--------------------+------+----+----+---+---+----------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome_Camiseta|                Time|Altura|Peso| Ano|Mes|Dia|Nascimento|
+---------+------+-------+------------------+-------------+--------------------+------+----+----+---+---+----------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65|1992| 08| 31|1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|        PAVÓN|CA Boca Juniors (...|   169|  65|1996| 01| 21|1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|      LANZINI|West Ham United F...|   167|  66|1993| 02| 15|1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|       SALVIO|    SL Benfica (POR)|   167|  69|1990| 07| 13|1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|        MESSI|  FC Barcelona (ESP)|   170|  72|1987| 06| 24|1987-06-24|
+---------+------+-------+------------------+-------------+--------------------+------+----+----+---+---+----------+

DataFrame[Selecao: string, Numero: int, Posicao: string, Nome_FIFA: string, Nome_Camiseta: string, Time: string, Altura: int, Peso: int, Ano: string, Mes: string, Dia: string, Nascimento: date]

In [20]:
# The original text date is no longer needed once Nascimento exists; drop it.
df = df.drop('Data_Nascimento')
# Print the schema to confirm the column is gone and Nascimento is typed as date.
df.printSchema()

root
 |-- Selecao: string (nullable = true)
 |-- Numero: integer (nullable = true)
 |-- Posicao: string (nullable = true)
 |-- Nome_FIFA: string (nullable = true)
 |-- Nome_Camiseta: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Altura: integer (nullable = true)
 |-- Peso: integer (nullable = true)
 |-- Ano: string (nullable = true)
 |-- Mes: string (nullable = true)
 |-- Dia: string (nullable = true)
 |-- Nascimento: date (nullable = true)



In [22]:
# Remove the intermediate Ano/Mes/Dia helper columns.
df = df.drop('Ano', 'Mes', 'Dia')
# A bare trailing expression displays the DataFrame repr (remaining columns and types).
df

DataFrame[Selecao: string, Numero: int, Posicao: string, Nome_FIFA: string, Nome_Camiseta: string, Time: string, Altura: int, Peso: int, Nascimento: date]

In [24]:
# Preview the first five rows of the cleaned frame.
df.show(5)

+---------+------+-------+------------------+-------------+--------------------+------+----+----------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome_Camiseta|                Time|Altura|Peso|Nascimento|
+---------+------+-------+------------------+-------------+--------------------+------+----+----------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65|1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|        PAVÓN|CA Boca Juniors (...|   169|  65|1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|      LANZINI|West Ham United F...|   167|  66|1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|       SALVIO|    SL Benfica (POR)|   167|  69|1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|        MESSI|  FC Barcelona (ESP)|   170|  72|1987-06-24|
+---------+------+-------+------------------+-------------+--------------------+------+----+----------+
only showing top 5 rows



In [26]:
# Bind a second name to the same DataFrame object; plain assignment makes no copy.
# NOTE(review): 'df2' does not say what this frame is for - consider a descriptive
# name once its purpose in the following cells is confirmed.
df2 = df

In [33]:
# List every distinct national team. Up to 32 rows are requested so that all
# teams are printed instead of being cut off at show()'s default row limit.
df.select('Selecao').distinct().show(n=32)

+--------------+
|       Selecao|
+--------------+
|        Russia|
|       Senegal|
|        Sweden|
|       IR Iran|
|       Germany|
|        France|
|     Argentina|
|       Belgium|
|          Peru|
|       Croatia|
|       Nigeria|
|Korea Republic|
|         Spain|
|       Denmark|
|       Morocco|
|        Panama|
|       Iceland|
|       Uruguay|
|        Mexico|
|       Tunisia|
|  Saudi Arabia|
|   Switzerland|
|        Brazil|
|         Japan|
|       England|
|        Poland|
|      Portugal|
|     Australia|
|    Costa Rica|
|         Egypt|
|        Serbia|
|      Colombia|
+--------------+

