In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
# Start (or reuse) a single-node local Spark session for this notebook.
# The surrounding parentheses already allow line breaks, so no backslashes are needed.
spark = (
    SparkSession.builder
    .master('local')
    .appName('df_fifa_world_cup_3')
    .getOrCreate()
)

In [4]:
# Load the player list; the first CSV row holds the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('arquivo/wc2018-players.csv')
)

In [5]:
# Inspect the inferred column names and types of the raw file.
df.printSchema()

root
 |-- Team: string (nullable = true)
 |-- #: integer (nullable = true)
 |-- Pos.: string (nullable = true)
 |-- FIFA Popular Name: string (nullable = true)
 |-- Birth Date: string (nullable = true)
 |-- Shirt Name: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)



In [6]:
# Original CSV header -> Portuguese column name used in the rest of the notebook.
novos_nomes = {
    'Team': 'Selecao',
    '#': 'Numero',
    'Pos.': 'Posicao',
    'FIFA Popular Name': 'Nome_FIFA',
    'Birth Date': 'Data_Nascimento',
    'Shirt Name': 'Nome_Camiseta',
    'Club': 'Time',
    'Height': 'Altura',
    'Weight': 'Peso',
}

# Apply the renames one by one, in the order listed above.
for nome_original, nome_novo in novos_nomes.items():
    df = df.withColumnRenamed(nome_original, nome_novo)

In [7]:
# Preview the first rows with the renamed columns.
df.show(5)

+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+
|  Selecao|Numero|Posicao|         Nome_FIFA|Data_Nascimento|Nome_Camiseta|                Time|Altura|Peso|
+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|     31.08.1992|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65|
|Argentina|    22|     MF|    PAVON Cristian|     21.01.1996|        PAVÓN|CA Boca Juniors (...|   169|  65|
|Argentina|    15|     MF|    LANZINI Manuel|     15.02.1993|      LANZINI|West Ham United F...|   167|  66|
|Argentina|    18|     DF|    SALVIO Eduardo|     13.07.1990|       SALVIO|    SL Benfica (POR)|   167|  69|
|Argentina|    10|     FW|      MESSI Lionel|     24.06.1987|        MESSI|  FC Barcelona (ESP)|   170|  72|
+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+
only showing top 5 

In [8]:
# Confirm that only the column names changed, not their types.
df.printSchema()

root
 |-- Selecao: string (nullable = true)
 |-- Numero: integer (nullable = true)
 |-- Posicao: string (nullable = true)
 |-- Nome_FIFA: string (nullable = true)
 |-- Data_Nascimento: string (nullable = true)
 |-- Nome_Camiseta: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Altura: integer (nullable = true)
 |-- Peso: integer (nullable = true)



In [9]:
# Data_Nascimento is a 'dd.MM.yyyy' string. A negative start position counts
# from the end of the string, so each slice picks one date component:
# last 4 chars -> year, 2 chars starting 7 from the end -> month,
# 2 chars starting 10 from the end -> day.
df = (
    df
    .withColumn('Ano', substring('Data_Nascimento', -4, 4))
    .withColumn('Mes', substring('Data_Nascimento', -7, 2))
    .withColumn('Dia', substring('Data_Nascimento', -10, 2))
)

In [10]:
# Check the extracted year / month / day string columns.
df.show(5)

+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+----+---+---+
|  Selecao|Numero|Posicao|         Nome_FIFA|Data_Nascimento|Nome_Camiseta|                Time|Altura|Peso| Ano|Mes|Dia|
+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+----+---+---+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|     31.08.1992|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65|1992| 08| 31|
|Argentina|    22|     MF|    PAVON Cristian|     21.01.1996|        PAVÓN|CA Boca Juniors (...|   169|  65|1996| 01| 21|
|Argentina|    15|     MF|    LANZINI Manuel|     15.02.1993|      LANZINI|West Ham United F...|   167|  66|1993| 02| 15|
|Argentina|    18|     DF|    SALVIO Eduardo|     13.07.1990|       SALVIO|    SL Benfica (POR)|   167|  69|1990| 07| 13|
|Argentina|    10|     FW|      MESSI Lionel|     24.06.1987|        MESSI|  FC Barcelona (ESP)|   170|  72|1987| 06| 24|
+---------+------+------

In [11]:
# Parse the original 'dd.MM.yyyy' string straight into a DateType column.
# to_date with an explicit pattern replaces the hand-rolled
# substring -> concat_ws -> cast chain, so this step no longer depends on the
# temporary Ano / Mes / Dia columns or on the string being exactly 10 characters.
df = df.withColumn('Nascimento', to_date(col('Data_Nascimento'), 'dd.MM.yyyy'))

In [12]:
# Preview the rows, then echo the DataFrame repr to confirm that
# 'Nascimento' has the date type.
df.show(5)
df

+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+----+---+---+----------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Data_Nascimento|Nome_Camiseta|                Time|Altura|Peso| Ano|Mes|Dia|Nascimento|
+---------+------+-------+------------------+---------------+-------------+--------------------+------+----+----+---+---+----------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|     31.08.1992|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65|1992| 08| 31|1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|     21.01.1996|        PAVÓN|CA Boca Juniors (...|   169|  65|1996| 01| 21|1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|     15.02.1993|      LANZINI|West Ham United F...|   167|  66|1993| 02| 15|1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|     13.07.1990|       SALVIO|    SL Benfica (POR)|   167|  69|1990| 07| 13|1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|     24.06.1987|        

DataFrame[Selecao: string, Numero: int, Posicao: string, Nome_FIFA: string, Data_Nascimento: string, Nome_Camiseta: string, Time: string, Altura: int, Peso: int, Ano: string, Mes: string, Dia: string, Nascimento: date]

In [13]:
# Remove the raw date string now that the typed 'Nascimento' column exists,
# then print the resulting schema.
df = df.drop('Data_Nascimento')
df.printSchema()

root
 |-- Selecao: string (nullable = true)
 |-- Numero: integer (nullable = true)
 |-- Posicao: string (nullable = true)
 |-- Nome_FIFA: string (nullable = true)
 |-- Nome_Camiseta: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Altura: integer (nullable = true)
 |-- Peso: integer (nullable = true)
 |-- Ano: string (nullable = true)
 |-- Mes: string (nullable = true)
 |-- Dia: string (nullable = true)
 |-- Nascimento: date (nullable = true)



In [14]:
# Remove the intermediate year / month / day string columns.
df = df.drop('Ano', 'Mes', 'Dia')
df

DataFrame[Selecao: string, Numero: int, Posicao: string, Nome_FIFA: string, Nome_Camiseta: string, Time: string, Altura: int, Peso: int, Nascimento: date]

In [15]:
# Preview the cleaned DataFrame.
df.show(5)

+---------+------+-------+------------------+-------------+--------------------+------+----+----------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome_Camiseta|                Time|Altura|Peso|Nascimento|
+---------+------+-------+------------------+-------------+--------------------+------+----+----------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65|1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|        PAVÓN|CA Boca Juniors (...|   169|  65|1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|      LANZINI|West Ham United F...|   167|  66|1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|       SALVIO|    SL Benfica (POR)|   167|  69|1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|        MESSI|  FC Barcelona (ESP)|   170|  72|1987-06-24|
+---------+------+-------+------------------+-------------+--------------------+------+----+----------+
only showing top 5 rows



In [16]:
# Second name bound to the same DataFrame object (no copy is made).
# NOTE(review): `df2` is not used anywhere in the visible part of the
# notebook — confirm whether it is still needed.
df2 = df

In [17]:
# List the distinct national teams (up to 32 rows).
df.select(col('Selecao')).distinct().show(32)

+--------------+
|       Selecao|
+--------------+
|        Russia|
|       Senegal|
|        Sweden|
|       IR Iran|
|       Germany|
|        France|
|     Argentina|
|       Belgium|
|          Peru|
|       Croatia|
|       Nigeria|
|Korea Republic|
|         Spain|
|       Denmark|
|       Morocco|
|        Panama|
|       Iceland|
|       Uruguay|
|        Mexico|
|       Tunisia|
|  Saudi Arabia|
|   Switzerland|
|        Brazil|
|         Japan|
|       England|
|        Poland|
|      Portugal|
|     Australia|
|    Costa Rica|
|         Egypt|
|        Serbia|
|      Colombia|
+--------------+



In [18]:
# Bring the distinct team names to the driver as a list of Row objects.
selecoes_distintas = df.select('Selecao').distinct()
lista = selecoes_distintas.collect()
lista

[Row(Selecao='Russia'),
 Row(Selecao='Senegal'),
 Row(Selecao='Sweden'),
 Row(Selecao='IR Iran'),
 Row(Selecao='Germany'),
 Row(Selecao='France'),
 Row(Selecao='Argentina'),
 Row(Selecao='Belgium'),
 Row(Selecao='Peru'),
 Row(Selecao='Croatia'),
 Row(Selecao='Nigeria'),
 Row(Selecao='Korea Republic'),
 Row(Selecao='Spain'),
 Row(Selecao='Denmark'),
 Row(Selecao='Morocco'),
 Row(Selecao='Panama'),
 Row(Selecao='Iceland'),
 Row(Selecao='Uruguay'),
 Row(Selecao='Mexico'),
 Row(Selecao='Tunisia'),
 Row(Selecao='Saudi Arabia'),
 Row(Selecao='Switzerland'),
 Row(Selecao='Brazil'),
 Row(Selecao='Japan'),
 Row(Selecao='England'),
 Row(Selecao='Poland'),
 Row(Selecao='Portugal'),
 Row(Selecao='Australia'),
 Row(Selecao='Costa Rica'),
 Row(Selecao='Egypt'),
 Row(Selecao='Serbia'),
 Row(Selecao='Colombia')]

In [19]:
# Type of the first field of the first collected Row.
type(lista[0][0])

str

In [20]:
# Rows are accessed by position like any Python list element.
lista[30]

Row(Selecao='Serbia')

In [21]:
# Unwrap each collected Row into its single string value (the team name).
paises = [linha[0] for linha in lista]

paises

['Russia',
 'Senegal',
 'Sweden',
 'IR Iran',
 'Germany',
 'France',
 'Argentina',
 'Belgium',
 'Peru',
 'Croatia',
 'Nigeria',
 'Korea Republic',
 'Spain',
 'Denmark',
 'Morocco',
 'Panama',
 'Iceland',
 'Uruguay',
 'Mexico',
 'Tunisia',
 'Saudi Arabia',
 'Switzerland',
 'Brazil',
 'Japan',
 'England',
 'Poland',
 'Portugal',
 'Australia',
 'Costa Rica',
 'Egypt',
 'Serbia',
 'Colombia']

In [22]:
# Label each player by whether they belong to the Argentina squad.
# The result is only displayed; `df` itself is not reassigned.
rotulo_argentina = (
    when(col('Selecao') == 'Argentina', 'Argentinos')
    .otherwise('Não Argentinos')
)
df.withColumn('Coluna_Nova', rotulo_argentina).show(50)

+---------+------+-------+------------------+-------------+--------------------+------+----+----------+--------------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nome_Camiseta|                Time|Altura|Peso|Nascimento|   Coluna_Nova|
+---------+------+-------+------------------+-------------+--------------------+------+----+----------+--------------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65|1992-08-31|    Argentinos|
|Argentina|    22|     MF|    PAVON Cristian|        PAVÓN|CA Boca Juniors (...|   169|  65|1996-01-21|    Argentinos|
|Argentina|    15|     MF|    LANZINI Manuel|      LANZINI|West Ham United F...|   167|  66|1993-02-15|    Argentinos|
|Argentina|    18|     DF|    SALVIO Eduardo|       SALVIO|    SL Benfica (POR)|   167|  69|1990-07-13|    Argentinos|
|Argentina|    10|     FW|      MESSI Lionel|        MESSI|  FC Barcelona (ESP)|   170|  72|1987-06-24|    Argentinos|
|Argentina|     4|     DF|  ANSALDI Cristian|   

In [23]:
# Team name -> continent lookup lists, used to build the 'Continente' column.
# Names must match the 'Selecao' values exactly (e.g. 'IR Iran', 'Korea Republic').
#
# Fix: 'Nigeria' was listed under `asia`; it is an African country, so it now
# lives in `africa`.
# NOTE(review): 'Ireland' is not among the 32 teams in the data set (harmless for
# the lookup, kept for backward compatibility).
# NOTE(review): 'Russia' is transcontinental and plays in the European football
# confederation — confirm whether 'Ásia' is the intended label.
europa        = ['Sweden', 'Germany', 'France', 'Belgium', 'Croatia', 'Spain', 'Denmark', 'Ireland', 'Switzerland', 'England', 'Poland', 'Portugal', 'Iceland', 'Serbia']
asia          = ['Russia', 'IR Iran', 'Korea Republic', 'Saudi Arabia', 'Japan']
africa        = ['Senegal', 'Morocco', 'Tunisia', 'Egypt', 'Nigeria']
oceania       = ['Australia']
america_norte = ['Panama', 'Mexico', 'Costa Rica']
america_sul   = ['Argentina', 'Peru', 'Uruguay', 'Brazil', 'Colombia']

In [24]:
# Map each team to its continent using the lookup lists; the first matching
# branch wins, and any team missing from every list is tagged 'Verificar'
# so it can be spotted and fixed.
selecao = col('Selecao')
continente = (
    when(selecao.isin(europa), 'Europa')
    .when(selecao.isin(asia), 'Ásia')
    .when(selecao.isin(africa), 'África')
    .when(selecao.isin(oceania), 'Oceania')
    .when(selecao.isin(america_norte), 'América do Norte')
    .when(selecao.isin(america_sul), 'América do Sul')
    .otherwise('Verificar')
)
df = df.withColumn('Continente', continente)

In [25]:
# Sanity check: an empty result means every team matched one of the continent lists.
df.filter('Continente == "Verificar"').show()

+-------+------+-------+---------+-------------+----+------+----+----------+----------+
|Selecao|Numero|Posicao|Nome_FIFA|Nome_Camiseta|Time|Altura|Peso|Nascimento|Continente|
+-------+------+-------+---------+-------------+----+------+----+----------+----------+
+-------+------+-------+---------+-------------+----+------+----+----------+----------+



In [34]:
# Keep only the players whose team was tagged as South American,
# then list which teams that covers.
df_america_sul = df.where(col('Continente') == 'América do Sul')
df_america_sul.select('Selecao').distinct().show()

+---------+
|  Selecao|
+---------+
|Argentina|
|     Peru|
|  Uruguay|
|   Brazil|
| Colombia|
+---------+



In [35]:
# Keep only the players whose team was tagged as North American,
# then list which teams that covers.
df_america_norte = df.where(col('Continente') == 'América do Norte')
df_america_norte.select('Selecao').distinct().show()

+----------+
|   Selecao|
+----------+
|    Panama|
|    Mexico|
|Costa Rica|
+----------+



In [31]:
# Stack the South American rows on top of the North American rows
# (union matches columns by position; both inputs are filters of `df`).
# NOTE(review): this cell's execution count (31) is lower than that of the cells
# defining its inputs (34, 35) — re-run the notebook top to bottom to confirm.
df_americas = df_america_sul.union(df_america_norte)

In [37]:
# Teams present in the combined Americas DataFrame.
df_americas.select('Selecao').distinct().show()

+----------+
|   Selecao|
+----------+
| Argentina|
|      Peru|
|   Uruguay|
|    Brazil|
|  Colombia|
|    Panama|
|    Mexico|
|Costa Rica|
+----------+



In [60]:
# Show the Brazil squad (SQL-string filter).
df.filter('Selecao == "Brazil"').show()

+-------+------+-------+-----------------+-------------+--------------------+------+----+----------+--------------+
|Selecao|Numero|Posicao|        Nome_FIFA|Nome_Camiseta|                Time|Altura|Peso|Nascimento|    Continente|
+-------+------+-------+-----------------+-------------+--------------------+------+----+----------+--------------+
| Brazil|    18|     MF|             FRED|         FRED|FC Shakhtar Donet...|   169|  64|1993-03-05|América do Sul|
| Brazil|    21|     FW|           TAISON|       TAISON|FC Shakhtar Donet...|   172|  64|1988-01-13|América do Sul|
| Brazil|    17|     MF|      FERNANDINHO|  FERNANDINHO|Manchester City F...|   179|  67|1985-05-04|América do Sul|
| Brazil|    22|     DF|           FAGNER|       FAGNER|SC Corinthians (BRA)|   168|  67|1989-06-11|América do Sul|
| Brazil|    10|     FW|           NEYMAR|    NEYMAR JR|Paris Saint-Germa...|   175|  68|1992-02-05|América do Sul|
| Brazil|    11|     MF|PHILIPPE COUTINHO|  P. COUTINHO|  FC Barcelona (

In [90]:
# One DataFrame per squad, used as the two sides of the join examples.
selecao = col('Selecao')
arg = df.where(selecao == "Argentina")
bra = df.where(selecao == "Brazil")

In [91]:
# Trim both squads to the same few columns so the join output stays readable.
colunas_descartadas = ('Time', 'Continente', 'Peso', 'Nascimento', 'Nome_FIFA')
arg = arg.drop(*colunas_descartadas)
bra = bra.drop(*colunas_descartadas)

In [92]:
# Preview the trimmed Argentina squad.
arg.show(5)

+---------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome_Camiseta|Altura|
+---------+------+-------+-------------+------+
|Argentina|     3|     DF|   TAGLIAFICO|   169|
|Argentina|    22|     MF|        PAVÓN|   169|
|Argentina|    15|     MF|      LANZINI|   167|
|Argentina|    18|     DF|       SALVIO|   167|
|Argentina|    10|     FW|        MESSI|   170|
+---------+------+-------+-------------+------+
only showing top 5 rows



In [93]:
# Preview the trimmed Brazil squad.
bra.show(5)

+-------+------+-------+-------------+------+
|Selecao|Numero|Posicao|Nome_Camiseta|Altura|
+-------+------+-------+-------------+------+
| Brazil|    18|     MF|         FRED|   169|
| Brazil|    21|     FW|       TAISON|   172|
| Brazil|    17|     MF|  FERNANDINHO|   179|
| Brazil|    22|     DF|       FAGNER|   168|
| Brazil|    10|     FW|    NEYMAR JR|   175|
+-------+------+-------+-------------+------+
only showing top 5 rows



#### ***Join simples***

In [94]:
# Join with no explicit type (Spark defaults to an inner join): pair up the
# Argentine and Brazilian players that wear the same shirt number.
mesmo_numero = arg.Numero == bra.Numero
dfnovo = arg.join(bra, on=mesmo_numero)
dfnovo.show(23)

+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome_Camiseta|Altura|Selecao|Numero|Posicao|Nome_Camiseta|Altura|
+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|Argentina|     3|     DF|   TAGLIAFICO|   169| Brazil|     3|     DF|      MIRANDA|   186|
|Argentina|    22|     MF|        PAVÓN|   169| Brazil|    22|     DF|       FAGNER|   168|
|Argentina|    15|     MF|      LANZINI|   167| Brazil|    15|     MF|     PAULINHO|   181|
|Argentina|    18|     DF|       SALVIO|   167| Brazil|    18|     MF|         FRED|   169|
|Argentina|    10|     FW|        MESSI|   170| Brazil|    10|     FW|    NEYMAR JR|   175|
|Argentina|     4|     DF|      ANSALDI|   181| Brazil|     4|     DF|      GEROMEL|   190|
|Argentina|     5|     MF|       BIGLIA|   175| Brazil|     5|     MF|     CASEMIRO|   185|
|Argentina|     7|     MF|       BANEGA|   175| Brazil|     7|     FW|     D. CO

In [98]:
# Shift every Argentine shirt number up by one so the two squads no longer
# share exactly the same set of numbers (makes the join types below differ).
#
# Rebuilt from `df` rather than from `arg` itself: the previous
# `arg = arg.withColumn('Numero', col('Numero') + 1)` added another +1 on every
# re-run of the cell, silently changing the result of all joins that follow.
arg = (
    df.filter(col('Selecao') == 'Argentina')
    .drop('Time', 'Continente', 'Peso', 'Nascimento', 'Nome_FIFA')
    .withColumn('Numero', col('Numero') + 1)
)
arg.show(23)

+---------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome_Camiseta|Altura|
+---------+------+-------+-------------+------+
|Argentina|     4|     DF|   TAGLIAFICO|   169|
|Argentina|    23|     MF|        PAVÓN|   169|
|Argentina|    16|     MF|      LANZINI|   167|
|Argentina|    19|     DF|       SALVIO|   167|
|Argentina|    11|     FW|        MESSI|   170|
|Argentina|     5|     DF|      ANSALDI|   181|
|Argentina|     6|     MF|       BIGLIA|   175|
|Argentina|     8|     MF|       BANEGA|   175|
|Argentina|    15|     DF|   MASCHERANO|   174|
|Argentina|    22|     FW|       DYBALA|   177|
|Argentina|    20|     FW|       AGÜERO|   172|
|Argentina|    10|     FW|      HIGUAÍN|   184|
|Argentina|    12|     MF|     DI MARÍA|   178|
|Argentina|    21|     MF|     LO CELSO|   177|
|Argentina|    14|     MF|         MEZA|   180|
|Argentina|     9|     DF|        ACUÑA|   172|
|Argentina|    24|     GK|    CABALLERO|   186|
|Argentina|     3|     DF|      MERCADO|

#### ***Inner Join***

In [103]:
# Inner join: only shirt numbers present in both squads are kept.
mesmo_numero = arg.Numero == bra.Numero
dfnovo = arg.join(bra, on=mesmo_numero, how='inner')
dfnovo.show(23)

+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome_Camiseta|Altura|Selecao|Numero|Posicao|Nome_Camiseta|Altura|
+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|Argentina|     4|     DF|   TAGLIAFICO|   169| Brazil|     4|     DF|      GEROMEL|   190|
|Argentina|    23|     MF|        PAVÓN|   169| Brazil|    23|     GK|      EDERSON|   188|
|Argentina|    16|     MF|      LANZINI|   167| Brazil|    16|     GK|       CASSIO|   195|
|Argentina|    19|     DF|       SALVIO|   167| Brazil|    19|     MF|      WILLIAN|   175|
|Argentina|    11|     FW|        MESSI|   170| Brazil|    11|     MF|  P. COUTINHO|   172|
|Argentina|     5|     DF|      ANSALDI|   181| Brazil|     5|     MF|     CASEMIRO|   185|
|Argentina|     6|     MF|       BIGLIA|   175| Brazil|     6|     DF|  FILIPE LUIS|   182|
|Argentina|     8|     MF|       BANEGA|   175| Brazil|     8|     MF|   R. AUGU

#### ***Left Join***

In [105]:
# Left join: every Argentina row is kept; Brazil columns are filled only
# where the shirt number matches.
mesmo_numero = arg['Numero'] == bra['Numero']
dfnovo = arg.join(bra, on=mesmo_numero, how='left')
dfnovo.show(23)

+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome_Camiseta|Altura|Selecao|Numero|Posicao|Nome_Camiseta|Altura|
+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|Argentina|     4|     DF|   TAGLIAFICO|   169| Brazil|     4|     DF|      GEROMEL|   190|
|Argentina|    23|     MF|        PAVÓN|   169| Brazil|    23|     GK|      EDERSON|   188|
|Argentina|    16|     MF|      LANZINI|   167| Brazil|    16|     GK|       CASSIO|   195|
|Argentina|    19|     DF|       SALVIO|   167| Brazil|    19|     MF|      WILLIAN|   175|
|Argentina|    11|     FW|        MESSI|   170| Brazil|    11|     MF|  P. COUTINHO|   172|
|Argentina|     5|     DF|      ANSALDI|   181| Brazil|     5|     MF|     CASEMIRO|   185|
|Argentina|     6|     MF|       BIGLIA|   175| Brazil|     6|     DF|  FILIPE LUIS|   182|
|Argentina|     8|     MF|       BANEGA|   175| Brazil|     8|     MF|   R. AUGU

#### ***Right Join***

In [107]:
# Right join: every Brazil row is kept; Argentina columns are filled only
# where the shirt number matches.
mesmo_numero = arg['Numero'] == bra['Numero']
dfnovo = arg.join(bra, on=mesmo_numero, how='right')
dfnovo.show(23)

+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome_Camiseta|Altura|Selecao|Numero|Posicao|Nome_Camiseta|Altura|
+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|Argentina|    18|     DF|     OTAMENDI|   181| Brazil|    18|     MF|         FRED|   169|
|Argentina|    21|     MF|     LO CELSO|   177| Brazil|    21|     FW|       TAISON|   172|
|Argentina|    17|     DF|         ROJO|   189| Brazil|    17|     MF|  FERNANDINHO|   179|
|Argentina|    22|     FW|       DYBALA|   177| Brazil|    22|     DF|       FAGNER|   168|
|Argentina|    10|     FW|      HIGUAÍN|   184| Brazil|    10|     FW|    NEYMAR JR|   175|
|Argentina|    11|     FW|        MESSI|   170| Brazil|    11|     MF|  P. COUTINHO|   172|
|Argentina|     7|     DF|        FAZIO|   199| Brazil|     7|     FW|     D. COSTA|   182|
|Argentina|     6|     MF|       BIGLIA|   175| Brazil|     6|     DF|  FILIPE L

#### ***Full Join***

In [108]:
# Full outer join: rows from both squads are kept, with NULLs on the side
# that has no player for a given shirt number.
mesmo_numero = arg['Numero'] == bra['Numero']
dfnovo = arg.join(bra, on=mesmo_numero, how='full')
dfnovo.show(23)

+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome_Camiseta|Altura|Selecao|Numero|Posicao|Nome_Camiseta|Altura|
+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|     NULL|  NULL|   NULL|         NULL|  NULL| Brazil|     1|     GK|    A. BECKER|   193|
|Argentina|     2|     GK|       GUZMÁN|   192| Brazil|     2|     DF|     T. SILVA|   183|
|Argentina|     3|     DF|      MERCADO|   181| Brazil|     3|     DF|      MIRANDA|   186|
|Argentina|     4|     DF|   TAGLIAFICO|   169| Brazil|     4|     DF|      GEROMEL|   190|
|Argentina|     5|     DF|      ANSALDI|   181| Brazil|     5|     MF|     CASEMIRO|   185|
|Argentina|     6|     MF|       BIGLIA|   175| Brazil|     6|     DF|  FILIPE LUIS|   182|
|Argentina|     7|     DF|        FAZIO|   199| Brazil|     7|     FW|     D. COSTA|   182|
|Argentina|     8|     MF|       BANEGA|   175| Brazil|     8|     MF|   R. AUGU

#### ***Semi Join***

In [109]:
# Semi join: Argentina rows whose shirt number also exists in the Brazil squad;
# only the Argentina columns are returned.
mesmo_numero = arg['Numero'] == bra['Numero']
dfnovo = arg.join(bra, on=mesmo_numero, how='semi')
dfnovo.show(23)

+---------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome_Camiseta|Altura|
+---------+------+-------+-------------+------+
|Argentina|     4|     DF|   TAGLIAFICO|   169|
|Argentina|    23|     MF|        PAVÓN|   169|
|Argentina|    16|     MF|      LANZINI|   167|
|Argentina|    19|     DF|       SALVIO|   167|
|Argentina|    11|     FW|        MESSI|   170|
|Argentina|     5|     DF|      ANSALDI|   181|
|Argentina|     6|     MF|       BIGLIA|   175|
|Argentina|     8|     MF|       BANEGA|   175|
|Argentina|    15|     DF|   MASCHERANO|   174|
|Argentina|    22|     FW|       DYBALA|   177|
|Argentina|    20|     FW|       AGÜERO|   172|
|Argentina|    10|     FW|      HIGUAÍN|   184|
|Argentina|    12|     MF|     DI MARÍA|   178|
|Argentina|    21|     MF|     LO CELSO|   177|
|Argentina|    14|     MF|         MEZA|   180|
|Argentina|     9|     DF|        ACUÑA|   172|
|Argentina|     3|     DF|      MERCADO|   181|
|Argentina|    18|     DF|     OTAMENDI|

#### ***Anti Join***

In [111]:
# Anti join: Brazil rows whose shirt number has no match in the Argentina squad;
# only the Brazil columns are returned.
mesmo_numero = arg['Numero'] == bra['Numero']
dfnovo = bra.join(arg, on=mesmo_numero, how='anti')
dfnovo.show(23)

+-------+------+-------+-------------+------+
|Selecao|Numero|Posicao|Nome_Camiseta|Altura|
+-------+------+-------+-------------+------+
| Brazil|     1|     GK|    A. BECKER|   193|
+-------+------+-------+-------------+------+

