In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql import Window
from pyspark.sql.functions import *

### Se inicia la sesión de Spark

In [2]:
# Obtain the active Spark session, or create one if none exists yet.
spark = (
    SparkSession.builder
    .config("spark.some.config.option", "some-values")
    .getOrCreate()
)

### Se carga el archivo demo.csv

In [3]:
# Load the source dataset; the first line of the file is treated as the header.
# No schema is supplied or inferred, so every column is read as a string.
df_demo = spark.read.option("header", "true").csv("data/demo.csv")

In [4]:
# Show the column names and types Spark assigned when reading demo.csv.
df_demo.printSchema()

root
 |-- airline_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- alias: string (nullable = true)
 |-- iata: string (nullable = true)
 |-- icao: string (nullable = true)
 |-- callsign: string (nullable = true)
 |-- country: string (nullable = true)
 |-- active: string (nullable = true)



### 1. Generar un diccionario a partir de la columna country (country_id,country)

### Se crea la tabla country

In [5]:
# Build the country dictionary: distinct, upper-cased, non-null country names.
# Nulls are removed before projecting, and the upper-cased column is aliased
# inside select() so that distinct() and orderBy() operate on the column that
# actually exists in the projection instead of relying on the analyzer to
# resolve the pre-projection name.
table_country = (
    df_demo
    .filter(F.col("country").isNotNull())
    .select(F.upper(F.col("country")).alias("country"))
    .distinct()
    .orderBy("country")
)

### Se agrega un identificador a la tabla

In [6]:
# Un-partitioned window ordered by country name; used to number the rows.
# Without partitionBy, Spark evaluates the window over a single partition.
window = Window.orderBy("country")

In [7]:
# Assign a sequential id (starting at 1) following the window's country order.
country_id_col = F.row_number().over(window)
table_country = table_country.withColumn('country_id', country_id_col)

In [8]:
# Check that the dictionary now carries both the country name and its id.
table_country.printSchema()

root
 |-- country: string (nullable = true)
 |-- country_id: integer (nullable = true)



In [9]:
# Reorder the columns so the identifier comes first: (country_id, country).
table_country = table_country.select(F.col("country_id"), F.col("country"))

### 2. Escribir el diccionario como CSV desde spark

### Se crea el archivo country.csv

In [10]:
# Spark writes a directory named data/country containing part files, not a
# single CSV file. mode="overwrite" keeps this cell re-runnable: the default
# save mode raises an error when the target path already exists.
table_country.write.csv('data/country', header="true", mode="overwrite")

### Se lee el archivo country.csv

In [12]:
# Read the dictionary back from the directory produced by the CSV write.
# Spark stores the output as part files inside data/country, and no cell in
# this notebook creates a file named country.csv, so the directory itself is
# loaded (Spark reads every part file in it).
df_country = spark.read.format("csv").option("header", "true").load("data/country")

### 3. Leer el CSV generado y hacer inner join con demo.csv

### Se crea el JOIN

In [13]:
# The dictionary stores country names upper-cased (table_country is built with
# F.upper), while df_demo keeps the original casing. Normalise the left side
# so that rows such as "Spain" match the dictionary entry "SPAIN".
condicion = [F.upper(df_demo.country) == df_country.country]

In [14]:
# Left outer join: every row of df_demo is kept, with dictionary columns left
# null where the join condition finds no match.
# NOTE(review): the section heading asks for an inner join — confirm which
# join type is intended. Both inputs have a `country` column, so the result
# contains two columns with that name.
df_join = df_demo.join(other=df_country, on=condicion, how="left_outer")

In [15]:
# Inspect the joined schema: the df_demo columns followed by the dictionary columns.
df_join.printSchema()

root
 |-- airline_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- alias: string (nullable = true)
 |-- iata: string (nullable = true)
 |-- icao: string (nullable = true)
 |-- callsign: string (nullable = true)
 |-- country: string (nullable = true)
 |-- active: string (nullable = true)
 |-- country_id: string (nullable = true)
 |-- country: string (nullable = true)



In [16]:
# Number of rows produced by the join.
df_join.count()

6162

In [17]:
# Number of rows in the original dataset, for comparison with the join result.
df_demo.count()

6162

### Se detiene la sesión de Spark

In [18]:
# Stop the Spark session created at the top of the notebook.
spark.stop()