# inferSchema


## importaciones
Nota: estas importaciones son las necesarias para este notebook. Si quiere ver las importaciones que requiere la función específica, vaya al archivo original [inferSchema](../features/inferSchema.py).

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.dataframe import DataFrame
import re 
# Spark session shared by every cell below (obtained through the builder's getOrCreate).
spark = SparkSession.builder.getOrCreate()



## Dataframes y validaciones 
Estos son dataframes de testeo obtenidos de internet. 
Las validaciones son una pieza importante de la parte técnica de la librería. Para más detalle vea las [validaciones](../validaciones/validaciones.py)

In [2]:
def _leer_csv(ruta):
    """Read the CSV file at `ruta`, treating its first line as the header."""
    return spark.read.csv(ruta, header=True)


# Test datasets used throughout the notebook, one DataFrame per CSV file.
df_cars = _leer_csv('csv/used_cars_data.csv')
df_countries = _leer_csv('csv/countries.csv')
df_dates = _leer_csv('csv/US_Holiday_Dates_(2004-2021).csv')
df_cop = _leer_csv('csv/eurocup_2020_results.csv')
df_test = _leer_csv('csv/Countries_usefulFeatures.csv')
df_ernigs = _leer_csv('csv/all_earnings_dates.csv')
df_null = _leer_csv('csv/cars_null.csv')
df_seph = _leer_csv('csv/sephora_website_dataset.csv')
df_tw = _leer_csv('csv/most_followed_twitter.csv')
df_airlines = _leer_csv('csv/airlines.csv')

def is_dataframe(dataframe):
    """Validate that `dataframe` is a pyspark DataFrame.

    The previous version raised a TypeError and caught it in the same
    function, so callers could not tell whether the check had passed.
    The console message is unchanged; the outcome is now also returned.

    Args:
        dataframe: Object to validate.

    Returns:
        bool: True when `dataframe` is a DataFrame, False otherwise.
        This function never raises on a wrong type; it only reports it.
    """
    if isinstance(dataframe, DataFrame):
        return True
    # Same text the original printed via the caught TypeError.
    print("An error occurred: ", f"Expected a DataFrame, got {type(dataframe)}")
    return False
def sel_num_cols(dataframe):
    """Return the names of the numeric columns of `dataframe`.

    A column is numeric when its dtype string is one of bigint, double, int,
    smallint or float, or when it is any decimal type. Decimal dtypes carry
    their precision and scale (for example "decimal(12,2)"), so they are
    matched by prefix; comparing against DecimalType().simpleString() only
    recognised the default "decimal(10,0)".

    Args:
        dataframe: Object exposing `dtypes` as an iterable of
            (column name, dtype string) pairs, as a pyspark DataFrame does.

    Returns:
        list: Numeric column names, in the order they appear in `dtypes`.
    """
    # simpleString() names of LongType, DoubleType, IntegerType, ShortType, FloatType.
    tipos_numericos = frozenset(("bigint", "double", "int", "smallint", "float"))
    return [
        columna
        for columna, dtype in dataframe.dtypes
        if dtype in tipos_numericos or dtype.startswith("decimal")
    ]

## InferSchema


In [3]:
# Largest value that fits Spark's 32-bit IntegerType; bigger samples become LongType.
_INT32_MAX = 2 ** 31 - 1

# Separators stripped from dotted values (e.g. "11.07.2021") before the integer cast.
_NON_ALNUM_PATTERN = "[^a-zA-Z0-9]+"


def _infer_column_cast(columna, first_value):
    """Decide how one column should be cast, based on its sampled first value.

    Args:
        columna (str): Column name (the name "Date" gets special handling).
        first_value (str | None): First value of the column, already cast to string.

    Returns:
        tuple: (target_type, label, strip_separators) where `target_type` is a
        Spark DataType or None when the column must stay as it is, `label` is
        the type name to log (None when nothing is logged) and
        `strip_separators` tells the caller to remove non-alphanumeric
        characters before casting.
    """
    keep_string = (None, "string", False)

    # A null sample gives no evidence about the type: leave the column as string.
    if first_value is None:
        return keep_string

    # Must run before the letter check, otherwise "True"/"False" are always
    # classified as plain strings and the boolean cast is unreachable.
    if first_value.strip().lower() in ("true", "false"):
        return BooleanType(), "bool", False

    if contains_letter(first_value):
        return keep_string

    dots = first_value.count(".")

    if columna == "Date":
        if "-" in first_value:
            return DateType(), "date", False
        if dots == 2:
            return IntegerType(), "integer", True
        # Any other "Date" sample is left untouched and is not logged.
        return None, None, False

    if dots:
        if dots == 2:
            # The cast target is an integer, so it is logged as such.
            return IntegerType(), "integer", True
        if dots == 1:
            return DoubleType(), "double", False
        return keep_string

    if "-" in first_value:
        # Only two dashes count as a date; a single dash (e.g. "-1" or "-")
        # keeps the column as string.
        if first_value.count("-") == 2:
            return DateType(), "date", False
        return keep_string

    if "/" in first_value:
        if first_value.count("/") == 2:
            # NOTE(review): Spark's string->date cast presumably expects an
            # ISO-like layout; slash-separated values may come out null — confirm.
            return DateType(), "date", False
        return keep_string

    if first_value.isnumeric():
        try:
            cabe_en_int = int(first_value) <= _INT32_MAX
        except ValueError:
            # e.g. "²" is "numeric" for str.isnumeric() but is not an integer literal.
            return keep_string
        if cabe_en_int:
            return IntegerType(), "integer", False
        return LongType(), "long", False

    return keep_string


def inferSchema(dataframe):
    """Infer and apply a data type for every column of `dataframe`.

    Every column is first cast to string. The first row is then fetched once
    and each column's value in that row is inspected character-wise (letters,
    dots, dashes, slashes, digits) to choose the target type. For each column
    a line "<column> | string -> <type>" is printed.

    Differences from the previous implementation:
      * a null first value (or an empty DataFrame) keeps the column as string
        instead of aborting the whole inference;
      * "True"/"False" samples are cast to boolean (that branch used to be
        unreachable);
      * integers are cast to long when the sample exceeds the 32-bit range
        (the old try/except around the lazy cast could never trigger);
      * dotted values cast to integer are logged as "integer", not "date".

    Args:
        dataframe (pyspark.sql.dataframe.DataFrame): DataFrame whose column
            types should be inferred.

    Returns:
        pyspark.sql.dataframe.DataFrame | None: DataFrame with the inferred
        column types, or None when an error occurred (the error is printed).
    """
    is_dataframe(dataframe)
    try:
        dataframe = dataframe.select([col(c).cast("string") for c in dataframe.columns])
        columnas = dataframe.columns

        # One Spark job for the whole row instead of one job per column.
        first_row = dataframe.first()
        muestras = first_row if first_row is not None else [None for _ in columnas]

        for columna, first_value in zip(columnas, muestras):
            tipo, etiqueta, limpiar = _infer_column_cast(columna, first_value)
            if tipo is not None:
                origen = col(columna)
                if limpiar:
                    origen = regexp_replace(origen, _NON_ALNUM_PATTERN, "")
                dataframe = dataframe.withColumn(columna, origen.cast(tipo))
            if etiqueta is not None:
                print(f'{columna} | string -> {etiqueta}')
        return dataframe
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print('Ha ocurrido un erro al momento de inferir el schema: ', e)
        return None

def contains_letter(string):
    """Tell whether `string` holds at least one ASCII letter (a-z or A-Z)."""
    coincidencia = re.search("[a-zA-Z]", string)
    return coincidencia is not None


  



## Testeo 
Se va a inferir el schema de cada dataframe y luego veremos el detalle del dataframe y sus tipos de datos 

In [4]:
# Run the inference over each test dataset, in this exact order, so the
# per-column log lines are printed dataset by dataset.
_datasets_de_prueba = (
    df_cars,
    df_countries,
    df_ernigs,
    df_dates,
    df_cop,
    df_tw,
    df_airlines,
    df_seph,
)
(
    df_infer1,
    df_infer2,
    df_infer3,
    df_infer4,
    df_infer5,
    df_infer6,
    df_infer7,
    df_infer8,
) = [inferSchema(_dataset) for _dataset in _datasets_de_prueba]

_c0 | string -> integer
brand | string -> string
model | string -> string
price (eur) | string -> integer
engine | string -> string
year | string -> integer
mileage (kms) | string -> integer
fuel | string -> string
gearbox | string -> string
location | string -> string
county_FIPS | string -> integer
state | string -> string
county | string -> string
per_capita_personal_income_2019 | string -> integer
per_capita_personal_income_2020 | string -> integer
per_capita_personal_income_2021 | string -> integer
associate_degree_numbers_2016_2020 | string -> integer
bachelor_degree_numbers_2016_2020 | string -> integer
associate_degree_percentage_2016_2020 | string -> double
bachelor_degree_percentage_2015_2019 | string -> double
filingDate | string -> date
reportDate | string -> date
ticker | string -> string
form | string -> string
Date | string -> date
Holiday | string -> string
WeekDay | string -> string
Month | string -> integer
Day | string -> integer
Year | string -> integer
stage | stri

In [5]:
# Used-cars dataset after inference: preview two rows, then print the resulting schema.
df_infer1.show(2)
df_infer1.printSchema()

+---+-------+-----+-----------+--------------------+----+-------------+--------+-------+----------+
|_c0|  brand|model|price (eur)|              engine|year|mileage (kms)|    fuel|gearbox|  location|
+---+-------+-----+-----------+--------------------+----+-------------+--------+-------+----------+
|  0|   SEAT|Ibiza|       8990|SC 1.2 TSI 90cv S...|2016|        67000|Gasolina| Manual|Granollers|
|  1|Hyundai|  i30|       9990|1.6 CRDi 110cv Tecno|2014|       104868|  Diésel| Manual|Viladecans|
+---+-------+-----+-----------+--------------------+----+-------------+--------+-------+----------+
only showing top 2 rows

root
 |-- _c0: integer (nullable = true)
 |-- brand: string (nullable = true)
 |-- model: string (nullable = true)
 |-- price (eur): integer (nullable = true)
 |-- engine: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- mileage (kms): integer (nullable = true)
 |-- fuel: string (nullable = true)
 |-- gearbox: string (nullable = true)
 |-- location: strin

In [6]:
# Countries dataset after inference: preview two rows, then print the resulting schema.
df_infer2.show(2)
df_infer2.printSchema()

+-----------+-----+--------------+-------------------------------+-------------------------------+-------------------------------+----------------------------------+---------------------------------+-------------------------------------+------------------------------------+
|county_FIPS|state|        county|per_capita_personal_income_2019|per_capita_personal_income_2020|per_capita_personal_income_2021|associate_degree_numbers_2016_2020|bachelor_degree_numbers_2016_2020|associate_degree_percentage_2016_2020|bachelor_degree_percentage_2015_2019|
+-----------+-----+--------------+-------------------------------+-------------------------------+-------------------------------+----------------------------------+---------------------------------+-------------------------------------+------------------------------------+
|      51013|   VA| Arlington, VA|                          97629|                         100687|                         107603|                             19573|          

In [7]:
# Earnings-dates dataset after inference: preview two rows, then print the resulting schema.
# Values such as "2008-07-11" contain two dashes and no letters, so they take the date branch.
df_infer3.show(2)
df_infer3.printSchema()

+----------+----------+------+----+
|filingDate|reportDate|ticker|form|
+----------+----------+------+----+
|2008-07-11|2008-05-31|   AIR|10-K|
|2007-07-20|2007-05-31|   AIR|10-K|
+----------+----------+------+----+
only showing top 2 rows

root
 |-- filingDate: date (nullable = true)
 |-- reportDate: date (nullable = true)
 |-- ticker: string (nullable = true)
 |-- form: string (nullable = true)



In [8]:
# US holiday dataset after inference: preview two rows, then print the resulting schema.
# The column named "Date" is handled by inferSchema's dedicated "Date" branch.
df_infer4.show(2)
df_infer4.printSchema()

+----------+-----------+-------+-----+---+----+
|      Date|    Holiday|WeekDay|Month|Day|Year|
+----------+-----------+-------+-----+---+----+
|2004-07-04|4th of July| Sunday|    7|  4|2004|
|2005-07-04|4th of July| Monday|    7|  4|2005|
+----------+-----------+-------+-----+---+----+
only showing top 2 rows

root
 |-- Date: date (nullable = true)
 |-- Holiday: string (nullable = true)
 |-- WeekDay: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Year: integer (nullable = true)



In [9]:
# Euro 2020 results dataset after inference: preview two rows, then print the resulting schema.
df_infer5.show(2)
df_infer5.printSchema()

+-------------+--------+-----+---------------+---------------+--------------+--------------+---------------+---------------+---------------+---------------+----------------+----------------+--------------------+--------------------+--------------+--------------+--------------------+--------------------+--------------------+
|        stage|    date| pens|pens_home_score|pens_away_score|team_name_home|team_name_away|team_home_score|team_away_score|possession_home|possession_away|total_shots_home|total_shots_away|shots_on_target_home|shots_on_target_away|duels_won_home|duels_won_away|         events_list|         lineup_home|         lineup_away|
+-------------+--------+-----+---------------+---------------+--------------+--------------+---------------+---------------+---------------+---------------+----------------+----------------+--------------------+--------------------+--------------+--------------+--------------------+--------------------+--------------------+
|       Final |1107202

In [10]:
# Most-followed Twitter accounts after inference: preview two rows, then print the schema.
# "Followers(in M)" starts with "133.4" (a single dot, no letters), hence the double type.
df_infer6.show(2)
df_infer6.printSchema()

+----+------------+-------------+------------------+---------------+--------------------+-------------+
|Rank|Account Name|Account Owner|Brand Account(Y/N)|Followers(in M)|          Occupation|      Country|
+----+------------+-------------+------------------+---------------+--------------------+-------------+
|   1|@BarackObama| Barack Obama|                 N|          133.4|44th President of...|United States|
|   2|   @elonmusk|    Elon Musk|                 N|          123.3|Business magnate,...| South Africa|
+----+------------+-------------+------------------+---------------+--------------------+-------------+
only showing top 2 rows

root
 |-- Rank: integer (nullable = true)
 |-- Account Name: string (nullable = true)
 |-- Account Owner: string (nullable = true)
 |-- Brand Account(Y/N): string (nullable = true)
 |-- Followers(in M): double (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Country: string (nullable = true)



In [11]:
# Airlines dataset after inference: preview two rows, then print the resulting schema.
# "Airline ID" stays string: its first value "-1" has a single "-", which inferSchema's
# dash branch treats as a non-date string.
df_infer7.show(2)
df_infer7.printSchema()

+----------+--------------+-----+----+----+--------+-------+------+
|Airline ID|          Name|Alias|IATA|ICAO|Callsign|Country|Active|
+----------+--------------+-----+----+----+--------+-------+------+
|        -1|       Unknown|   \N|   -| N/A|      \N|     \N|     Y|
|         1|Private flight|   \N|   -| N/A|    null|   null|     Y|
+----------+--------------+-----+----+----+--------+-------+------+
only showing top 2 rows

root
 |-- Airline ID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Alias: string (nullable = true)
 |-- IATA: string (nullable = true)
 |-- ICAO: string (nullable = true)
 |-- Callsign: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Active: string (nullable = true)



In [12]:
# Sephora catalogue dataset after inference: preview two rows, then print the resulting schema.
df_infer8.show(2)
df_infer8.printSchema()

+-------+--------------+---------+--------------------+--------------+------+-----------------+----+-----+-----------+--------------------+--------------+----------------------+--------------------+--------------------+--------------------+--------------------+-----------+---------+---------------+------------------+
|     id|         brand| category|                name|          size|rating|number_of_reviews|love|price|value_price|                 URL|MarketingFlags|MarketingFlags_content|             options|             details|          how_to_use|         ingredients|online_only|exclusive|limited_edition|limited_time_offer|
+-------+--------------+---------+--------------------+--------------+------+-----------------+----+-----+-----------+--------------------+--------------+----------------------+--------------------+--------------------+--------------------+--------------------+-----------+---------+---------------+------------------+
|2218774|Acqua Di Parma|Fragrance|Blu Medit