# count_group
Notebook con ejemplos de uso de la función count_group (definida en este notebook como `group_count`).

## Importaciones necesarias 
Nota: estas importaciones son las que necesita este notebook. Para ver las importaciones que requiere la función en sí, consulte el archivo original [count_group](../features/conteo.py)

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.dataframe import DataFrame
# Obtain the SparkSession used by every cell below (getOrCreate reuses an existing session if there is one).
spark = SparkSession.builder.getOrCreate()

## Dataframes y validaciones
Estos son dataframes de testeo obtenidos de internet. 
Las validaciones son una pieza importante de la parte técnica de la librería. Para más detalle vea las [validaciones](../validaciones/validaciones.py)


In [10]:

# Options shared by every read: the first row holds the column names and
# column types are inferred from the data.
_csv_opts = {'header': True, 'inferSchema': True}

df_cars = spark.read.csv('csv/used_cars_data.csv', **_csv_opts)
df_countries = spark.read.csv('csv/countries.csv', **_csv_opts)
df_dates = spark.read.csv('csv/US_Holiday_Dates_(2004-2021).csv', **_csv_opts)
df_cop = spark.read.csv('csv/eurocup_2020_results.csv', **_csv_opts)
df_test = spark.read.csv('csv/Countries_usefulFeatures.csv', **_csv_opts)
df_ernigs = spark.read.csv('csv/all_earnings_dates.csv', **_csv_opts)
df_null = spark.read.csv('csv/cars_null.csv', **_csv_opts)
df_seph = spark.read.csv('csv/sephora_website_dataset.csv', **_csv_opts)
df_tw = spark.read.csv('csv/most_followed_twitter.csv', **_csv_opts)
df_airlines = spark.read.csv('csv/airlines.csv', **_csv_opts)

def is_dataframe(dataframe):
    """Report on stdout when the argument is not a PySpark DataFrame.

    The TypeError built for a wrong argument is caught inside this function
    and printed, so nothing propagates to the caller and the function
    always returns None.

    Parameters:
    dataframe: Object to check.
    """
    try:
        if isinstance(dataframe, DataFrame):
            return
        raise TypeError(f"Expected a DataFrame, got {type(dataframe)}")
    except Exception as error:
        print("An error occurred: ", error)
def sel_num_cols(dataframe):
    """Return the names of the numeric columns of a DataFrame.

    A column counts as numeric when its type name in ``dataframe.dtypes`` is
    bigint, double, int, smallint or float, or any decimal type.

    Parameters:
    dataframe (DataFrame): PySpark DataFrame whose ``dtypes`` are inspected.
    Returns:
    list: Numeric column names, in the order they appear in ``dtypes``.
    """
    # Type names reported by LongType, DoubleType, IntegerType, ShortType and
    # FloatType through simpleString().
    exact_numeric_types = {"bigint", "double", "int", "smallint", "float"}
    return [
        column_name
        for column_name, dtype in dataframe.dtypes
        # Decimal type names embed precision and scale ("decimal(12,2)"), so
        # an exact match against the default "decimal(10,0)" would miss every
        # other decimal column; match by prefix instead.
        if dtype in exact_numeric_types or dtype.startswith("decimal")
    ]
def df_has_numtype(dataframe):
    """Tell whether a DataFrame has at least one numeric column.

    A column counts as numeric when its type name in ``dataframe.dtypes`` is
    bigint, double, int, smallint or float, or any decimal type.

    Parameters:
    dataframe (DataFrame): PySpark DataFrame whose ``dtypes`` are inspected.
    Returns:
    bool: True if a numeric column exists, False otherwise. If an error
    occurs it is printed and None is returned.
    """
    try:
        is_dataframe(dataframe)
        # Type names reported by LongType, DoubleType, IntegerType, ShortType
        # and FloatType through simpleString().
        exact_numeric_types = {"bigint", "double", "int", "smallint", "float"}
        # Decimal type names embed precision and scale ("decimal(12,2)"), so
        # they are matched by prefix rather than against "decimal(10,0)" only.
        return any(
            dtype in exact_numeric_types or dtype.startswith("decimal")
            for _, dtype in dataframe.dtypes
        )
    except Exception as e:
        print("An error occurred: ", e)
def df_has_null(dataframe):
    """Find the first column of a DataFrame that contains a null value.

    Columns are checked in the order of ``dataframe.columns``; each check
    filters the DataFrame on ``isnull`` for that column and counts the rows.

    Parameters:
    dataframe (DataFrame): PySpark DataFrame to inspect.
    Returns:
    tuple: ``(True, column_name)`` for the first column with a null, or
    ``(False, None)`` when no column has one. If an error occurs it is
    printed and None is returned.
    """
    try:
        is_dataframe(dataframe)
        # next() stops at the first match, so later columns are not scanned.
        first_null_column = next(
            (
                column_name
                for column_name in dataframe.columns
                if dataframe.filter(isnull(column_name)).count() > 0
            ),
            None,
        )
        if first_null_column is None:
            return (False, None)
        return (True, first_null_column)
    except Exception as error:
        print("An error occurred: ", error)

## Ejemplos de uso
Estos son ejemplos de uso en varios datasets.
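Es importante mencionar que la cantidad de columnas del dataframe resultante depende de la cantidad de columnas numéricas del dataframe de entrada: además de las columnas de agrupación y de `t_count`, se agregan una columna `sum_` y una columna `avg_` por cada columna numérica.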

In [11]:
def group_count(dataframe, group_cols):
    """
    Group a PySpark DataFrame and aggregate its numeric columns.

    The result has one row per distinct combination of ``group_cols`` with a
    ``t_count`` column (number of rows in the group) plus a ``sum_<col>`` and
    an ``avg_<col>`` column for every column returned by
    ``sel_num_cols(dataframe)``. Grouping columns are not excluded from that
    list, so a numeric grouping column is summed and averaged as well.
    The result is printed with ``show()`` before it is returned.

    Parameters:
    dataframe (DataFrame): PySpark DataFrame to aggregate.
    group_cols (list): Names of the columns to group by.
    Returns:
    DataFrame: The aggregated DataFrame. If an error occurs it is printed
    and None is returned.
    """
    try:
        is_dataframe(dataframe)
        # Row count first, then a sum/avg pair per numeric column.
        # NOTE: `sum` must resolve to pyspark.sql.functions.sum (the notebook
        # brings it in with a wildcard import), not the Python builtin.
        aggregations = [count("*").alias("t_count")]
        for numeric_col in sel_num_cols(dataframe):
            aggregations.append(sum(numeric_col).alias('sum_' + numeric_col))
            aggregations.append(avg(numeric_col).alias('avg_' + numeric_col))
        # One groupBy/agg builds the whole result; a separate count-only
        # aggregation beforehand would just be discarded.
        df_agrupado = dataframe.groupBy(*group_cols).agg(*aggregations)
        df_agrupado.show()
        return df_agrupado
    except Exception as e:
        print('Ha ocurrido un error al realizar el conteo: ' , e)

In [12]:
# Example 1: aggregate the used-cars data per (brand, model) pair.
# group_count already calls show() on its result, so the explicit show()
# below prints the same table a second time.
lista1 = ['brand' , 'model']
df_conteo1 = group_count(df_cars , lista1)
df_conteo1.show()

+--------+---------------+-------+-------+------------------+---------------+------------------+--------+------------------+-----------------+------------------+
|   brand|          model|t_count|sum__c0|           avg__c0|sum_price (eur)|   avg_price (eur)|sum_year|          avg_year|sum_mileage (kms)| avg_mileage (kms)|
+--------+---------------+-------+-------+------------------+---------------+------------------+--------+------------------+-----------------+------------------+
| Hyundai|            i20|      2|    844|             422.0|          25170|           12585.0|    4037|            2018.5|            70876|           35438.0|
| Citroen|             C4|      5|   2102|             420.4|          42950|            8590.0|   10070|            2014.0|           684234|          136846.8|
|     Kia|           Ceed|     11|   5108| 464.3636363636364|         160990|14635.454545454546|   22204|2018.5454545454545|           628993|57181.181818181816|
| Hyundai|            i30|  

In [13]:
# Example 2: aggregate the Eurocup 2020 results per (team_name_home, pens_home_score) pair.
lista2 = ['team_name_home' , 'pens_home_score']
df_conteo2 = group_count(df_cop , lista2)
df_conteo2.show()

+----------------+---------------+-------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------------+------------------------+------------------------+------------------------+
|  team_name_home|pens_home_score|t_count|sum_team_home_score|avg_team_home_score|sum_team_away_score|avg_team_away_score|sum_total_shots_home|avg_total_shots_home|sum_total_shots_away|avg_total_shots_away|sum_shots_on_target_home|avg_shots_on_target_home|sum_shots_on_target_away|avg_shots_on_target_away|
+----------------+---------------+-------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------------+------------------------+------------------------+------------------------+
|    Switzerland |          False|      1|                  3|                3

In [14]:
# Example 3: aggregate the US holiday dates per (Date, WeekDay) pair.
lista3 = ['Date','WeekDay']
df_conteo3 = group_count(df_dates , lista3)
df_conteo3.show()

+-------------------+---------+-------+---------+---------+-------+-------+--------+--------+
|               Date|  WeekDay|t_count|sum_Month|avg_Month|sum_Day|avg_Day|sum_Year|avg_Year|
+-------------------+---------+-------+---------+---------+-------+-------+--------+--------+
|2016-07-04 00:00:00|   Monday|      1|        7|      7.0|      4|    4.0|    2016|  2016.0|
|2013-12-25 00:00:00|Wednesday|      1|       12|     12.0|     25|   25.0|    2013|  2013.0|
|2013-09-01 00:00:00|   Sunday|      1|        9|      9.0|      1|    1.0|    2013|  2013.0|
|2013-01-01 00:00:00|  Tuesday|      1|        1|      1.0|      1|    1.0|    2013|  2013.0|
|2017-02-20 00:00:00|   Monday|      1|        2|      2.0|     20|   20.0|    2017|  2017.0|
|2004-09-04 00:00:00| Saturday|      1|        9|      9.0|      4|    4.0|    2004|  2004.0|
|2008-11-26 00:00:00|Wednesday|      1|       11|     11.0|     26|   26.0|    2008|  2008.0|
|2007-06-19 00:00:00|  Tuesday|      1|        6|      6.0| 

In [15]:
# Example 4: aggregate the airlines data per (Country, Active) pair.
lista4 = ['Country','Active']
df_conteo4 = group_count(df_airlines, lista4)
df_conteo4.show()

+--------------------+------+-------+--------------+------------------+
|             Country|Active|t_count|sum_Airline ID|    avg_Airline ID|
+--------------------+------+-------+--------------+------------------+
|               Kenya|     Y|      4|         31884|            7971.0|
|         New Zealand|     Y|      4|         18188|            4547.0|
|             ASTORIA|     N|      1|           356|             356.0|
|            Suriname|     N|      5|         15489|            3097.8|
|             Ecuador|     Y|      7|         23823| 3403.285714285714|
|                ACOM|     N|      1|          4456|            4456.0|
|Netherlands Antilles|     N|      4|         23963|           5990.75|
|           Venezuela|     Y|      4|         27058|            6764.5|
|            Thailand|     N|     33|        131639| 3989.060606060606|
|  ATLANTIC NICARAGUA|     N|      1|           592|             592.0|
|              Uganda|     N|     21|         56886|2708.8571428

In [16]:
# Example 5: aggregate the countries.csv data per (county_FIPS, state) pair.
# county_FIPS is itself numeric, so the output also has sum_/avg_ columns for it.
lista5 = ['county_FIPS','state']
df_conteo5 = group_count(df_countries, lista5)
df_conteo5.show()

+-----------+-----+-------+---------------+---------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+--------------------------------------+--------------------------------------+-------------------------------------+-------------------------------------+-----------------------------------------+-----------------------------------------+----------------------------------------+----------------------------------------+
|county_FIPS|state|t_count|sum_county_FIPS|avg_county_FIPS|sum_per_capita_personal_income_2019|avg_per_capita_personal_income_2019|sum_per_capita_personal_income_2020|avg_per_capita_personal_income_2020|sum_per_capita_personal_income_2021|avg_per_capita_personal_income_2021|sum_associate_degree_numbers_2016_2020|avg_associate_degree_numbers_2016_2020|sum_bachelor_degree_numbers_2016_2020|avg_bachelor_