In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('spark-pract').getOrCreate()

In [4]:
df_spark = spark.read.csv('food_consumption.csv',header=True,inferSchema=True)

In [5]:
df_spark.head(5)

[Row(country='Argentina', food_category='Pork', consumption=10.51, co2_emmission=37.2),
 Row(country='Argentina', food_category='Poultry', consumption=38.66, co2_emmission=41.53),
 Row(country='Argentina', food_category='Beef', consumption=55.48, co2_emmission=1712.0),
 Row(country='Argentina', food_category='Lamb & Goat', consumption=1.56, co2_emmission=54.63),
 Row(country='Argentina', food_category='Fish', consumption=4.36, co2_emmission=6.96)]

In [6]:
df_spark.printSchema()

root
 |-- country: string (nullable = true)
 |-- food_category: string (nullable = true)
 |-- consumption: double (nullable = true)
 |-- co2_emmission: double (nullable = true)



In [20]:
null_age_df = df_spark.filter(df_spark.country.isNull())
null_age_df.count()

10

In [22]:
df_spark.filter(df_spark.country.isNull()).show()

+-------+-------------+-----------+-------------+
|country|food_category|consumption|co2_emmission|
+-------+-------------+-----------+-------------+
|   NULL|         NULL|       NULL|         NULL|
|   NULL|         NULL|       NULL|         NULL|
|   NULL|         NULL|       NULL|         NULL|
|   NULL|         NULL|       NULL|         NULL|
|   NULL|         NULL|       NULL|         NULL|
|   NULL|         NULL|       NULL|         NULL|
|   NULL|         NULL|       NULL|         NULL|
|   NULL|         NULL|       NULL|         NULL|
|   NULL|         NULL|       NULL|         NULL|
|   NULL|         NULL|       NULL|         NULL|
+-------+-------------+-----------+-------------+



In [26]:
df_spark.count()

1430

In [25]:
df_spark.na.drop().count()

1420

In [27]:
### any==how
df_spark.na.drop(how="all").count()

1420

In [29]:
##threshold

df_spark.na.drop(how="any",thresh=3).count()

1420

In [30]:
df_spark.na.drop(how="any",subset=['country']).count()

1420

In [13]:
#Fill missing values

In [15]:
from pyspark.ml.feature import Imputer

In [16]:
df_spark.columns

['country', 'food_category', 'consumption', 'co2_emmission']

In [31]:

imputer_median = Imputer(
    inputCols=['consumption', 'co2_emmission'],
    outputCols=['consumption_imputed', 'co2_emmission_imputed']
).setStrategy("median")

In [32]:
from pyspark.sql.functions import col, count, desc

# Function to get mode
def get_mode(df, col_name):
    return df.groupBy(col_name).count().orderBy(desc("count")).first()[0]

mode_country = get_mode(df_spark, "country")
mode_food_category = get_mode(df_spark, "food_category")


In [33]:
mode_country , mode_food_category

('Russia', 'Wheat and Wheat Products')

In [34]:
df_spark_filled = df_spark.fillna({
    "country": mode_country,
    "food_category": mode_food_category
})


In [35]:
df_final = imputer_median.fit(df_spark_filled).transform(df_spark_filled)

In [37]:
df_final[df_final.country.isNull()].count()

0