# PR0503. Limpieza de datos sobre el dataset de cultivos

In [1]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
# Explicit schema for the crop-yield CSV, built from (column name, Spark type)
# pairs in declaration order. Every field is declared with nullable=False.
# NOTE(review): "Fertilizar_Used_kg" looks like a typo of "Fertilizer"; the
# spelling is kept as-is because it is the column name the DataFrame exposes.
# NOTE(review): confirm that the file really has no missing values, and whether
# the CSV reader enforces nullable=False at read time.
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("Crop", StringType()),
        ("Region", StringType()),
        ("Soil_Type", StringType()),
        ("Soil_pH", DoubleType()),
        ("Rainfall_mm", DoubleType()),
        ("Temperature_C", DoubleType()),
        ("Humidity_pct", DoubleType()),
        ("Fertilizar_Used_kg", DoubleType()),
        ("Irrigation", StringType()),
        ("Pesticides_Used_kg", DoubleType()),
        ("Planting_Density", DoubleType()),
        ("Previous_Crop", StringType()),
        ("Yield_ton_per_ha", DoubleType()),
    )
])
# Last expression of the cell: display the resulting list of fields.
schema.fields

[StructField('Crop', StringType(), False),
 StructField('Region', StringType(), False),
 StructField('Soil_Type', StringType(), False),
 StructField('Soil_pH', DoubleType(), False),
 StructField('Rainfall_mm', DoubleType(), False),
 StructField('Temperature_C', DoubleType(), False),
 StructField('Humidity_pct', DoubleType(), False),
 StructField('Fertilizar_Used_kg', DoubleType(), False),
 StructField('Irrigation', StringType(), False),
 StructField('Pesticides_Used_kg', DoubleType(), False),
 StructField('Planting_Density', DoubleType(), False),
 StructField('Previous_Crop', StringType(), False),
 StructField('Yield_ton_per_ha', DoubleType(), False)]

In [2]:
from pyspark.sql import SparkSession
# Get (or reuse) a Spark session named "pruebas" attached to the standalone
# master at spark-master:7077.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("pruebas")
    .getOrCreate()
)

# Load the crop-yield CSV with the explicit `schema` defined earlier in the
# notebook; the first line of the file is treated as a header row.
df = spark.read.csv(
    "/workspace/pr0501/crop_yield_dataset.csv",
    schema=schema,
    header=True,
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/22 10:09:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## 1. Creación de un ID único

In [11]:
from pyspark.sql.functions import col, concat_ws, lit, split, upper, lpad
# Crop_ID joins three pieces with "-":
#   1. the literal "CODIGO_",
#   2. the second element of Region split on "_", left-padded with "X" to length 3,
#   3. the crop name in upper case.
# NOTE(review): the literal already ends in "_" and the separator is "-", so
# every ID starts with "CODIGO_-" (e.g. "CODIGO_-XXC-MAIZE"); confirm that the
# doubled separator is intended.
df_eng = df.withColumn(
    "Crop_ID",
    concat_ws(
        "-",
        lit("CODIGO_"),
        lpad(split(col("Region"), "_").getItem(1), 3, "X"),
        upper(col("Crop")),
    ),
)
df_eng.show(5)

+------+--------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+------------------+
|  Crop|  Region|Soil_Type|Soil_pH|Rainfall_mm|Temperature_C|Humidity_pct|Fertilizar_Used_kg|Irrigation|Pesticides_Used_kg|Planting_Density|Previous_Crop|Yield_ton_per_ha|           Crop_ID|
+------+--------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+------------------+
| Maize|Region_C|    Sandy|   7.01|     1485.4|         19.7|        40.3|             105.1|      Drip|              10.2|            23.2|         Rice|          101.48| CODIGO_-XXC-MAIZE|
|Barley|Region_D|     Loam|   5.79|      399.4|         29.1|        55.4|             221.8| Sprinkler|              35.5|             7.4|       Barley|          127.39|CODIGO_-XXD-BARLEY|
|  Rice|Region_C|     Clay|   7.24|      980.

## 2. Transformación matemática

In [12]:
from pyspark.sql.functions import log10, round
# Two independent column updates on df_eng:
#   * Yield_ton_per_ha is overwritten with its value rounded to 2 decimals
#     (`round` is the pyspark.sql.functions import from this cell, which
#     shadows the Python builtin);
#   * Log_Rainfall is appended as log10(Rainfall_mm + 1), so a rainfall of 0
#     maps to 0.
df_eng = df_eng.withColumn(
    "Yield_ton_per_ha", round(col("Yield_ton_per_ha"), 2)
).withColumn(
    "Log_Rainfall", log10(col("Rainfall_mm") + 1)
)
df_eng.show(5)

+------+--------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+------------------+------------------+
|  Crop|  Region|Soil_Type|Soil_pH|Rainfall_mm|Temperature_C|Humidity_pct|Fertilizar_Used_kg|Irrigation|Pesticides_Used_kg|Planting_Density|Previous_Crop|Yield_ton_per_ha|           Crop_ID|      Log_Rainfall|
+------+--------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+------------------+------------------+
| Maize|Region_C|    Sandy|   7.01|     1485.4|         19.7|        40.3|             105.1|      Drip|              10.2|            23.2|         Rice|          101.48| CODIGO_-XXC-MAIZE|3.1721356966495664|
|Barley|Region_D|     Loam|   5.79|      399.4|         29.1|        55.4|             221.8| Sprinkler|              35.5|             7.4|       Barley|      

## 3. Comparación de insumos

## 4. Simulación de fechas