
# Workshop IPL Causality - Data Preprocess V1

## Data information

- One file for all countries metrics
- One file for yield data
- One file for quality




In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import time
from datetime import datetime
import seaborn as sns
import sys
import os
from sklearn import metrics
# specific 
# add project root folder to sys.path
# sys.path.append(os.path.abspath('..'))
# from shared_utils import utils_functions as uf

In [2]:
# Notebook-wide pandas display settings, applied as (option, value) pairs.
for _option_name, _option_value in (
    ('display.max_columns', None),          # show every column of a DataFrame
    ('display.expand_frame_repr', False),   # do not wrap a wide DataFrame over several lines
    ('display.max_rows', 100),              # cap on the number of rows displayed
    ('display.max_colwidth', 50),           # maximum width of each column
):
    pd.set_option(_option_name, _option_value)

### Data Context

Crop: Maize

Countries: 40 countries (all countries in which maize is one of the top 3 national crops).

Pre-processing: We reuse the data prepared for the clustering experiment, in which all countries were processed for maize, rice and wheat and yield was modeled.

Variables
- Meteo (rain, temperature, radiation)
- FPAR
- Soil Moisture

## Get model input (metrics)
  

In [6]:
# Read the CSV with the best crop mask per crop-country (model input metrics).
csv_metrics = "best_cropmask_crop_country_metrics.csv"
try:
    df_metrics = pd.read_csv(csv_metrics)
except Exception as e:
    # Say which file failed, then re-raise: carrying on would leave
    # `df_metrics` undefined (or stale from an earlier run of this cell),
    # and later cells would fail or silently use old data.
    print(f"Could not read {csv_metrics!r}: {e}")
    raise
print(df_metrics.head())



           crop country  country_id  season_FAOStatyear    season season_type cropmask_option  asap:fpar_max  asap:rain_sum  asap:rad_sum  asap:temp_mean  asap:sm_mean      PCA1
0  Maize (corn)  Angola          92                2001  season_1        main         option2       0.638221     885.816096  3.740097e+06       20.431563      0.249967 -0.966520
1  Maize (corn)  Angola          92                2002  season_1        main         option2       0.644982     975.633915  3.773766e+06       20.763087      0.252809 -0.731392
2  Maize (corn)  Angola          92                2003  season_1        main         option2       0.635370     890.016960  3.769446e+06       20.699059      0.260046 -0.485087
3  Maize (corn)  Angola          92                2004  season_1        main         option2       0.622427     927.546693  3.774797e+06       20.499467      0.264643 -0.769711
4  Maize (corn)  Angola          92                2005  season_1        main         option2       0.643485  

Note: Trend features are obtained from the yield data using `create_trend_features()`.

In [10]:
def create_trend_features(df, num_years, years_to_skip):
    """Return a copy of ``df`` with lagged copies of its ``yield`` column.

    For each ``lag`` in ``1 .. num_years - 1`` a column ``yt_<lag>`` is added
    that holds ``yield`` shifted down by ``lag + years_to_skip`` rows.

    Parameters
    ----------
    df : DataFrame with a ``"yield"`` column. It is copied, not modified.
    num_years : exclusive upper bound of the lag index, so
        ``num_years - 1`` columns are created.
    years_to_skip : extra offset added to every shift.

    Returns
    -------
    The copied frame with the ``yt_*`` columns added. Rows that have no
    value to shift in hold missing values.

    NOTE(review): the shift is by row position, so a lag equals a number of
    years only if rows are in year order with one row per year -- confirm
    that callers guarantee this.
    """
    lagged = df.copy()
    for lag in range(1, num_years):
        # The skipped years push every lag further into the past.
        rows_back = lag + years_to_skip
        lagged.loc[:, f"yt_{lag}"] = lagged["yield"].shift(rows_back)
    return lagged

## Get Yield Data

In [8]:
# Read the yield records.
csv_yield = "yield_data_all_crops_clustering.csv"
try:
    df_yield = pd.read_csv(csv_yield)
except Exception as e:
    # Say which file failed, then re-raise: carrying on would leave
    # `df_yield` undefined (or stale from an earlier run of this cell),
    # and the cells that filter it would fail or silently use old data.
    print(f"Could not read {csv_yield!r}: {e}")
    raise
print(df_yield.head())


  country  country_id          crop  year  yield
0  Angola          92  Maize (corn)  1961  811.3
1  Angola          92  Maize (corn)  1962  811.3
2  Angola          92  Maize (corn)  1963  769.2
3  Angola          92  Maize (corn)  1964  865.4
4  Angola          92  Maize (corn)  1965  937.5


In [15]:
# Row-selection parameters for the single-country example.
my_country_id = 92    # country_id 92 is Angola in the yield preview
initial_year = 1995   # strict lower bound: only rows with year > initial_year are kept
# Trend-feature settings (passed to create_trend_features in a later cell).
years_to_trend = 3
years_to_skip = 1

# Build the two conditions separately, then keep rows that satisfy both.
is_selected_country = df_yield["country_id"] == my_country_id
is_after_initial_year = df_yield["year"] > initial_year
df_x = df_yield[is_selected_country & is_after_initial_year]
print(df_x.shape)
df_x.head()

(27, 5)


Unnamed: 0,country,country_id,crop,year,yield
35,Angola,92,Maize (corn),1996,699.3
36,Angola,92,Maize (corn),1997,595.9
37,Angola,92,Maize (corn),1998,739.6
38,Angola,92,Maize (corn),1999,636.1
39,Angola,92,Maize (corn),2000,574.5


In [16]:
# Add the lagged-yield columns using only the year/yield pair, then drop
# every row that contains a missing value (the first rows have no lagged yield).
year_and_yield = df_x[['year', 'yield']]
trend_x = create_trend_features(
    df=year_and_yield,
    num_years=years_to_trend,
    years_to_skip=years_to_skip,
).dropna()
trend_x.head()

Unnamed: 0,year,yield,yt_1,yt_2
38,1999,636.1,595.9,699.3
39,2000,574.5,739.6,595.9
40,2001,575.4,636.1,739.6
41,2002,670.6,574.5,636.1
42,2003,755.9,575.4,574.5


## Get Quality Data

In [9]:
# Read the per crop-country quality flags.
csv_quality = "crop_country_quality_flags.csv"
try:
    df_quality = pd.read_csv(csv_quality)
except Exception as e:
    # Say which file failed, then re-raise: carrying on would leave
    # `df_quality` undefined (or stale from an earlier run of this cell),
    # and later cells would fail or silently use old data.
    print(f"Could not read {csv_quality!r}: {e}")
    raise
print(df_quality.head())

           Crop    Country  country_id  giews_yield_data_quality  jrc_yield_data_fsv_threshold jrc_crop_calendar_reliability
0  Maize (corn)   Honduras         100                         2                             1                             L
1  Maize (corn)      Nepal         190                         3                             0                             L
2  Maize (corn)   Paraguay         178                         4                             1                             N
3  Maize (corn)  Argentina         166                         4                             3                             N
4  Maize (corn)     Angola          92                         2                             6                             H
