In [11]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import statistics as st

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.model_selection import train_test_split

In [2]:
def clean_raw_df_production(path='Data/fao_data_production_indices_data.csv'):
    """Load the FAO production-indices CSV and return a cleaned DataFrame.

    Steps, in order:
      1. Read the CSV and rename its eight columns positionally.
      2. Fill missing 'Year' with the median year, missing 'Production'
         with 'Gross' and missing 'Unit' with 'Int. $'.
      3. Drop rows whose 'Dollar Amount' is missing or zero.
      4. Cast 'Year' and 'Dollar Amount' to int (fractions are truncated).
      5. Reset the index and drop the 'Value Footnotes' column.
      6. Shorten the long production-element labels.
      7. Add 'Gross/Net Binary': 0 if 'Net' occurs in 'Production',
         1 if 'Gross' occurs, -1 otherwise ('Net' wins if both occur).

    Parameters
    ----------
    path : str, path-like or file-like, optional
        Anything accepted by ``pandas.read_csv``. Defaults to the CSV
        location the notebook has always used. The file must have exactly
        eight columns, since they are renamed by position.

    Returns
    -------
    pandas.DataFrame
        Columns: Region, Element Code, Production, Year, Unit,
        Dollar Amount, Category, Gross/Net Binary; index is 0..n-1.
    """
    df = pd.read_csv(path)
    df.columns = ["Region", "Element Code", "Production", "Year", "Unit",
                  "Dollar Amount", "Value Footnotes", "Category"]

    fill_values = {'Production': 'Gross', 'Unit': 'Int. $'}
    # Series.median() skips NaN. statistics.median() sorts the raw values,
    # and NaN breaks the sort order, so its result was unreliable whenever
    # a year was missing.
    year_med = df['Year'].median()
    if pd.notna(year_med):
        fill_values['Year'] = year_med
    df = df.fillna(value=fill_values)

    df = df.dropna(subset=['Dollar Amount'])
    # Explicit copy: the column assignment below must not write into a
    # slice of the unfiltered frame.
    df = df[df['Dollar Amount'] != 0].copy()

    df[["Year", "Dollar Amount"]] = df[["Year", "Dollar Amount"]].astype("int")
    df = df.reset_index(drop=True).drop(columns=['Value Footnotes'])

    df = df.replace({
        'Gross Production 1999-2001 (1000 I$)': 'Gross Production',
        'Net Production 1999-2001 (1000 I$)': 'Net Production',
        'Gross PIN (base 1999-2001)': 'Gross PIN',
        'Grs per capita PIN (base 1999-2001)': 'Gross Per Capita PIN',
        'Net PIN (base 1999-2001)': 'Net PIN',
        'Net per capita PIN (base 1999-2001)': 'Net Per Capita PIN',
    })

    # Vectorised replacement for the row-by-row loop. Assignment order
    # matters: 'Net' is written last so it takes priority over 'Gross',
    # matching the original if/elif chain.
    production = df['Production'].astype(str)
    has_net = production.str.contains('Net', regex=False)
    has_gross = production.str.contains('Gross', regex=False)
    codes = pd.Series(-1, index=df.index, dtype='int64')
    codes.loc[has_gross] = 1
    codes.loc[has_net] = 0
    df['Gross/Net Binary'] = codes

    return df

In [3]:
# Build the cleaned production frame; the later cells derive their
# filtered frames from `df`.
df = clean_raw_df_production()
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Region,Element Code,Production,Year,Unit,Dollar Amount,Category,Gross/Net Binary
0,Afghanistan,152,Gross Production,2007,1000 Int. $,2486910,agriculture_pin,1
1,Afghanistan,152,Gross Production,2006,1000 Int. $,2278516,agriculture_pin,1
2,Afghanistan,152,Gross Production,2005,1000 Int. $,2524097,agriculture_pin,1
3,Afghanistan,152,Gross Production,2004,1000 Int. $,2226346,agriculture_pin,1
4,Afghanistan,152,Gross Production,2003,1000 Int. $,2289434,agriculture_pin,1


In [4]:
# Keep only the rows labelled exactly 'Gross Production'; every other
# production element is discarded. The result is an independent copy.
is_gross_production = df['Production'] == 'Gross Production'
gross_df = df.loc[is_gross_production].copy()
gross_df.head()

Unnamed: 0,Region,Element Code,Production,Year,Unit,Dollar Amount,Category,Gross/Net Binary
0,Afghanistan,152,Gross Production,2007,1000 Int. $,2486910,agriculture_pin,1
1,Afghanistan,152,Gross Production,2006,1000 Int. $,2278516,agriculture_pin,1
2,Afghanistan,152,Gross Production,2005,1000 Int. $,2524097,agriculture_pin,1
3,Afghanistan,152,Gross Production,2004,1000 Int. $,2226346,agriculture_pin,1
4,Afghanistan,152,Gross Production,2003,1000 Int. $,2289434,agriculture_pin,1


In [5]:
# Narrow the gross-production frame to the four columns used downstream.
# .copy() makes it an independent frame rather than a slice of gross_df.
gross_agri_prod = gross_df[['Region', 'Year', 'Dollar Amount', 'Category']].copy()
# Seeded so the preview is identical on every Restart & Run All; the
# sample size is capped so a frame with fewer than 5 rows does not raise.
gross_agri_prod.sample(n=min(5, len(gross_agri_prod)), random_state=0)


Unnamed: 0,Region,Year,Dollar Amount,Category
214881,Iceland,1964,67117,livestock_pin
245730,Tonga,2001,2972,livestock_pin
173629,Saint Vincent and Grenadines,1972,12359,food_pin
183126,Tunisia,1965,790060,food_pin
190621,American Samoa,1994,385,livestock_pin


In [10]:
# Regions retained in the final frame.
em_top_countries = ['China', 'India', 'United States of America', 'Brazil', 'Russian Federation']
# Series.isin is the vectorised membership test (replaces a per-row lambda).
# reset_index(drop=True) renumbers the rows 0..n-1 in one step and returns
# a new frame, so nothing is mutated in place on a filtered slice.
fin_df = (
    gross_agri_prod[gross_agri_prod['Region'].isin(em_top_countries)]
    .reset_index(drop=True)
)
fin_df.head()

Unnamed: 0,Region,Year,Dollar Amount,Category
0,Brazil,2007,90075170,agriculture_pin
1,Brazil,2006,85319580,agriculture_pin
2,Brazil,2005,87363190,agriculture_pin
3,Brazil,2004,85626940,agriculture_pin
4,Brazil,2003,81198510,agriculture_pin
