In [15]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import statistics as st

import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
import statsmodels.stats.api as sms

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

%matplotlib inline

In [2]:
def clean_raw_df_production():
    df = pd.read_csv('Data/fao_data_production_indices_data.csv')
    headers = ["Region", "Element Code", "Production", "Year", "Unit", "Dollar Amount", "Value Footnotes", "Category"]
    df.columns = headers
    
    year_med = st.median(df['Year'])
    values = {'Year': year_med, 'Production': 'Gross', 'Unit': 'Int. $'}
    df.fillna(value=values, inplace=True)
    df.dropna(subset=['Dollar Amount'], inplace=True)
    df = df[df['Dollar Amount'] != 0]
    
    df[["Year", "Dollar Amount"]] = df[["Year", "Dollar Amount"]].astype("int")
    df.reset_index(inplace=True)
    
    df.drop(columns=['index', 'Value Footnotes'], inplace=True)

    df.replace({'Gross Production 1999-2001 (1000 I$)':'Gross Production',
                'Net Production 1999-2001 (1000 I$)' : 'Net Production', 
                'Gross PIN (base 1999-2001)' : 'Gross PIN', 
                'Grs per capita PIN (base 1999-2001)':'Gross Per Capita PIN', 
                'Net PIN (base 1999-2001)':'Net PIN', 
                'Net per capita PIN (base 1999-2001)':'Net Per Capita PIN'}, inplace=True)
    
    binary_col = []
    for val in df['Production']:
        if 'Net' in val:
            binary_col.append(0)
        elif 'Gross' in val:
            binary_col.append(1)
        else:
            binary_col.append(-1)
        
    df['Gross/Net Binary'] = binary_col      
                     
    return df

In [3]:
df = clean_raw_df_production()
df.head()

Unnamed: 0,Region,Element Code,Production,Year,Unit,Dollar Amount,Category,Gross/Net Binary
0,Afghanistan,152,Gross Production,2007,1000 Int. $,2486910,agriculture_pin,1
1,Afghanistan,152,Gross Production,2006,1000 Int. $,2278516,agriculture_pin,1
2,Afghanistan,152,Gross Production,2005,1000 Int. $,2524097,agriculture_pin,1
3,Afghanistan,152,Gross Production,2004,1000 Int. $,2226346,agriculture_pin,1
4,Afghanistan,152,Gross Production,2003,1000 Int. $,2289434,agriculture_pin,1


In [4]:
gross_df = df.drop(df[df['Production'] != 'Gross Production'].index)
gross_df.head()

Unnamed: 0,Region,Element Code,Production,Year,Unit,Dollar Amount,Category,Gross/Net Binary
0,Afghanistan,152,Gross Production,2007,1000 Int. $,2486910,agriculture_pin,1
1,Afghanistan,152,Gross Production,2006,1000 Int. $,2278516,agriculture_pin,1
2,Afghanistan,152,Gross Production,2005,1000 Int. $,2524097,agriculture_pin,1
3,Afghanistan,152,Gross Production,2004,1000 Int. $,2226346,agriculture_pin,1
4,Afghanistan,152,Gross Production,2003,1000 Int. $,2289434,agriculture_pin,1


In [5]:
gross_agri_prod = gross_df[['Region', 'Year', 'Dollar Amount', 'Category']]
gross_agri_prod.sample(5)


Unnamed: 0,Region,Year,Dollar Amount,Category
190612,American Samoa,2003,385,livestock_pin
269781,Finland,1972,336,non_food_pin
178676,Spain,1993,19449850,food_pin
75920,Comoros,1984,21052,crops_pin
261567,Central America +,1998,1421420,non_food_pin


In [6]:
em_top_countries = ['China', 'India', 'United States of America', 'Brazil', 'Russian Federation']
fin_df = gross_agri_prod.loc[gross_agri_prod['Region'].apply(lambda x: x in em_top_countries)]
fin_df.reset_index(inplace=True)
fin_df.drop(columns='index', inplace=True)
fin_df.head()

Unnamed: 0,Region,Year,Dollar Amount,Category
0,Brazil,2007,90075170,agriculture_pin
1,Brazil,2006,85319580,agriculture_pin
2,Brazil,2005,87363190,agriculture_pin
3,Brazil,2004,85626940,agriculture_pin
4,Brazil,2003,81198510,agriculture_pin


# Train

In [19]:
df_onehot = fin_df.copy()
df_onehot = pd.get_dummies(df_onehot, columns=['Region', 'Category'], prefix = ['Reg', 'Cat'])
df_onehot.head()

Unnamed: 0,Year,Dollar Amount,Reg_Brazil,Reg_China,Reg_India,Reg_Russian Federation,Reg_United States of America,Cat_agriculture_pin,Cat_crops_pin,Cat_food_pin,Cat_livestock_pin,Cat_non_food_pin
0,2007,90075170,1,0,0,0,0,1,0,0,0,0
1,2006,85319580,1,0,0,0,0,1,0,0,0,0
2,2005,87363190,1,0,0,0,0,1,0,0,0,0
3,2004,85626940,1,0,0,0,0,1,0,0,0,0
4,2003,81198510,1,0,0,0,0,1,0,0,0,0


In [20]:
X = df_onehot.drop(columns='Dollar Amount')
y = df_onehot['Dollar Amount']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [22]:
ols = LinearRegression(fit_intercept=False, normalize=False)

In [23]:
res = ols.fit(X_train, y_train)
print(res)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)


In [25]:
ols.score(X_train, y_train)


0.7287770680157585

In [26]:
ols.score(X_test, y_test)


0.7048909663564513

In [27]:
x_sample = df_onehot.sample(n=1)
x_sample

Unnamed: 0,Year,Dollar Amount,Reg_Brazil,Reg_China,Reg_India,Reg_Russian Federation,Reg_United States of America,Cat_agriculture_pin,Cat_crops_pin,Cat_food_pin,Cat_livestock_pin,Cat_non_food_pin
904,1966,5521158,0,1,0,0,0,0,0,0,0,1


In [30]:
def predict_price(x_sample):

    cols = ['Year', 'Reg_Brazil', 'Reg_China', 'Reg_India',
            'Reg_Russian Federation', 'Reg_United States of America',
            'Cat_agriculture_pin', 'Cat_crops_pin', 'Cat_food_pin',
            'Cat_livestock_pin', 'Cat_non_food_pin']
    
    target = 'Dollar Amount'

    x_sample_final = x_sample[cols]
    return(ols.predict(x_sample_final)[0])

def accuracy(sample_price, predicted):
    return sample_price - predicted

In [31]:
predicted = predict_price(x_sample)
print(predicted)

sample_price = x_sample.values[0][0]
print(sample_price)

13470504.116652489
1966


In [32]:
round(accuracy(sample_price, predicted), 2)

-13468538.12