# Loading the data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import category_encoders as ce

In [2]:
df = pd.read_excel('files/food-twentieth-century-crop-statistics-1900-2017-xlsx.xlsx', sheet_name="CropStats")

df_transformed=df.drop(['Unnamed: 0','admin2','notes'], axis=1)
df_transformed['admin1'].fillna(df['admin0'], inplace=True)

for index, row in df_transformed.iterrows():
    if pd.notna(row['hectares (ha)']) and pd.notna(row['production (tonnes)']) and pd.isna(row['yield(tonnes/ha)']) and row['hectares (ha)'] != 0:
        df_transformed.at[index, 'yield(tonnes/ha)'] = row['production (tonnes)'] / row['hectares (ha)']

df_transformed['yield(tonnes/ha)'].bfill(inplace=True)
df_transformed=df_transformed.drop(['hectares (ha)','production (tonnes)'], axis=1)
df_transformed

Unnamed: 0,Harvest_year,admin0,admin1,crop,year,yield(tonnes/ha)
0,1902,Austria,Austria,wheat,1902,1.310000
1,1903,Austria,Austria,wheat,1903,1.470000
2,1904,Austria,Austria,wheat,1904,1.270000
3,1905,Austria,Austria,wheat,1905,1.330000
4,1906,Austria,Austria,wheat,1906,1.280000
...,...,...,...,...,...,...
36702,2013,China,zhejiang,wheat,2013,3.685117
36703,2014,China,zhejiang,wheat,2014,3.768875
36704,2015,China,zhejiang,wheat,2015,3.912027
36705,2016,China,zhejiang,wheat,2016,3.315054


# AWS Model

First we are going to import all the libraries we will need.

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sagemaker.image_uris import retrieve
import sagemaker
from sagemaker import get_execution_role
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
import numpy as np
import pandas as pd
import io
import os

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


This is the dataframe we are going to use to make our model.

In [30]:
df_transformed

Unnamed: 0.1,Unnamed: 0,Harvest_year,admin0,admin1,crop,year,yield(tonnes/ha)
0,0,1902,Austria,Austria,wheat,1902,1.310000
1,1,1903,Austria,Austria,wheat,1903,1.470000
2,2,1904,Austria,Austria,wheat,1904,1.270000
3,3,1905,Austria,Austria,wheat,1905,1.330000
4,4,1906,Austria,Austria,wheat,1906,1.280000
...,...,...,...,...,...,...,...
36702,36702,2013,China,zhejiang,wheat,2013,3.685117
36703,36703,2014,China,zhejiang,wheat,2014,3.768875
36704,36704,2015,China,zhejiang,wheat,2015,3.912027
36705,36705,2016,China,zhejiang,wheat,2016,3.315054


Xgb needs the value we want to predict in the first slot. So we have to move our yield from last to the first column.

In [31]:
cols = df_transformed.columns.tolist()
cols = cols[-1:]+cols[:-1]
df_model = df_transformed[cols]
df_model

Unnamed: 0.1,yield(tonnes/ha),Unnamed: 0,Harvest_year,admin0,admin1,crop,year
0,1.310000,0,1902,Austria,Austria,wheat,1902
1,1.470000,1,1903,Austria,Austria,wheat,1903
2,1.270000,2,1904,Austria,Austria,wheat,1904
3,1.330000,3,1905,Austria,Austria,wheat,1905
4,1.280000,4,1906,Austria,Austria,wheat,1906
...,...,...,...,...,...,...,...
36702,3.685117,36702,2013,China,zhejiang,wheat,2013
36703,3.768875,36703,2014,China,zhejiang,wheat,2014
36704,3.912027,36704,2015,China,zhejiang,wheat,2015
36705,3.315054,36705,2016,China,zhejiang,wheat,2016


Here we can also see the order of the columns.

In [32]:
df_model.columns

Index(['yield(tonnes/ha)', 'Unnamed: 0', 'Harvest_year', 'admin0', 'admin1',
       'crop', 'year'],
      dtype='object')

We split our dataset into a training set, a validation set and a test set.

In [33]:
train, test_and_validate = train_test_split(df_model, test_size=0.2, random_state=42, stratify=df_model['Harvest_year'])

In [34]:
test, validate = train_test_split(test_and_validate, test_size=0.5, random_state=42)

Now we will print the shape of all the sets. As you can see all the sets have the same amount of columns.

In [35]:
print(train.shape)
print(test.shape)
print(validate.shape)

(29365, 7)
(3671, 7)
(3671, 7)


Now we will create a S3 bucket and upload our data to that bucket. I do this because we need our data inside a bucket to make our model.

In [None]:
s3 = boto3.client('s3')
bucket='cropdata-bucket'

prefix='datawizards'

train_file='train.csv'
test_file='test.csv'
validate_file='validate.csv'

s3_resource = boto3.Session().resource('s3')

s3.create_bucket(Bucket=bucket)

def upload_s3_csv(filename, folder, dataframe):
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False)
    s3_resource.Bucket(bucket).Object(os.path.join(prefix, folder, filename)).put(Body=csv_buffer.getvalue())

We upload our train, validate and test set to our bucket using the definition we created before.

In [None]:
upload_s3_csv(train_file, 'train', train)
upload_s3_csv(test_file, 'test', test)
upload_s3_csv(validate_file, 'validate', validate)

Now it is time to train our model. We retrieve the xgboost inside the container.

In [None]:
container = retrieve('xgboost', boto3.Session().region_name,'1.0-1')

We enter the hyperparameters so we can create a regression model.

In [None]:
hyperparams={
    "num_round":"42",
    "eval_metric":"rmse,mae",
    "objective": "reg:squarederror"
}

Now we can make the model using the container and hyperparameter we have created. we also have to define the path towards our S3 bucket.

In [None]:
s3_output_location="s3://{}/{}/output/".format(bucket,prefix)
xgb_model=sagemaker.estimator.Estimator(container,
                                        sagemaker.get_execution_role(),
                                        instance_count=1,
                                        instance_type='ml.m4.xlarge',
                                        output_path=s3_output_location,
                                        hyperparameters=hyperparams,
                                        sagemaker_session=sagemaker.Session()
                                       )

Then we define the training data and the validation data.

In [None]:
train_channel = sagemaker.inputs.TrainingInput("s3://{}/{}/train/".format(bucket,prefix,train_file),
                                               content_type='text/csv'
                                              )

validate_channel = sagemaker.inputs.TrainingInput("s3://{}/{}/validate/".format(bucket,prefix,validate_file),
                                               content_type='text/csv'
                                              )

data_channels = {'train': train_channel, 'validation': validate_channel}

And after all these steps we can train our model by using the training and validation data we just defined.

In [None]:
xgb_model.fit(inputs=data_channels,logs=False)

Now that the model has been created we can deploy it.

In [None]:
xgb_predictor = xgb_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=sagemaker.serializers.CSVSerializer(),
    deserializer=sagemaker.deserializers.CSVDeserializer()
)

Once the model has been deployed we can use our test data to test how accurate the model actually is.

In [None]:
test_data_bytes = test.to_csv(index=False, header=False).encode('utf-8')
predictions = xgb_predictor.predict(test_data_bytes)

actual_values = test['yield(tonnes/ha)']
rmse = np.sqrt(mean_squared_error(actual_values, predictions))
print("Root Mean Squared Error:", rmse)