# Self chosen model CropData - Wouter Selis
For the crop dataset we found that the best models you can make to predict are regression models and decision tree models. For this assignement we chose to create a decision tree model. 

## Import libraries

In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

## Dataset transformation
Explanation of our dataset transformation is described in the EDA file.

In [2]:
df = pd.read_excel('files/food-twentieth-century-crop-statistics-1900-2017-xlsx.xlsx', sheet_name="CropStats")

df_transformed=df.drop(['Unnamed: 0','admin2','notes'], axis=1)
df_transformed['admin1'].fillna(df['admin0'], inplace=True)

for index, row in df_transformed.iterrows():
    if pd.notna(row['hectares (ha)']) and pd.notna(row['production (tonnes)']) and pd.isna(row['yield(tonnes/ha)']) and row['hectares (ha)'] != 0:
        df_transformed.at[index, 'yield(tonnes/ha)'] = row['production (tonnes)'] / row['hectares (ha)']

df_transformed['yield(tonnes/ha)'].bfill(inplace=True)
df_transformed=df_transformed.drop(['hectares (ha)','production (tonnes)'], axis=1)
df_transformed

Unnamed: 0,Harvest_year,admin0,admin1,crop,year,yield(tonnes/ha)
0,1902,Austria,Austria,wheat,1902,1.310000
1,1903,Austria,Austria,wheat,1903,1.470000
2,1904,Austria,Austria,wheat,1904,1.270000
3,1905,Austria,Austria,wheat,1905,1.330000
4,1906,Austria,Austria,wheat,1906,1.280000
...,...,...,...,...,...,...
36702,2013,China,zhejiang,wheat,2013,3.685117
36703,2014,China,zhejiang,wheat,2014,3.768875
36704,2015,China,zhejiang,wheat,2015,3.912027
36705,2016,China,zhejiang,wheat,2016,3.315054


## OneHotEncoder 
First I need to transform my dataset because the dataset contains string values and a decision tree model prefers to only using numerical values. Since they are not categorical values I will use one-hot encoding to transform these string values to numerical values.

In [3]:
# Create an instance of OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the data
encoded_data = encoder.fit_transform(df_transformed[['admin0', 'admin1', 'crop']])

# Convert the encoded data to a pandas DataFrame
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out(['admin0', 'admin1', 'crop']))

# Get the numerical columns to concatenate with the one-hot encoded df 
df_numerical = df_transformed.drop(['admin0', 'admin1', 'crop'], axis=1)

# Concatenate the encoded data with the original numerical data
final_df = pd.concat([encoded_df, df_numerical], axis=1)

# Print the encoded data
final_df

Unnamed: 0,admin0_Argentina,admin0_Australia,admin0_Austria,admin0_Belgium,admin0_Brazil,admin0_Canada,admin0_Chile,admin0_China,admin0_Croatia,admin0_Czech Republic,...,admin1_zhejiang,admin1_Île-de-France,crop_cereals,crop_maize,crop_spring wheat,crop_wheat,crop_winter wheat,Harvest_year,year,yield(tonnes/ha)
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1902,1902,1.310000
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1903,1903,1.470000
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1904,1904,1.270000
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1905,1905,1.330000
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1906,1906,1.280000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2013,2013,3.685117
36703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2014,2014,3.768875
36704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2015,2015,3.912027
36705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2016,2016,3.315054


## Selecting our features and target.
We want to predict the yield(tonnes/ha) so we drop them from our features list and set it as our target y.

In [4]:
X = final_df.drop('yield(tonnes/ha)', axis=1)
y = final_df['yield(tonnes/ha)']

## Split dataset
We split our data for 80% into a training set and for 20% into a test set.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Here we show the first 10 rows of our training feature set.

In [6]:
X_train.head(10)

Unnamed: 0,admin0_Argentina,admin0_Australia,admin0_Austria,admin0_Belgium,admin0_Brazil,admin0_Canada,admin0_Chile,admin0_China,admin0_Croatia,admin0_Czech Republic,...,admin1_yunnan,admin1_zhejiang,admin1_Île-de-France,crop_cereals,crop_maize,crop_spring wheat,crop_wheat,crop_winter wheat,Harvest_year,year
4282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1972,1972
29199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1915,1915
13630,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1970,1970
25052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017,2017
10726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2014,2014
16233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1907,1907
28559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1931,1931
28209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1941,1941
18490,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2003,2003
13783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1966,1966


Here we show the first 10 rows of our training target set.

In [7]:
y_train.head(10)

4282      0.812162
29199     1.277778
13630     2.488305
25052     3.093568
10726    10.152201
16233     1.143275
28559     0.807018
28209     2.084796
18490     7.030123
13783     2.589182
Name: yield(tonnes/ha), dtype: float64

## Create Decision Tree model
We create our decision tree model.

In [8]:
model = DecisionTreeRegressor()

## Train the model
Here we train our model with our train dataset to prepare it for making predictions. 

In [9]:
model.fit(X_train, y_train)

## Test the model
Now we test our model using our test dataset.

In [10]:
# Maak voorspellingen op de testgegevens
y_pred = model.predict(X_test)

## Accuracy of model
As you can see we get an accuracy of +- 76% with our decision tree model.

In [11]:
# Convert to discrete values
y_pred_discrete = [int(round(x)) for x in y_pred]
y_test_discrete = [int(round(x)) for x in y_test]

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test_discrete, y_pred_discrete)
print("Accuracy:", accuracy)

Accuracy: 0.7578316535004086


## We save the model in a folder
We can save a created model in a folder by using the pickle.dump() function. Now our model will be saved in the files folder so we can use it again later on.

In [12]:
import pickle

# Save the model to disk
filename = 'files/cropdata_chosen_model.sav'
pickle.dump(model, open(filename, 'wb'))

## Load model from folder
To load a model from a folder we can use the pickle.load() function. Now we can use the decision tree model that we made above.

In [13]:
# Load the model from disk
filename = 'files/cropdata_chosen_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model