In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [34]:
# read data, remove unused column, drop rows with missing values
data = pd.read_csv("data/Crop_recommendation.csv")
data.dropna(inplace=True)
data

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.717340,rice
...,...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507,coffee
2196,99,15,27,27.417112,56.636362,6.086922,127.924610,coffee
2197,118,33,30,24.131797,67.225123,6.362608,173.322839,coffee
2198,117,32,34,26.272418,52.127394,6.758793,127.175293,coffee


In [35]:
def factorize_objs(df: pd.DataFrame):
    for colname, dtype in zip(df.keys(), df.dtypes.to_list()):
        if dtype == np.dtypes.ObjectDType:
            vals, keys = pd.factorize(df[colname])
            df[colname] = vals
            print(keys)

factorize_objs(data)
data

Index(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas', 'mothbeans',
       'mungbean', 'blackgram', 'lentil', 'pomegranate', 'banana', 'mango',
       'grapes', 'watermelon', 'muskmelon', 'apple', 'orange', 'papaya',
       'coconut', 'cotton', 'jute', 'coffee'],
      dtype='object')


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,0
1,85,58,41,21.770462,80.319644,7.038096,226.655537,0
2,60,55,44,23.004459,82.320763,7.840207,263.964248,0
3,74,35,40,26.491096,80.158363,6.980401,242.864034,0
4,78,42,42,20.130175,81.604873,7.628473,262.717340,0
...,...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507,21
2196,99,15,27,27.417112,56.636362,6.086922,127.924610,21
2197,118,33,30,24.131797,67.225123,6.362608,173.322839,21
2198,117,32,34,26.272418,52.127394,6.758793,127.175293,21


In [36]:
# scale rainf

In [37]:
from sklearn.preprocessing import StandardScaler
target = "label"
features = data.keys().to_list()
features.remove("label")

X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], train_size=0.8, random_state=42)

scaler = StandardScaler()

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [38]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=42, hidden_layer_sizes=(16, 64, 32), max_iter=1000)

model.fit(X_train, y_train)

In [40]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

# print(classification_report(y_test, y_pred))
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print(report_df.to_latex())

\begin{tabular}{lrrrr}
\toprule
 & precision & recall & f1-score & support \\
\midrule
0 & 1.000000 & 0.842105 & 0.914286 & 19.000000 \\
1 & 1.000000 & 1.000000 & 1.000000 & 21.000000 \\
2 & 1.000000 & 1.000000 & 1.000000 & 26.000000 \\
3 & 0.909091 & 1.000000 & 0.952381 & 20.000000 \\
4 & 1.000000 & 0.913043 & 0.954545 & 23.000000 \\
5 & 1.000000 & 0.875000 & 0.933333 & 24.000000 \\
6 & 1.000000 & 1.000000 & 1.000000 & 19.000000 \\
7 & 1.000000 & 0.950000 & 0.974359 & 20.000000 \\
8 & 0.733333 & 1.000000 & 0.846154 & 11.000000 \\
9 & 1.000000 & 1.000000 & 1.000000 & 23.000000 \\
10 & 1.000000 & 1.000000 & 1.000000 & 21.000000 \\
11 & 1.000000 & 1.000000 & 1.000000 & 19.000000 \\
12 & 1.000000 & 1.000000 & 1.000000 & 14.000000 \\
13 & 1.000000 & 1.000000 & 1.000000 & 19.000000 \\
14 & 1.000000 & 1.000000 & 1.000000 & 17.000000 \\
15 & 1.000000 & 1.000000 & 1.000000 & 23.000000 \\
16 & 1.000000 & 1.000000 & 1.000000 & 14.000000 \\
17 & 1.000000 & 1.000000 & 1.000000 & 23.000000 \\
18 & 

In [None]:
# import dill
# dill.settings["recurse"] = True

# test = X_test.join(y_test)
# data.to_csv("clean_data/crop_test.csv", index=False)

# with open("models/crop_mlp.modelfile", 'wb') as f:
#     dill.dump(model, f)
