In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Read the crop dataset from the CSV file in the working directory.
df = pd.read_csv("Crop_Data.csv")

# Number of rows per country; as the cell's last expression it is displayed
# as the cell output.
country_counts = df["Country"].value_counts()
country_counts

Nigeria         713
South Africa    468
Kenya           155
Sudan            64
Name: Country, dtype: int64

In [3]:
##  Encoding Categorical Variables:

In [4]:
# Replace the country names in `df['Country']` with integer codes and keep the
# fitted encoder in `label_encoder` (it is saved to disk later so the same
# mapping can be applied to new data).
#
# Guard: the column is overwritten in place, so re-running this cell without
# reloading `df` would refit the encoder on the integer codes and silently
# discard the name -> code mapping. Only fit while the column still holds the
# raw (non-numeric) names; on a re-run the already-fitted encoder is kept.
if not pd.api.types.is_numeric_dtype(df['Country']):
    label_encoder = LabelEncoder()
    df['Country'] = label_encoder.fit_transform(df['Country'])

In [5]:
# Randomly reorder the rows: sampling a fraction of 1 returns every row, and
# the fixed seed makes the resulting order reproducible.
shuffled_df = df.sample(random_state=42, frac=1)

# Bare last expression: display the shuffled frame as the cell output.
shuffled_df

Unnamed: 0,temperature,humidity,ph,water availability,season,label,Country,harvest season
665,27.106068,89.895933,6.698574,37.456806,rainy,mungbean,1,summer
624,28.951724,81.670853,6.510841,56.511033,rainy,mungbean,1,spring
115,18.254054,55.282204,6.204748,63.723582,rainy,maize,2,winter
478,29.490967,67.106044,6.471862,153.250451,rainy,pigeonpeas,2,winter
233,17.848517,19.091729,8.621663,76.324707,winter,chickpea,1,spring
...,...,...,...,...,...,...,...,...
1095,25.287846,89.636679,6.765095,58.286977,summer,watermelon,1,rainy
1130,29.727911,94.297533,6.367801,26.523641,summer,muskmelon,0,rainy
1294,23.438217,78.633888,6.200672,81.150721,winter,cotton,1,rainy
860,23.970814,62.355576,7.007038,53.409060,rainy,lentil,1,spring


In [6]:
# Scale numerical columns (temperature, humidity, pH, water availability)
# with min-max scaling and keep the fitted `scaler` for later reuse.
scaler = MinMaxScaler()
columns_to_scale = ['temperature', 'humidity', 'ph', 'water availability']

# Fit on the untouched values in `df`, taken in shuffled_df's row order, rather
# than on shuffled_df itself. The result is written back into shuffled_df, so
# fitting on shuffled_df would, on a re-run of this cell, refit the scaler on
# already-scaled values and leave a scaler that no longer knows the original
# min/max of each column. `df` is never scaled, so this cell is idempotent.
shuffled_df[columns_to_scale] = scaler.fit_transform(
    df.loc[shuffled_df.index, columns_to_scale]
)

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows contribute to the min/max. Fit on the training rows only if a
# strictly leakage-free evaluation is required.

In [7]:
# Train one random forest per target (crop label, season, harvest season) on
# the same feature columns and report the hold-out accuracy of each target.
# The sklearn names used here are imported in the notebook's first cell.

TARGET_COLUMNS = ['label', 'season', 'harvest season']

X = shuffled_df.drop(TARGET_COLUMNS, axis=1)  # Features: every non-target column
y = shuffled_df[TARGET_COLUMNS]               # All targets, one column per target
y_label = y['label']                          # Target variable: label
y_season = y['season']                        # Target variable: season
y_harvest_season = y['harvest season']        # Target variable: harvest season

# Split features and all targets in ONE call so every target stays row-aligned
# with X_train / X_test. Three separate calls would each overwrite
# X_train / X_test and only line up because they happen to share a seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Per-target views of the split, kept under their individual names.
y_train_label, y_test_label = y_train['label'], y_test['label']
y_train_season, y_test_season = y_train['season'], y_test['season']
y_train_harvest, y_test_harvest = y_train['harvest season'], y_test['harvest season']

# MultiOutputClassifier fits a separate copy of the base estimator for each
# target column. Seeding the forest makes the reported accuracies reproducible
# from run to run.
multi_rf_classifier = MultiOutputClassifier(
    RandomForestClassifier(random_state=42), n_jobs=-1
)

# Fit the multi-output classifier on all target variables simultaneously
multi_rf_classifier.fit(X_train, y_train)

# Predict all target variables simultaneously for test data; the columns of
# `predictions` follow the order of TARGET_COLUMNS.
predictions = multi_rf_classifier.predict(X_test)

# Access individual predictions for each target
predictions_label = predictions[:, 0]    # label prediction
predictions_season = predictions[:, 1]   # season prediction
predictions_harvest = predictions[:, 2]  # harvest season prediction

# Evaluate the predictions
accuracy_label = accuracy_score(y_test_label, predictions_label)
accuracy_season = accuracy_score(y_test_season, predictions_season)
accuracy_harvest = accuracy_score(y_test_harvest, predictions_harvest)

# Print accuracy for each target variable
# NOTE(review): in the recorded output the harvest-season accuracy (~0.26) is
# far below the other two targets; check whether these features carry any
# signal for that target before relying on its predictions.
print(f"Accuracy for label: {accuracy_label}")
print(f"Accuracy for season: {accuracy_season}")
print(f"Accuracy for harvest_season: {accuracy_harvest}")


Accuracy for label: 0.9857142857142858
Accuracy for season: 0.8321428571428572
Accuracy for harvest_season: 0.26071428571428573


In [8]:
from joblib import dump

# Write the fitted model and the label encoder to disk so they can be reloaded
# in another session.
for artifact, filename in (
    (multi_rf_classifier, 'final_model.pkl'),
    (label_encoder, 'label_encoder.pkl'),
):
    dump(artifact, filename)

# The scaler is saved last as a bare expression so that the value returned by
# dump() is shown as the cell output.
dump(scaler, 'minmax_scaler.pkl')

['minmax_scaler.pkl']

In [9]:
from joblib import load

# Reload the three artifacts written by the dump() calls: the model and the
# two preprocessing objects. The lines are left commented out; uncomment them
# when running in a session where the training cells have not been executed.
# NOTE(review): joblib files are pickle-based, so only load files that come
# from a trusted source.
# Load the model from the file
# loaded_model = load('final_model.pkl')
# label_encoder = load('label_encoder.pkl')
# scaler = load('minmax_scaler.pkl')

### Preprocess New Data:
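Before using your trained model to make predictions in production, preprocess the new data with the loaded preprocessing objects (the fitted scaler and label encoder), applying the same transformations that were used during training: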

In [10]:
# # Assuming 'new_data' contains the new input data
# # Apply the same transformations as done in training
# new_data[columns_to_scale] = scaler.transform(new_data[columns_to_scale])
# # Training encoded 'Country' in place, so encode that same column here
# new_data['Country'] = label_encoder.transform(new_data['Country'])

# # Now, use your model to make predictions on this preprocessed data
# predictions = multi_rf_classifier.predict(new_data)
