# **Crop Prediction Using Random Forests**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
# ---------------------------------------------------------------------------
# Load the dataset, engineer per-(Crop, Season) features and train the
# production regressor. Names left in scope for later cells:
# data, seasonal_data, X, y, X_train, X_test, y_train, y_test, clf.
# ---------------------------------------------------------------------------
RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible

data = pd.read_csv('AgrcultureDataset.csv')
data = data.head(220000).copy()  # for limiting number of rows

# Strip surrounding whitespace from the categorical labels so the one-hot
# column names created below (e.g. 'Season_Whole Year') match the labels a
# user types at prediction time. Non-string values are left untouched.
# NOTE(review): whether the CSV actually pads these labels is not visible
# from this notebook; the strip is a no-op when it does not.
for label_col in ['District_Name', 'Season', 'Crop']:
    data[label_col] = data[label_col].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

# Non-numeric production entries become NaN and are dropped further down.
data['Production'] = pd.to_numeric(data['Production'], errors='coerce')

# Engineer new features: mean Area / Production per (Crop, Season), plus each
# row's value relative to that mean.
# NOTE: the means are computed before the train/test split, so MeanProduction
# still sees the targets of rows that end up in the test set.
seasonal_data = data.groupby(['Crop', 'Season'], as_index=False).agg(
    {'Area': 'mean', 'Production': 'mean'}
)
seasonal_data.columns = ['Crop', 'Season', 'MeanArea', 'MeanProduction']
data = pd.merge(data, seasonal_data, on=['Crop', 'Season'], how='left')
data['SeasonalAreaRatio'] = data['Area'] / data['MeanArea']
data['SeasonalProductionRatio'] = data['Production'] / data['MeanProduction']

data = data.dropna(subset=['Production'])
data = pd.get_dummies(data, columns=['District_Name', 'Season', 'Crop'])

# Update feature matrix.
# 'SeasonalProductionRatio' is Production / MeanProduction; keeping it next to
# 'MeanProduction' would let the model reconstruct the target exactly, and the
# value is unknown at prediction time. It stays in `data` but not in X.
X = data.drop(
    ['Production', 'SeasonalProductionRatio', 'State_Name', 'Crop_Year'],
    axis=1,
)
y = data['Production']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# y needs no imputation: rows with missing Production were dropped above.
# Impute features with the training means only, and apply the same values to
# the held-out split so it can be scored later.
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

clf = RandomForestRegressor(
    n_estimators=50,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=RANDOM_STATE,
)
clf.fit(X_train, y_train)
# ---------------------------------------------------------------------------
# Recommend a crop: build one candidate row per crop for the requested
# district / area / season, predict production for each, pick the largest.
# Relies on X, data, seasonal_data, clf and the train/test split from above.
# ---------------------------------------------------------------------------
# Update input data construction
input_location = 'VELLORE'
input_area = 560
input_season = 'Whole Year'

district_name_col = f'District_Name_{input_location}'
if district_name_col not in X.columns:
    raise ValueError(f'Invalid district name: {input_location}')

# Validate the season the same way as the district. Without this check an
# unknown season is silently dropped by the reindex below and every candidate
# row is scored with no season set. Matching ignores surrounding whitespace in
# case the labels in the data are padded.
season_prefix = 'Season_'
season_label = input_season.strip()
matching_season_cols = [
    col for col in X.columns
    if col.startswith(season_prefix)
    and col[len(season_prefix):].strip() == season_label
]
if not matching_season_cols:
    raise ValueError(f'Invalid season: {input_season}')
season_col = matching_season_cols[0]

crop_prefix = 'Crop_'
crop_columns = [col for col in X.columns if col.startswith(crop_prefix)]
#print(crop_columns)

# Seasonal means for the requested season, indexed by crop, built once
# instead of filtering seasonal_data again for every crop.
season_mask = seasonal_data['Season'].astype(str).str.strip() == season_label
season_stats = seasonal_data[season_mask].drop_duplicates('Crop').set_index('Crop')

input_data = []
for crop in crop_columns:
    # Slice off the prefix rather than split('_'), which would truncate any
    # crop name that itself contains an underscore.
    crop_name = crop[len(crop_prefix):]
    if crop_name in season_stats.index:
        mean_area_value = season_stats.at[crop_name, 'MeanArea']
        mean_production_value = season_stats.at[crop_name, 'MeanProduction']
    else:
        # Crop never recorded in this season: fall back to its means over
        # all seasons.
        crop_rows = data.loc[data[crop] == 1, ['Area', 'Production']]
        mean_area_value = crop_rows['Area'].mean()
        mean_production_value = crop_rows['Production'].mean()

    # A zero mean area would give inf, which predict() rejects; use NaN so
    # the column-mean imputation below handles it.
    if mean_area_value:
        seasonal_area_ratio_value = input_area / mean_area_value
    else:
        seasonal_area_ratio_value = float('nan')

    row = {
        district_name_col: 1,
        'Area': input_area,
        season_col: 1,
        # The model was trained on these two columns; leaving them out makes
        # the reindex below fill them with 0 for every candidate.
        'MeanArea': mean_area_value,
        'MeanProduction': mean_production_value,
        'SeasonalAreaRatio': seasonal_area_ratio_value,
        # Neutral value (production equal to the seasonal mean). Only used
        # when X still contains this target-derived column; otherwise the
        # reindex below discards it.
        'SeasonalProductionRatio': 1.0,
    }
    # Explicit zeros for the other crops: keys missing from a row would
    # become NaN in the DataFrame and then be mean-imputed.
    row.update({c: int(c == crop) for c in crop_columns})
    input_data.append(row)

# Align to the training feature order; features not set above become 0.
input_data = pd.DataFrame(input_data).reindex(columns=X.columns, fill_value=0)
input_data = input_data.fillna(input_data.mean())

predicted_production = clf.predict(input_data)
recommended_crop_index = predicted_production.argmax()
recommended_crop = crop_columns[recommended_crop_index][len(crop_prefix):]

# score() on a regressor is the coefficient of determination (R^2), not a
# classification accuracy, so label it as such and also report the held-out
# split, which was otherwise never used.
train_accuracy = clf.score(X_train, y_train)
print(f'Training R^2 score: {train_accuracy:.2f}')
test_r2 = clf.score(X_test.fillna(X_train.mean()), y_test)
print(f'Test R^2 score: {test_r2:.2f}')
print(f'The recommended crop for location: {input_location}, area: {input_area}, and season: {input_season} is {recommended_crop}.')

In [None]:
import pickle
# Save the trained model to disk. The context manager closes (and flushes)
# the file even if pickling fails, so the file is complete before a later
# cell reads it back.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# ---------------------------------------------------------------------------
# Reload the pickled model and recommend a crop for a new query.
# Relies on X, data and seasonal_data from the training cell.
# ---------------------------------------------------------------------------
# Load the saved model
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files that this notebook produced itself.
with open('finalized_model.sav', 'rb') as file:
    model = pickle.load(file)

input_location = 'NAGAON'
input_area = 6000
input_season = 'Whole Year'

district_name_col = f'District_Name_{input_location}'
if district_name_col not in X.columns:
    raise ValueError(f'Invalid district name: {input_location}')

# Validate the season the same way as the district. Without this check an
# unknown season is silently dropped by the reindex below and every candidate
# row is scored with no season set. Matching ignores surrounding whitespace in
# case the labels in the data are padded.
season_prefix = 'Season_'
season_label = input_season.strip()
matching_season_cols = [
    col for col in X.columns
    if col.startswith(season_prefix)
    and col[len(season_prefix):].strip() == season_label
]
if not matching_season_cols:
    raise ValueError(f'Invalid season: {input_season}')
season_col = matching_season_cols[0]

crop_prefix = 'Crop_'
crop_columns = [col for col in X.columns if col.startswith(crop_prefix)]
#print(crop_columns)

# Seasonal means for the requested season, indexed by crop, built once
# instead of filtering seasonal_data again for every crop.
season_mask = seasonal_data['Season'].astype(str).str.strip() == season_label
season_stats = seasonal_data[season_mask].drop_duplicates('Crop').set_index('Crop')

input_data = []
for crop in crop_columns:
    # Slice off the prefix rather than split('_'), which would truncate any
    # crop name that itself contains an underscore.
    crop_name = crop[len(crop_prefix):]
    if crop_name in season_stats.index:
        mean_area_value = season_stats.at[crop_name, 'MeanArea']
        mean_production_value = season_stats.at[crop_name, 'MeanProduction']
    else:
        # Crop never recorded in this season: fall back to its means over
        # all seasons.
        crop_rows = data.loc[data[crop] == 1, ['Area', 'Production']]
        mean_area_value = crop_rows['Area'].mean()
        mean_production_value = crop_rows['Production'].mean()

    # A zero mean area would give inf, which predict() rejects; use NaN so
    # the column-mean imputation below handles it.
    if mean_area_value:
        seasonal_area_ratio_value = input_area / mean_area_value
    else:
        seasonal_area_ratio_value = float('nan')

    row = {
        district_name_col: 1,
        'Area': input_area,
        season_col: 1,
        # The model was trained on these two columns; leaving them out makes
        # the reindex below fill them with 0 for every candidate.
        'MeanArea': mean_area_value,
        'MeanProduction': mean_production_value,
        'SeasonalAreaRatio': seasonal_area_ratio_value,
        # Neutral value (production equal to the seasonal mean). Only used
        # when X still contains this target-derived column; otherwise the
        # reindex below discards it.
        'SeasonalProductionRatio': 1.0,
    }
    # Explicit zeros for the other crops: keys missing from a row would
    # become NaN in the DataFrame and then be mean-imputed.
    row.update({c: int(c == crop) for c in crop_columns})
    input_data.append(row)

# Align to the training feature order; features not set above become 0.
input_data = pd.DataFrame(input_data).reindex(columns=X.columns, fill_value=0)
input_data = input_data.fillna(input_data.mean())
#print(input_data.head())

# Make predictions: one predicted production per candidate crop; recommend
# the crop with the largest prediction.
predictions = model.predict(input_data)
recommended_crop_index = predictions.argmax()
recommended_crop = crop_columns[recommended_crop_index][len(crop_prefix):]
print(f'The recommended crop for location: {input_location}, area: {input_area}, and season: {input_season} is {recommended_crop}.')