In [20]:
import warnings
warnings.filterwarnings("ignore")

from xai_agg import *

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestRegressor

import pandas as pd
import numpy as np

import dill

In [2]:
raw_data = pd.read_csv('../data/crop_yield.csv')

display(raw_data.head())

# Print dataframe info
display(raw_data.info())

# List categorical values
categorical_features = raw_data.select_dtypes(include=['object']).columns
for column in categorical_features:
    display(raw_data[column].value_counts())

Unnamed: 0,District_Name,Season,Area,Production,Crop
0,NORTH AND MIDDLE ANDAMAN,Rabi,294.5,90.8,Tur
1,SOUTH ANDAMANS,Rabi,20.5,13.2,Tur
2,ANANTAPUR,Kharif,21400.0,2600.0,Tur
3,ANANTAPUR,Kharif,27400.0,9100.0,Tur
4,ANANTAPUR,Kharif,30693.0,7888.0,Tur


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124969 entries, 0 to 124968
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   District_Name  124969 non-null  object 
 1   Season         124969 non-null  object 
 2   Area           124969 non-null  float64
 3   Production     123443 non-null  float64
 4   Crop           124969 non-null  object 
dtypes: float64(2), object(3)
memory usage: 4.8+ MB


None

District_Name
AURANGABAD     541
BELLARY        521
BIJAPUR        504
CHITRADURGA    482
TUMKUR         482
              ... 
LONGDING         4
RAMGARH          2
KHUNTI           2
SHOPIAN          2
MUMBAI           1
Name: count, Length: 645, dtype: int64

Season
Kharif         69366
Rabi           21347
Summer         13141
Whole Year     11648
Autumn          4743
Winter          4724
Name: count, dtype: int64

Crop
Rice                 15104
Maize                13947
Moong(Green Gram)    10318
Urad                  9850
Sesamum               9046
Groundnut             8834
Sugarcane             7921
Wheat                 7899
Tur                   7578
Jowar                 7065
Sunflower             5571
Bajra                 5427
Cotton(lint)          4518
Ragi                  4145
Soyabean              3212
Niger seed            2070
Coconut               1985
Paddy                  479
Name: count, dtype: int64

In [11]:
raw_data.loc[23145]

District_Name     AURANGABAD
Season           Summer     
Area                   100.0
Production             100.0
Crop               Groundnut
Name: 23145, dtype: object

In [36]:
from sklearn.feature_extraction import FeatureHasher

preprocessed_data = raw_data.copy()

# Remove trailing whitespaces from 'Season' column, and one hot encode it
preprocessed_data['Season'] = preprocessed_data['Season'].str.strip()
season_dummies = pd.get_dummies(preprocessed_data['Season'], prefix='Season', drop_first=False, dtype=int)
preprocessed_data = pd.concat([preprocessed_data.drop(columns=['Season']), season_dummies], axis=1)

hashed_column_count = 10
hasher = FeatureHasher(input_type='string', n_features=hashed_column_count)
hashed_features = hasher.transform(preprocessed_data['District_Name'].apply(lambda x: [x]))
hashed_features_df = pd.DataFrame(hashed_features.toarray(), columns=[f'DistrictNameHash_{i}' for i in range(hashed_column_count)])
preprocessed_data = pd.concat([preprocessed_data.drop(columns=['District_Name']), hashed_features_df], axis=1)

# One-Hot-Encode the 'Crop' column
crop_dummies = pd.get_dummies(preprocessed_data['Crop'], prefix='Crop', drop_first=False, dtype=int)
preprocessed_data = pd.concat([preprocessed_data.drop(columns=['Crop']), crop_dummies], axis=1)

preprocessed_data, _ = train_test_split(preprocessed_data, test_size=0.97, random_state=42)

# Remove any row with missing values or Nans
preprocessed_data = preprocessed_data.dropna()

display(preprocessed_data)

Unnamed: 0,Area,Production,Season_Autumn,Season_Kharif,Season_Rabi,Season_Summer,Season_Whole Year,Season_Winter,DistrictNameHash_0,DistrictNameHash_1,...,Crop_Paddy,Crop_Ragi,Crop_Rice,Crop_Sesamum,Crop_Soyabean,Crop_Sugarcane,Crop_Sunflower,Crop_Tur,Crop_Urad,Crop_Wheat
102304,18.0,20.0,0,1,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
17073,6.0,5.0,0,1,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
47299,52093.0,68138.0,0,1,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
15892,15600.0,43700.0,0,1,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
123676,64236.0,203628.0,0,0,1,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,23969.0,37181.0,0,0,1,0,0,0,0.0,-1.0,...,0,0,0,0,0,0,0,0,0,1
103694,4530.0,2870.0,0,0,0,1,0,0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
860,667.0,350.0,0,1,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
15795,215300.0,236800.0,0,1,0,0,0,0,-1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
X = preprocessed_data.drop(columns=['Production'])
y = preprocessed_data['Production']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fit a random forest regressor
clf = RandomForestRegressor(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate error rates
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R^2: {r2}')


MSE: 6995678835542.46
RMSE: 2644934.561674912
MAE: 169830.82649623282
R^2: 0.8763898456532766


In [42]:
from xai_agg.utils import evaluate_aggregate_explainer

results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,                                         # Model and data
    explainer_components_sets=[[LimeWrapper, ShapTabularTreeWrapper, AnchorWrapper]],   # Wrapped explainer sets to be tested
    mcdm_algs=[pymcdm.methods.TOPSIS()],                                                # MCDM algorithms to be tested
    aggregation_algs=["wsum"],                                                          # Aggregation algorithms to be tested
    metrics_sets=[['nrc', 'sensitivity_spearman', 'faithfulness_corr']],                # Metric sets to be tested
    n_instances=1,                                                                      # Number of instances per setting to run the evaluation on
)

AttributeError: 'RandomForestRegressor' object has no attribute 'predict_proba'