#### Build multi-class classification models to predict the type of `"crop"` and identify the single most importance feature for predictive performance.

- Find the feature in the dataset that produces the best score for predicting 'crop".
- From this information, create a variable called best_predictive_feature, which
should be a dictionary containing the best predictive feature name as a key and the evaluation score (for the metric you chose) as the value.

In [1]:
# All required libraries are imported here for you.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
import seaborn as sns
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline

# Configure
random_seed = 1008

# Define a dict for feature's meanings
feature_meaning = {
'N': 'Nitrogen content ratio in the soil',
'P': 'Phosphorous content ratio in the soil',
'K': 'Potassium content ratio in the soil',
'pH': 'pH value of the soil', 
}

# Load the dataset
crops = pd.read_csv('data/soil_measures.csv')
print(crops.head())


    N   P   K        ph  crop
0  90  42  43  6.502985  rice
1  85  58  41  7.038096  rice
2  60  55  44  7.840207  rice
3  74  35  40  6.980401  rice
4  78  42  42  7.628473  rice


In [2]:
# Inspect
print(crops.info())
print(crops.describe())
print(crops['crop'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   N       2200 non-null   int64  
 1   P       2200 non-null   int64  
 2   K       2200 non-null   int64  
 3   ph      2200 non-null   float64
 4   crop    2200 non-null   object 
dtypes: float64(1), int64(3), object(1)
memory usage: 86.1+ KB
None
                 N            P            K           ph
count  2200.000000  2200.000000  2200.000000  2200.000000
mean     50.551818    53.362727    48.149091     6.469480
std      36.917334    32.985883    50.647931     0.773938
min       0.000000     5.000000     5.000000     3.504752
25%      21.000000    28.000000    20.000000     5.971693
50%      37.000000    51.000000    32.000000     6.425045
75%      84.250000    68.000000    49.000000     6.923643
max     140.000000   145.000000   205.000000     9.935091
crop
rice           100
maize          100
ju

In [3]:
# Split data
X = crops.drop(columns='crop')
y = crops['crop']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=random_seed)

# Scale
scaler = StandardScaler()

In [4]:
# Instanciate estimator
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000000, random_state=random_seed)

# Instanciate pipeline
pipe = make_pipeline(scaler, logreg)

# Train model
pipe.fit(X_train, y_train);
# Predict
y_pred = pipe.predict(X_test)
# Score
score = accuracy_score(y_test, y_pred)

# Find the larges coef
model = pipe.named_steps['logisticregression']
avg_coefs = np.mean(np.abs(model.coef_), axis=0)
best_coef_idx = list(avg_coefs).index(max(avg_coefs))
best_feature = crops.columns[best_coef_idx]
best_predictive_feature = {best_feature:score}

# Print best preditive feature and score
print(f'The best predictive feature is "{best_feature}, {feature_meaning[best_feature]}" \
with a score of {score:.4f}')

The best predictive feature is "K, Potassium content ratio in the soil" with a score of 0.6364
