# Machine Learning Analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import plotly.figure_factory as ff
import plotly.express as px

In [None]:
# Read the crop dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="crop_dataset.csv")
# Preview the first ten rows as the cell output.
data.head(n=10)

In [None]:
# Encode the two categorical columns as integer codes.
#
# Each column gets its own encoder: re-fitting a single LabelEncoder on a
# second column discards the mapping learned for the first one, which would
# make the 'Soilcolor' codes impossible to decode (or to reproduce on new
# data) later on.
soil_encoder = LabelEncoder()
data['Soilcolor'] = soil_encoder.fit_transform(data['Soilcolor'])

# `label_encoder` is fitted on the target column only, so its
# inverse_transform maps integer class ids back to the original label values.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

In [None]:
# Assemble the model's input columns in a fixed order: standalone columns
# first, then every prefix/suffix combination, then the remaining columns.
# NOTE(review): the W/Sp/Su/Au suffixes presumably denote seasons -- confirm
# against the dataset description.
_leading_columns = ['Soilcolor', 'Ph', 'K', 'P', 'N', 'Zn', 'S']
_suffixed_prefixes = ('QV2M', 'T2M_MAX', 'T2M_MIN', 'PRECTOTCORR')
_suffixes = ('W', 'Sp', 'Su', 'Au')
_trailing_columns = ['WD10M', 'GWETTOP', 'CLOUD_AMT', 'WS2M_RANGE', 'PS']

selected_features = (
    _leading_columns
    + [f'{prefix}-{suffix}' for prefix in _suffixed_prefixes for suffix in _suffixes]
    + _trailing_columns
)

# Feature matrix and target vector used by the cells below.
y = data['label']
X = data[selected_features]

In [None]:
# Hold out the test set BEFORE oversampling. Running SMOTE on the full dataset
# first lets synthetic points, interpolated from rows that later end up in the
# training split, land in the test split; that leaks training information into
# the evaluation and inflates every reported metric.
# `X` and `y` are left un-resampled; later cells only use `y` for the class list.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the training split only. SMOTE needs more samples per class than
# `k_neighbors`, and the split has shrunk the smallest class, so cap the
# neighbour count (5 is SMOTE's default) by what the smallest class can support.
min_class_count = int(y_train.value_counts().min())
smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, min_class_count - 1)))
X_train, y_train = smote.fit_resample(X_train, y_train)

# Fit the scaler on training data only, then apply the same transform to the
# untouched test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc

# Fit a random forest on the unscaled training features.
rf_params = dict(n_estimators=300, max_depth=50, random_state=42)
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Per-class probabilities and hard class predictions for the test split.
y_prob_rf = rf_model.predict_proba(X_test)
y_pred_rf = rf_model.predict(X_test)

# Overall fraction of correctly classified test rows.
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")

# Build the precision/recall/F1 report as a dict. `labels` is passed
# explicitly so the report rows always line up one-to-one with `target_names`,
# even if some class happens to be absent from the test split.
class_ids = np.unique(y)
report_rf = classification_report(
    y_test,
    y_pred_rf,
    labels=class_ids,
    target_names=label_encoder.inverse_transform(class_ids),
    output_dict=True,
)

# One row per report entry; the entry names move from the index into a column
# so they can be shown as the first table column.
report_df_rf = pd.DataFrame(report_rf).transpose()
report_df_rf.reset_index(inplace=True)

# Round every numeric column to two decimals in a single vectorised step.
numeric_cols = report_df_rf.select_dtypes(include=[np.number]).columns
report_df_rf[numeric_cols] = report_df_rf[numeric_cols].round(2)

# Zebra striping with exactly one colour per row. Building the list as
# ['lightcyan', 'white'] * (n_rows // 2) comes up one entry short when the
# row count is odd, which breaks the alternation on the last row.
row_colors = [
    'lightcyan' if row % 2 == 0 else 'white'
    for row in range(len(report_df_rf))
]

fig_table = go.Figure(data=[go.Table(
    header=dict(
        values=['Metric', 'Precision', 'Recall', 'F1-score', 'Support'],
        fill_color='darkblue',
        font=dict(color='white', size=14),
        align='center'
    ),
    cells=dict(
        values=[report_df_rf[col] for col in report_df_rf.columns],
        # A single colour column, as in the original layout of this table.
        fill_color=[row_colors],
        font=dict(color='black', size=12),
        align='center'
    ))
])

fig_table.update_layout(title="Random Forest Classification Report", title_x=0.5, title_font=dict(size=20))
fig_table.show()

# Pin the matrix to the full, sorted class list. Without `labels`, the matrix
# only covers classes that occur in y_test / y_pred_rf, so its shape could
# disagree with the `labels` axis names below if a class were missing.
class_ids = np.unique(y)
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=class_ids)

# Original (decoded) class names, in the same order as the matrix rows/columns.
labels = label_encoder.inverse_transform(class_ids)
cm_df = pd.DataFrame(cm_rf, index=labels, columns=labels)

# Heatmap with the raw counts printed inside each cell.
fig_cm = go.Figure(data=go.Heatmap(
    z=cm_df.values,
    x=cm_df.columns,
    y=cm_df.index,
    colorscale="rainbow",
    text=cm_df.values,
    texttemplate="%{text}",
    hoverinfo="text"
))

# Rows are the true classes, columns the predicted classes.
fig_cm.update_layout(
    title="Confusion Matrix (Random Forest)",
    xaxis=dict(title="Predicted Labels"),
    yaxis=dict(title="True Labels"),
    width=800,
    height=600
)

fig_cm.show()

n_classes = len(np.unique(y))

# One-vs-rest ROC curve per class: class `class_idx` is treated as the
# positive label and scored with the matching probability column.
roc_traces = []
for class_idx in range(n_classes):
    is_positive = y_test == class_idx
    class_scores = y_prob_rf[:, class_idx]
    fpr, tpr, _ = roc_curve(is_positive, class_scores)
    roc_auc = auc(fpr, tpr)

    roc_traces.append(
        go.Scatter(
            x=fpr,
            y=tpr,
            mode='lines',
            name=f'Class {labels[class_idx]} (AUC = {roc_auc:.2f})',
            hoverinfo='x+y+name',
        )
    )

# Dashed diagonal as the chance-level reference, drawn after the class curves.
roc_traces.append(
    go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        line=dict(dash='dash', color='black'),
        name='Random Guess',
    )
)

fig_roc = go.Figure(data=roc_traces)
fig_roc.update_layout(
    title="ROC-AUC Curve (Random Forest)",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    width=900,
    height=600,
    legend_title="Classes",
)

fig_roc.show()
