In [None]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

# Step 1: Data Preprocessing
# a) Read the uploaded dataset
file_path = '/content/Raisin_Dataset.xlsx'
df = pd.read_excel(file_path)

# b) Display the first 5 rows
print(df.head())

# c) Check for null values
print(df.isnull().sum())  # Per-column count of missing values

# d) Convert the 'Class' column into binary (Kecimen as 0 and Besni as 1)
df['Class'] = df['Class'].map({'Kecimen': 0, 'Besni': 1})

# Series.map turns any label missing from the mapping into NaN. Fail here with
# a clear message instead of letting NaN targets reach chi2 / the classifiers.
if df['Class'].isnull().any():
    raise ValueError(
        "Column 'Class' contains labels other than 'Kecimen' and 'Besni'"
    )

# e) Feature importance using Chi-Square
X = df.drop('Class', axis=1)
y = df['Class']

# chi2 requires non-negative inputs, so rescale every feature to [0, 1] first.
# The scaled copy is used only for scoring; the models train on raw values.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Compute Chi-Square scores
chi_scores, p_values = chi2(X_scaled, y)
feature_importance = pd.DataFrame({'Feature': X.columns, 'Chi2 Score': chi_scores, 'p-value': p_values})
print(feature_importance)

# Discard least important feature based on chi-square value.
# The feature is picked from the computed scores rather than hard-coded, and it
# is dropped from X (not from df) so the 'Class' target never re-enters the
# feature matrix -- rebuilding X from df would leak the label into training.
least_important_feature = feature_importance.loc[
    feature_importance['Chi2 Score'].idxmin(), 'Feature'
]
X = X.drop(columns=[least_important_feature])

# Step 2: Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Step 3: Baseline decision tree with default hyper-parameters
# (fit() returns the estimator itself, so construction and training can be chained)
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Step 4: Score the baseline tree on the held-out rows
y_pred = clf.predict(X_test)
baseline_confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', baseline_confusion)
baseline_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', baseline_accuracy)
baseline_precision = precision_score(y_test, y_pred)
print('Precision:', baseline_precision)
baseline_recall = recall_score(y_test, y_pred)
print('Recall:', baseline_recall)

# Step 5: Repeat the experiment with alternative split criteria
# Entropy criterion
clf_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)
y_pred_entropy = clf_entropy.predict(X_test)
entropy_accuracy = accuracy_score(y_test, y_pred_entropy)
print('Entropy - Accuracy:', entropy_accuracy)

# Log-loss criterion
clf_log_loss = DecisionTreeClassifier(criterion='log_loss', random_state=42).fit(X_train, y_train)
y_pred_log_loss = clf_log_loss.predict(X_test)
log_loss_accuracy = accuracy_score(y_test, y_pred_log_loss)
print('Log Loss - Accuracy:', log_loss_accuracy)

# Step 6: Hyper-parameter search over depth, split size and feature subset
param_grid = dict(
    max_depth=[10, 100],
    min_samples_split=[4, 6, 8],
    max_features=['sqrt', 'log2', None],  # 'auto' intentionally left out
)

# Exhaustive search scored by accuracy under 5-fold cross-validation on the training split
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Report the winning combination and keep the corresponding fitted estimator
print('Best Parameters:', grid_search.best_params_)
best_model = grid_search.best_estimator_

# Step 7: Score the tuned tree on the same held-out rows
y_pred_best = best_model.predict(X_test)
best_confusion = confusion_matrix(y_test, y_pred_best)
print('Best Model - Confusion Matrix:\n', best_confusion)
best_accuracy = accuracy_score(y_test, y_pred_best)
print('Best Model - Accuracy:', best_accuracy)
best_precision = precision_score(y_test, y_pred_best)
print('Best Model - Precision:', best_precision)
best_recall = recall_score(y_test, y_pred_best)
print('Best Model - Recall:', best_recall)


    Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0  87524       442.246011       253.291155      0.819738       90546   
1  75166       406.690687       243.032436      0.801805       78789   
2  90856       442.267048       266.328318      0.798354       93717   
3  45928       286.540559       208.760042      0.684989       47336   
4  79408       352.190770       290.827533      0.564011       81463   

     Extent  Perimeter    Class  
0  0.758651   1184.040  Kecimen  
1  0.684130   1121.786  Kecimen  
2  0.637613   1208.575  Kecimen  
3  0.699599    844.162  Kecimen  
4  0.792772   1073.251  Kecimen  
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64
           Feature  Chi2 Score       p-value
0             Area   40.913845  1.590901e-10
1  MajorAxisLength   34.625936  3.995473e-09
2  MinorAxisLength   14.726018  1.243189e-04
3 