# Grain Classification using CRISP-DM

## 1. Analysis and Preprocessing

In this section, we will load the dataset, analyze its structure, visualize distributions and relationships, and preprocess the data for modeling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style: apply seaborn's "whitegrid" look globally,
# so every plot drawn later in the notebook shares it
sns.set(style="whitegrid")

### 1.1 Load Dataset

In [None]:
# The data file has no header row, so the names are supplied here:
# seven kernel measurements followed by the class label.
columns = [
    'Area', 'Perimeter', 'Compactness',
    'Kernel_Length', 'Kernel_Width',
    'Asymmetry_Coeff', 'Kernel_Groove_Length',
    'Class',
]

# Fields are split on runs of one or more tabs; a regex separator like this
# needs pandas' Python parsing engine.
df = pd.read_csv(
    'seeds_dataset.txt',
    sep='\t+',
    engine='python',
    header=None,
    names=columns,
)

# Preview the first rows
df.head()

### 1.2 Descriptive Statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the loaded frame
df.info()

### 1.3 Data Visualization

In [None]:
# One 20-bin histogram per numeric column, laid out on a single 12x10 figure
df.hist(bins=20, figsize=(12, 10))
# df.hist leaves its figure as the current one, so title it through gcf()
plt.gcf().suptitle('Feature Distributions', fontsize=16)
plt.show()

In [None]:
# Per-class boxplot of every column except the last one (the 'Class' label),
# placed left-to-right, top-to-bottom in a 3x3 grid
fig = plt.figure(figsize=(15, 10))
for position, feature in enumerate(df.columns[:-1], start=1):
    ax = fig.add_subplot(3, 3, position)
    sns.boxplot(data=df, x='Class', y=feature, ax=ax)
    ax.set_title(f'{feature} by Class')
fig.tight_layout()
plt.show()

In [None]:
# Scatter-plot matrix of all column pairs, coloured by the 'Class' label
pair_grid = sns.pairplot(data=df, hue='Class', palette='viridis')
plt.show()

### 1.4 Missing Values

In [None]:
# Number of missing entries in each column
df.isna().sum()

### 1.5 Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the feature columns from the class label
X = df.drop(columns='Class')
y = df['Class']

# Standardise every feature to zero mean and unit variance.
# NOTE: this scaler is fit on ALL rows, so `X_scaled` / `df_scaled` carry
# statistics from the whole dataset. That is fine for exploration, but for
# model evaluation a scaler should be fit on the training rows only,
# otherwise information about the test rows leaks into training.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reuse X's index so the label column below aligns row-for-row with the
# scaled features even if `df` does not have a default 0..n-1 index
# (assignment of a Series aligns on index labels, not on position).
df_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
df_scaled['Class'] = y

df_scaled.head()

## 2. Model Implementation and Comparison

We will implement and compare the following algorithms:
- K-Nearest Neighbors (KNN)
- Support Vector Machine (SVM)
- Random Forest
- Naive Bayes
- Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Imported here as well so this cell does not depend on the scaling cell's import
from sklearn.preprocessing import StandardScaler

# Split the RAW features first. Stratifying on the label keeps the class
# proportions the same in the train and test parts, which matters for a
# small multi-class dataset.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows. Scaling before splitting would leak test-set mean/variance into
# training and make the reported accuracies optimistic.
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Initialize models (default hyperparameters; seeds fixed where the
# estimator is stochastic so results are reproducible)
models = {
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42)
}

# Train each model on the training split and evaluate it on the held-out
# split; `results` maps model name -> test accuracy for the comparison plot.
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print(f"--- {name} ---")
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")

In [None]:
# Compare performance: one bar per model, height = test accuracy
plt.figure(figsize=(10, 6))
sns.barplot(x=list(results.keys()), y=list(results.values()))
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')

# Zoom the y-axis to make small differences visible, but never cut off a bar:
# start at 0.8 unless some model scores lower, in which case start a little
# below the weakest model (and never below 0).
lowest_accuracy = min(results.values(), default=0.8)
plt.ylim(max(0.0, min(0.8, lowest_accuracy - 0.05)), 1.0)
plt.show()

## 3. Model Optimization

We will use grid search with cross-validation (`GridSearchCV`) to tune the hyperparameters of the SVM and Random Forest models, and then re-evaluate the tuned models on the test set.

In [None]:
from sklearn.model_selection import GridSearchCV

# SVM Optimization
# `gamma` only affects the RBF kernel here; the linear kernel ignores it.
# Giving each kernel its own grid avoids refitting identical linear models
# once per gamma value (20 candidates instead of 32).
param_grid_svm = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001]
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100]
    }
]

# refit=True retrains the best configuration on all of X_train so that
# grid_svm can be used directly for prediction afterwards.
grid_svm = GridSearchCV(SVC(), param_grid_svm, refit=True, verbose=2)
grid_svm.fit(X_train, y_train)

print("Best SVM Parameters:", grid_svm.best_params_)
print("Best SVM Score:", grid_svm.best_score_)

In [None]:
# Random Forest Optimization: exhaustive search over forest size, tree depth
# and the minimum number of samples needed to split a node
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# The seed is fixed on the estimator so every candidate is reproducible;
# refit=True retrains the winning configuration on all of X_train.
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    refit=True,
    verbose=2,
)
grid_rf.fit(X_train, y_train)

print("Best Random Forest Parameters:", grid_rf.best_params_)
print("Best Random Forest Score:", grid_rf.best_score_)

In [None]:
# Evaluate Optimized Models: score each refitted grid-search winner on the
# held-out test split and print its per-class report
optimized_predictions = {}
for label, search in (("SVM", grid_svm), ("Random Forest", grid_rf)):
    print(f"--- Optimized {label} ---")
    optimized_predictions[label] = search.predict(X_test)
    print(classification_report(y_test, optimized_predictions[label]))

# Keep the per-model prediction arrays available under their usual names
y_pred_svm = optimized_predictions["SVM"]
y_pred_rf = optimized_predictions["Random Forest"]