# Grain Classification using CRISP-DM

## 1. Analysis and Preprocessing

In this section, we load the dataset, inspect its structure and summary statistics, visualize feature distributions and relationships, check for missing values, and standardize the features for modeling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn "whitegrid" theme to every figure in this notebook.
# `sns.set` is only an alias for `sns.set_theme`, which is the preferred
# interface; the alias may be removed in a future seaborn release.
sns.set_theme(style="whitegrid")

### 1.1 Load Dataset

In [None]:
# Column labels for the headerless data file: seven kernel measurements
# followed by the class label.
columns = [
    'Area', 'Perimeter', 'Compactness',
    'Kernel_Length', 'Kernel_Width',
    'Asymmetry_Coeff', 'Kernel_Groove_Length',
    'Class',
]

# Read the raw file. Fields are separated by one or more tab characters,
# which needs a regex separator and therefore the python parsing engine.
df = pd.read_csv(
    'seeds_dataset.txt',
    names=columns,
    header=None,
    sep='\t+',
    engine='python',
)

# Preview the first few records
df.head()

### 1.2 Descriptive Statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

### 1.3 Data Visualization

In [None]:
# One histogram per column of `df`, drawn on a shared grid of subplots.
df.hist(bins=20, figsize=(12, 10))
# The figure created by `df.hist` is the current one; put the overall title on it.
plt.gcf().suptitle('Feature Distributions', fontsize=16)
plt.show()

In [None]:
# Box plot of every column except the last one, grouped by class label,
# laid out on a 3x3 grid.
plt.figure(figsize=(15, 10))
for position, feature in enumerate(df.columns[:-1], start=1):
    ax = plt.subplot(3, 3, position)
    sns.boxplot(data=df, x='Class', y=feature, ax=ax)
    ax.set_title(f'{feature} by Class')
plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots of the columns, coloured by class label.
sns.pairplot(
    data=df,
    hue='Class',
    palette='viridis',
)
plt.show()

### 1.4 Missing Values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

### 1.5 Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the feature columns from the target label.
X = df.drop(columns='Class')
y = df['Class']

# Standardize each feature (zero mean, unit variance).
# NOTE(review): the scaler is fit on the full dataset here; if a train/test
# split follows in the modeling section, confirm whether it should be fit on
# the training rows only to avoid leaking test-set statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# By default `fit_transform` returns a plain NumPy array, so restore both the
# column names and the original row index. Keeping the index matters because
# the `Class` assignment below aligns on index labels: without it, a `df`
# that does not carry a default RangeIndex (e.g. after filtering rows) would
# end up with NaN or mismatched labels.
df_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
df_scaled['Class'] = y

df_scaled.head()