# UCI Seed Classification
__UCI Seeds Data Set__
https://archive.ics.uci.edu/ml/datasets/seeds

__K-Nearest Neighbors Implementation__

In [54]:
#### 1. Import neccesary libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import neighbors, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [55]:
### 2. Prepare data set (Headers added separately)
## Features:
# 1.) area: area
# 2.) perimeter: perimeter P
# 3.) compactness: C = 4*pi*A/P^2
# 4.) k_length: length of kernel
# 5.) k_width: width of kernel
# 6.) asymmetry: asymmetry coefficient
# 7.) g_length: length of kernel groove
## Label: class

features = ['area','perimeter', 'compactness', 'k_length', 'k_width', 'asymmetry', 'g_length', 'class']
df = pd.DataFrame(columns=features)

with open("data/seeds_dataset.txt") as file:
    for line in file:
        data = [x for x in line.split('\t') if x]  # Combine elements by tab
        data[-1] = data[-1].strip("\n")  # Remove break space at end of line
        data = [float(i) for i in data]  # Convert types to float
        row = dict(zip(features, data))  # Convert list to dictionary
        df = df.append(row, ignore_index=True)

print(df.head())

    area  perimeter  compactness  k_length  k_width  asymmetry  g_length  \
0  15.26      14.84       0.8710     5.763    3.312      2.221     5.220   
1  14.88      14.57       0.8811     5.554    3.333      1.018     4.956   
2  14.29      14.09       0.9050     5.291    3.337      2.699     4.825   
3  13.84      13.94       0.8955     5.324    3.379      2.259     4.805   
4  16.14      14.99       0.9034     5.658    3.562      1.355     5.175   

   class  
0    1.0  
1    1.0  
2    1.0  
3    1.0  
4    1.0  


In [56]:
### 3. Create training and testing data set

# Split X and Y values
X = df[features]
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # Split the data into training/testing data

# Normalize X data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [59]:
### 4. Create/Train/Predict model

model = neighbors.KNeighborsClassifier(n_jobs=5)  # Define classifier
model.fit(X_train, y_train)  # Train the model

y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_pred)

print(accuracy * 100)

100.0
