In [4]:
import random
from dataclasses import dataclass
from typing import Optional, Sequence, Tuple

import numpy as np
import pandas as pd

In [2]:
# Load the seeds dataset from the CSV file in the working directory.
# The cells below read a "Type" column from this frame and treat every
# column except the last one as a feature.
seeds = pd.read_csv("seeds.csv", sep=",")

In [23]:
@dataclass
class seedsClassification:
    """Summary statistics of one feature column for the rows of one class.

    Instances are built by ``Classificator.fit`` and stored in
    ``Classificator.classification`` under the key ``"<type>.<column>.values"``.
    """

    # Smallest value of the feature among the class's rows.
    seedsMin: float
    # Arithmetic mean of the feature among the class's rows.
    seedsAvg: float
    # Largest value of the feature among the class's rows.
    seedsMax: float


class Classificator:
    """Vote-based classifier built on per-class min / mean / max ranges.

    For every (class, feature) pair ``fit`` stores the class's min, mean and
    max and a label ``"less"`` or ``"more"`` selecting which half of that
    range (min..mean or mean..max) earns the class a vote in ``predict``.
    The class with the most votes is the prediction.
    """

    def __init__(self):
        # Maps "<type>.<column>" -> "less" / "more" and
        # "<type>.<column>.values" -> seedsClassification.
        self.classification = {}
        # Populated by fit(); empty until then so predict() can detect an
        # unfitted instance instead of failing on a missing attribute.
        self.seedTypes = []
        self.columns = []

    def fit(self, df: pd.DataFrame) -> None:
        """Learn the per-class feature ranges from ``df``.

        Args:
            df: Training frame. Must contain a ``"Type"`` column holding the
                class label; every column except the last one is used as a
                feature, so the label is expected to be the last column.

        Raises:
            KeyError: If ``df`` has no ``"Type"`` column.
        """
        # Start from a clean table so a refit never keeps entries that
        # belong to a previous training set.
        self.classification = {}
        self.seedTypes = df["Type"].unique()
        # Loop-invariant: the feature columns do not depend on the class.
        self.columns = df.columns[:-1]

        for seedType in self.seedTypes:
            classRows = df[df["Type"] == seedType]
            for column in self.columns:
                stats = seedsClassification(
                    classRows[column].min(),
                    classRows[column].mean(),
                    classRows[column].max(),
                )
                # The below/above counts are taken over the whole training
                # frame, not only this class's rows.
                # NOTE(review): confirm that counting over `df` rather than
                # `classRows` is intended.
                below = (df[column] < stats.seedsAvg).sum()
                above = (df[column] > stats.seedsAvg).sum()
                self.classification[f"{seedType}.{column}"] = (
                    "less" if below > above else "more"
                )
                self.classification[f"{seedType}.{column}.values"] = stats

    def predict(self, sample: Sequence[float]):
        """Return the class label that collects the most votes for ``sample``.

        Args:
            sample: One row, indexed by position in the same order as the
                feature columns seen by ``fit`` (e.g. a row of
                ``DataFrame.values``). Trailing extra entries such as the
                label are ignored.

        Returns:
            One of the labels seen during ``fit``. On a tie the label that
            appeared first in the training data wins.

        Raises:
            ValueError: If called before ``fit`` (no classes are known).
        """
        if len(self.seedTypes) == 0:
            raise ValueError("Classificator.predict() called before fit()")

        votes = {seedType: 0 for seedType in self.seedTypes}
        for seedType in self.seedTypes:
            for position, column in enumerate(self.columns):
                stats = self.classification[f"{seedType}.{column}.values"]
                value = sample[position]
                # Bounds are exclusive: a value equal to the class min, mean
                # or max casts no vote.
                if self.classification[f"{seedType}.{column}"] == "less":
                    hit = stats.seedsMin < value < stats.seedsAvg
                else:
                    hit = stats.seedsMax > value > stats.seedsAvg
                if hit:
                    votes[seedType] += 1
        return max(votes, key=votes.get)

    def score(self, test_X: pd.DataFrame) -> float:
        """Return the accuracy on ``test_X`` as a percentage (0-100).

        Args:
            test_X: Frame laid out like the training frame; the last value
                of every row is compared with the prediction for that row.

        Returns:
            Share of correctly predicted rows times 100, or ``0.0`` when
            ``test_X`` has no rows (instead of dividing by zero).
        """
        rows = test_X.values
        if len(rows) == 0:
            return 0.0
        good = sum(1 for row in rows if self.predict(row) == row[-1])
        return good / len(rows) * 100


In [24]:

class ProcessingData:
    """Stateless helpers for shuffling, min-max scaling and splitting a frame."""

    @staticmethod
    def shuffleDF(df: pd.DataFrame, seed: Optional[int] = None) -> pd.DataFrame:
        """Return the rows of ``df`` in random order with the index reset.

        Args:
            df: Frame to shuffle; it is not modified.
            seed: Optional seed for a private random generator, making the
                order reproducible. When ``None`` the global ``random``
                module state is used.

        Returns:
            A new frame with the same rows and a fresh 0..n-1 index.
        """
        rng = random if seed is None else random.Random(seed)
        order = rng.sample(range(len(df)), len(df))
        return df.iloc[order].reset_index(drop=True)

    @staticmethod
    def normalizeDF(df: pd.DataFrame, columnNames: list) -> pd.DataFrame:
        """Min-max scale the listed columns to the range 0..1.

        Args:
            df: Source frame; it is left untouched.
            columnNames: Names of the columns to scale.

        Returns:
            A copy of ``df`` in which every listed column is replaced by
            ``(x - min) / (max - min)``. A constant column (max == min) is
            mapped to ``0.0`` rather than producing NaN from 0/0.
        """
        # Work on a copy so the caller's frame is never changed in place.
        scaled = df.copy()
        for columnName in columnNames:
            low = df[columnName].min()
            span = df[columnName].max() - low
            if span == 0:
                scaled[columnName] = 0.0
            else:
                scaled[columnName] = (df[columnName] - low) / span
        return scaled

    @staticmethod
    def splitDF(df: pd.DataFrame, trainSize: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split ``df`` by row position into a leading and a trailing part.

        Args:
            df: Frame to split.
            trainSize: Fraction of rows that go into the first part.

        Returns:
            ``(first, second)`` where ``first`` holds the first
            ``int(len(df) * trainSize)`` rows and ``second`` the rest.
        """
        # Split by position, not by index label, so frames whose index is
        # not 0..n-1 are handled too. Clamping keeps out-of-range fractions
        # meaning "nothing" / "everything" in the first part.
        cut = min(max(int(len(df) * trainSize), 0), len(df))
        return df.iloc[:cut].copy(), df.iloc[cut:].copy()

    @staticmethod
    def processData(
        df: pd.DataFrame,
        columnNames: list,
        trainSize: float,
        seed: Optional[int] = None,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Shuffle, min-max scale and split ``df`` in one call.

        Args:
            df: Source frame; it is not modified.
            columnNames: Columns to scale.
            trainSize: Fraction of rows for the first returned frame.
            seed: Optional shuffle seed, forwarded to ``shuffleDF``.

        Returns:
            ``(train, test)`` frames.
        """
        # NOTE: scaling happens before the split, so min/max are computed
        # over all rows, including those that end up in the test frame.
        shuffled = ProcessingData.shuffleDF(df, seed)
        scaled = ProcessingData.normalizeDF(shuffled, columnNames)
        return ProcessingData.splitDF(scaled, trainSize)
    



In [40]:
# With normalization: shuffle the rows, min-max scale every column except
# the last one, then split 80/20 into train and test frames.
classificator = Classificator()
train_X, test_X = ProcessingData.processData(seeds, seeds.columns[:-1], 0.8)
classificator.fit(train_X)
# Last expression of the cell: accuracy on the test frame, in percent.
# No random seed is set in the visible cells, so the value can differ
# between runs.
classificator.score(test_X)

75.0

In [34]:
# Without normalization: reload the raw data, shuffle it and split 80/20
# with the feature columns left unscaled.
seeds = pd.read_csv("seeds.csv", sep=",")
seeds = ProcessingData.shuffleDF(seeds)
train_X, test_X = ProcessingData.splitDF(seeds, 0.8)
classificator = Classificator()
classificator.fit(train_X)
# Last expression of the cell: accuracy on the test frame, in percent.
# No random seed is set in the visible cells, so the value can differ
# between runs.
classificator.score(test_X)

67.5