In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score

from common.data_frame_columns import PM10, PM2_5, PM1
from common.date_time_helper import convert_to_datetime
from common.endpoints_urls import endpoints_config
from common.working_dataset_config import working_datetime_strings_5_months, test_date_time_strings
from data_management.data_crawler import DataManager
from data_management.data_reshaper import reshape_data, prepare_dataset, flatten_data
from data_management.labeled_data_generator import LabeledDataGenerator, DataLabel
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [5]:

# Fetch the station measurements for every configured endpoint.
# NOTE(review): the positional True and update=False are interpreted by
# DataManager; presumably they prefer locally cached data over a fresh
# download -- confirm against data_management.data_crawler.
datas = DataManager(True).get_all_endpoints_data(endpoints_config, update=False)

# Training window: entries 0 and 1 of the 5-month working range, as datetimes.
date_strings = working_datetime_strings_5_months
training_dates = [convert_to_datetime(date_strings[i]) for i in (0, 1)]

# Test window: built the same way from the dedicated test range.
test_dates_string = test_date_time_strings
test_dates = [convert_to_datetime(test_dates_string[i]) for i in (0, 1)]

# One pollutant column drives both the label generator and the feature list.
column = PM10
columns = [column]

L = LabeledDataGenerator(column)
# NOTE(review): the trailing 40 is a tuning argument of generate_labeled_data
# whose meaning is not visible in this notebook -- confirm and consider naming it.
prepared_data = L.generate_labeled_data(datas, *training_dates, 40)
test_data = L.generate_labeled_data(datas, *test_dates, 40)

Loading station data: Gronie  https://datahub.ki.agh.edu.pl/api/endpoints/70/data/
    # Minimal data: 2022-07-13 23:38:02+00:00
    # Maximal data: 2024-04-24 18:19:11+00:00
Loading station data: Urząd Gminy  https://datahub.ki.agh.edu.pl/api/endpoints/71/data/


  df = pd.read_csv(filename)


    # Minimal data: 2021-10-07 18:51:17+00:00
    # Maximal data: 2024-04-24 18:24:28+00:00
Loading station data: Młynne  https://datahub.ki.agh.edu.pl/api/endpoints/72/data/
    # Minimal data: 2021-10-07 19:17:59+00:00
    # Maximal data: 2024-04-24 18:21:45+00:00
Loading station data: Sucharskiego  https://datahub.ki.agh.edu.pl/api/endpoints/73/data/
    # Minimal data: 2021-10-07 19:41:43+00:00
    # Maximal data: 2024-04-24 18:21:44+00:00
Loading station data: Twardowskiego  https://datahub.ki.agh.edu.pl/api/endpoints/74/data/
    # Minimal data: 2021-10-07 20:59:56+00:00
    # Maximal data: 2024-04-24 18:21:51+00:00
Loading station data: Konopnickiej  https://datahub.ki.agh.edu.pl/api/endpoints/75/data/
    # Minimal data: 2021-10-07 21:07:07+00:00
    # Maximal data: 2024-04-24 18:22:09+00:00
Finished loading data 
 
Daily datas: 3090
Generated anomalies: 1300
    NORMAL: 1790
    EXTINCTION: 254
    ZEROS_IN_RANGE: 258
    NOISE: 257
    SCALED: 280
    RANDOM_ZEROS: 251

Daily

In [40]:
class KNNClassifier:
    """Small wrapper around scikit-learn's ``KNeighborsClassifier``.

    Both public methods take labeled data plus a list of columns, turn them
    into features and labels with ``prepare_dataset`` / ``flatten_data``, and
    then either fit the estimator or score it.
    """

    def __init__(self, neighbours):
        """Create the underlying estimator.

        Args:
            neighbours: Forwarded to ``KNeighborsClassifier`` as ``n_neighbors``.
        """
        # 'l1' is scikit-learn's name for the Manhattan distance.
        self.knn = KNeighborsClassifier(metric='l1', n_neighbors=neighbours)

    @staticmethod
    def _features_and_labels(labeled_data, columns):
        """Return ``(flattened_features, labels)`` for ``labeled_data``.

        NOTE(review): ``flatten_data`` presumably reshapes every sample into a
        1-D feature vector for scikit-learn -- confirm in data_reshaper.
        """
        samples, labels = prepare_dataset(labeled_data, columns)
        return flatten_data(samples), labels

    def fit_data(self, labeled_data, columns):
        """Fit the wrapped estimator on ``labeled_data`` restricted to ``columns``."""
        features, labels = self._features_and_labels(labeled_data, columns)
        self.knn.fit(features, labels)

    def test_accuracy(self, labeled_data, columns):
        """Score the fitted estimator and print every misclassified sample.

        Prints the accuracy, then a ``[Predicted, Actual]`` header followed by
        one ``predicted, actual`` line per wrong prediction.

        Args:
            labeled_data: Labeled data to evaluate on.
            columns: Columns passed on to ``prepare_dataset``.

        Returns:
            The value returned by ``accuracy_score`` for the predictions.
        """
        features, expected = self._features_and_labels(labeled_data, columns)
        predicted = self.knn.predict(features)
        accuracy = accuracy_score(expected, predicted)
        print("Accuracy of KNN: ", accuracy)

        # Collect all mismatches before printing the header, so a label that
        # cannot be converted to DataLabel fails before any listing is shown.
        mismatches = []
        for guess, truth in zip(predicted, expected):
            if guess != truth:
                mismatches.append((DataLabel(guess), DataLabel(truth)))

        print("[Predicted, Actual]")
        for guessed_label, true_label in mismatches:
            print(guessed_label, true_label, sep=", ")
        return accuracy


In [43]:
# Train a 20-neighbour classifier on the training window.
knn = KNNClassifier(neighbours=20)
knn.fit_data(labeled_data=prepared_data, columns=columns)
# Kept as the cell's last expression so the returned accuracy is also shown
# as the cell result, in addition to the printed report.
knn.test_accuracy(labeled_data=test_data, columns=columns)

Accuracy of KNN:  0.8018867924528302
[Predicted, Actual]
NORMAL, RANDOM_ZEROS
NORMAL, RANDOM_ZEROS
NORMAL, RANDOM_ZEROS
NORMAL, RANDOM_ZEROS
NORMAL, NOISE
NORMAL, NOISE
NORMAL, NOISE
NORMAL, NOISE
NORMAL, NOISE
NORMAL, NOISE
NORMAL, NOISE
NORMAL, NOISE
NORMAL, NOISE
EXTINCTION, NORMAL
EXTINCTION, NORMAL
NORMAL, RANDOM_ZEROS
NORMAL, SCALED
EXTINCTION, NORMAL
ZEROS_IN_RANGE, EXTINCTION
NORMAL, SCALED
ZEROS_IN_RANGE, RANDOM_ZEROS


0.8018867924528302