In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

import py_scripts.db_fcns as db
import py_scripts.data_fcns as fcns

In [None]:
# Start and end stations of the route under study
# (Finnish names: alkuasema = start station, loppuasema = end station).
alkuasema, loppuasema = "JÄS", "OV"

# Daily departure dates to query. Alternative ranges kept for reference:
#   pd.date_range("2023-01-01", periods=2, freq="1D")
#   pd.date_range("2022-04-01", "2022-09-30", freq="1D")
dates = pd.date_range(start="2018-01-01", end="2023-06-15", freq="1D")

In [None]:
def get_timetables(alkuasema, loppuasema, dates):
    """Collect the train timetables between two stations for every given date.

    For each date, ``fcns.get_train_nums(alkuasema, loppuasema, <YYYY-MM-DD>)``
    is called; dates for which it returns ``None`` are skipped.

    Parameters
    ----------
    alkuasema : start station identifier, forwarded to ``fcns.get_train_nums``.
    loppuasema : end station identifier, forwarded to ``fcns.get_train_nums``.
    dates : iterable of date-like objects exposing ``.date()``
        (e.g. a ``pd.DatetimeIndex``).

    Returns
    -------
    pd.DataFrame
        All per-date tables stacked in date order, each with an added
        ``departureDate`` column holding the ISO date string. The original row
        labels of the per-date tables are kept. An empty DataFrame is returned
        when no date produced a table.
    """
    daily_tables = []
    for date in dates:
        day = str(date.date())
        new_table = fcns.get_train_nums(alkuasema, loppuasema, day)
        if new_table is None:
            continue
        # assign() returns a new frame, so the helper's result is not mutated.
        daily_tables.append(new_table.assign(departureDate=day))

    # Concatenate once: calling pd.concat inside the loop re-copies all rows
    # gathered so far on every iteration (quadratic in the number of dates).
    if not daily_tables:
        return pd.DataFrame()
    return pd.concat(daily_tables)

## Haetaan data

In [None]:
timetables = get_timetables(alkuasema, loppuasema, dates)

# One index entry per distinct (trainNumber, departureDate) pair in the timetables.
trains = timetables.groupby(["trainNumber", "departureDate"]).size().index
print(f"Junia {len(trains)} kpl")

# NOTE(review): sleeptime is presumably a pause between requests - confirm in py_scripts.data_fcns.
df = fcns.get_location_data_for_trains(
    trains,
    alkuasema,
    loppuasema,
    sleeptime=0.2,
)

In [None]:
# Distinct (trainNumber, departureDate) pairs that are actually present in df.
trains_in_df = df.groupby(["trainNumber", "departureDate"]).size().index
len(trains_in_df)

## Poistetaan mahdollisesti ongelmallinen data

In [None]:
# Step 1: drop trains whose location/speed samples are not spaced at most
# `max_time_difference` apart (30 seconds, per the original note).
max_time_difference = 30

# Accepted per-train frames are collected and concatenated once; calling
# pd.concat inside the loop re-copies every previously kept row each time.
kept_frames = []
for train_num, date in df.groupby(["trainNumber", "departureDate"]).count().index:
    t_df = fcns.get_locations_for_train(train_num, date, df)
    max_time_jump = t_df["duration"].diff(1).max()
    # A NaN jump (fewer than two samples) compares False, so such trains are
    # dropped as well.
    if max_time_jump <= max_time_difference:
        kept_frames.append(t_df)

cleaned_df = pd.concat(kept_frames) if kept_frames else pd.DataFrame()
df = cleaned_df.reset_index(drop=True)

distances = fcns.get_distances_from_df(df)

# Step 2: drop trains whose travelled distance deviates too much from the
# median. The accepted band reaches four times the median-to-2%-quantile gap
# below the median and four times the 98%-quantile-to-median gap above it.
dist_from_speed = distances["dist_from_speed"]
median_dist = dist_from_speed.median()
min_dist = median_dist - 4 * (median_dist - dist_from_speed.quantile(0.02))
max_dist = median_dist + 4 * (dist_from_speed.quantile(0.98) - median_dist)
is_outlier = (dist_from_speed < min_dist) | (dist_from_speed > max_dist)
outliers = distances[is_outlier].set_index(["trainNumber", "departureDate"]).index

# Remove exactly the rows that belong to each outlier train. Dropping by the
# labels the helper returns (instead of range(index.min(), index.max() + 1))
# does not rely on a train's rows occupying contiguous index labels and cannot
# remove rows of other trains by accident.
rows_to_drop = []
for train_num, date in outliers:
    rows_to_drop.extend(fcns.get_locations_for_train(train_num, date, df).index)

df = df.drop(index=rows_to_drop).reset_index(drop=True)

In [None]:
# Number of distinct (trainNumber, departureDate) pairs in df.
len(df.groupby(["trainNumber", "departureDate"]).size())

In [None]:
# Print a concise summary of df: columns, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Recompute the distance table from df.
distances = fcns.get_distances_from_df(df)

# .to_numpy() discards the index, so the durations are attached by position.
# NOTE(review): this assumes both helpers return rows in the same train order - confirm.
duration_values = fcns.get_durations_from_df(df).to_numpy()
distances["duration"] = duration_values

In [None]:
# Descriptive statistics of the distances table, including the duration column added above.
distances.describe()

## Tallennetaan data tietokantaan

In [None]:
# Persist the cleaned df through the project's database helper.
# NOTE(review): "clean_jamsa_orivesi" is presumably the target table name, and the
# meaning of to_extra=True is defined in py_scripts.db_fcns - confirm both there.
db.save_df_to_db(df, "clean_jamsa_orivesi", to_extra=True)