In [None]:
import os
from copy import deepcopy
from tqdm import tqdm
from datetime import date, datetime
import pandas as pd
import numpy as np

In [None]:
# Read every file in the data directory (except the excluded locations) into
# its own DataFrame, remembering which file each frame came from.
data_dir = "NMXLNT"
df_all = []
file_lst_used = []
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# df_all / file_lst_used (and therefore of the final CSV) reproducible.
file_lst = sorted(os.listdir(data_dir))
for file_name in file_lst:
    file_path = os.path.join(data_dir, file_name)
    if not os.path.isfile(file_path):
        # Sub-directories (e.g. checkpoint folders) cannot be parsed as CSV.
        continue
    if "VINH NIEM" in file_name or "HO TAY" in file_name:
        continue
    df = pd.read_csv(file_path)
    # Inspection of the raw files showed exact duplicate rows; drop them.
    df = df.drop_duplicates().reset_index(drop=True)
    df_all.append(df)
    file_lst_used.append(file_name)
    print(file_name)
    print(df.columns)

In [None]:
# Lags, in hours, for which `preprocessing` looks up an earlier row: 4 to 12
# hours in half-hour steps. Whole hours are written as ints on purpose: each
# value is interpolated into column names (e.g. "..._4hour", "..._prev_4.5"),
# so changing 4 to 4.0 would rename the generated columns.
lag_used = [4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10, 10.5, 11, 11.5, 12]

# Outlier check: flag a row whose "cod" value deviates too far from the mean of the readings just before it.
def find_prev_mean(row, df_copy, num_minutes, threshold=0.2):
    """Tell whether ``row`` is abnormal relative to the readings just before it.

    The reference value is the mean of ``df_copy["cod"]`` over the rows whose
    ``datetime`` lies in the half-open window
    ``[row["datetime"] - num_minutes, row["datetime"])``; the row's own
    timestamp is excluded.

    Parameters
    ----------
    row : mapping-like with "datetime" and "cod" entries
        The reading being checked (e.g. a row produced by ``DataFrame.apply``).
    df_copy : pandas.DataFrame
        Frame with "datetime" and "cod" columns providing the history.
    num_minutes : int or float
        Length of the look-back window in minutes.
    threshold : float, default 0.2
        Maximum tolerated relative deviation ``|cod / mean - 1|``.

    Returns
    -------
    bool
        True when the window is empty (mean is NaN), when the mean is 0, or
        when the relative deviation is strictly greater than ``threshold``.
    """
    window_end = row["datetime"]
    window_start = window_end - pd.Timedelta(minutes=num_minutes)
    in_window = (df_copy["datetime"] >= window_start) & (df_copy["datetime"] < window_end)
    past_mean = df_copy.loc[in_window, "cod"].mean()
    # Without a usable reference the reading cannot be validated: treat it as abnormal.
    if pd.isna(past_mean) or past_mean == 0:
        return True
    return bool(abs(row["cod"] / past_mean - 1) > threshold)

def _add_calendar_features(df):
    """Add minute/hour/day/month columns and their sine encodings, in place."""
    stamp = df["datetime"].dt
    for name, values, period in (
        ("minute", stamp.minute, 60),
        ("hour", stamp.hour, 24),
        ("day", stamp.day, 31),
        ("month", stamp.month, 12),
    ):
        df[name] = values
        df[f"sin_{name}"] = np.sin(df[name] / period * 2 * np.pi)
    return df


def _find_lagged_rows(timestamps, lags):
    """For each row and lag, find the latest earlier row that is ``lag`` to ``lag + 1`` hours old.

    ``timestamps`` must be in ascending order. Returns a dict mapping every lag
    to a list (one entry per row) holding the matching row position, or NaN
    when no row falls in the half-open window ``[lag, lag + 1)`` hours.
    """
    one_minute = pd.Timedelta(minutes=1)
    matches = {lag: [] for lag in lags}
    # One forward-only cursor per lag: since timestamps ascend, the first row
    # that is less than `lag` hours old can only move forward as `i` grows.
    cursor = {lag: 0 for lag in lags}
    for i in tqdm(range(len(timestamps))):
        now = timestamps[i]
        for lag in lags:
            lower, upper = 60 * lag, 60 * (lag + 1)
            pos = cursor[lag]
            # Advance to the first row that is STRICTLY LESS than `lag` hours old.
            while pos < i and (now - timestamps[pos]) / one_minute >= lower:
                pos += 1
            cursor[lag] = pos
            # The row just before it is the latest one at least `lag` hours
            # old; accept it only if it is also less than `lag + 1` hours old.
            candidate = pos - 1
            if candidate >= 0 and lower <= (now - timestamps[candidate]) / one_minute < upper:
                matches[lag].append(candidate)
            else:
                matches[lag].append(np.nan)
    return matches


def preprocessing(df, file_name, lags=None, outlier_window_minutes=30):
    """Build the lag-feature table for one input file.

    Steps:
      1. Tag rows with ``Location = file_name``, parse ``datetime`` and add
         calendar features (minute/hour/day/month plus sine encodings).
      2. Keep only rows with a non-missing, strictly positive ``cod``.
      3. Create ``flow_in`` as ``flow_in1 + flow_in2`` when it is absent.
      4. For every lag, attach the columns of the earlier row that is between
         ``lag`` and ``lag + 1`` hours old (suffix ``_prev_{lag}``); rows
         lacking such a match for ANY lag are dropped by the inner joins.
      5. Drop rows that ``find_prev_mean`` flags as abnormal.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw readings with at least ``datetime`` and ``cod`` columns, plus either
        ``flow_in`` or both ``flow_in1`` and ``flow_in2``. It is not modified.
    file_name : str
        Value stored in the ``Location`` column.
    lags : sequence of numbers, optional
        Lags in hours. Defaults to the module-level ``lag_used``.
    outlier_window_minutes : int or float, default 30
        Look-back window handed to ``find_prev_mean``.

    Returns
    -------
    pandas.DataFrame
        The processed frame with a fresh 0..n-1 index.
    """
    if lags is None:
        lags = lag_used

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df["Location"] = file_name
    df["datetime"] = pd.to_datetime(df["datetime"])
    #df["dayofweek"] = (df["datetime"].dt.dayofweek)
    df = _add_calendar_features(df)

    # assumption of cod > 0 (missing values are dropped as well)
    df = df[df["cod"].notna() & (df["cod"] > 0)].reset_index(drop=True)
    if "flow_in" not in df.columns:
        df["flow_in"] = df["flow_in1"] + df["flow_in2"]

    # The lag search below relies on ascending timestamps; a stable sort makes
    # that hold without reordering rows that share a timestamp.
    df = df.sort_values("datetime", kind="mergesort").reset_index(drop=True)

    # Snapshot taken BEFORE the lag columns are added; its "index" column holds
    # the row position and is the right-hand key of every join below.
    lookup = df.reset_index()

    lag_rows = _find_lagged_rows(df["datetime"].tolist(), lags)
    for lag in lags:
        # Force float64 so that a column with no match at all is not stored as
        # object dtype, which is not a reliable merge key against the integer index.
        df[f"last_row_with_time_within_{lag}hour"] = np.asarray(lag_rows[lag], dtype="float64")

    # The joins happen in a second pass because every inner join removes rows,
    # while the stored row positions refer to the full, unjoined frame.
    for lag in lags:
        df = df.merge(
            lookup,
            how="inner",
            left_on=f"last_row_with_time_within_{lag}hour",
            right_on="index",
            suffixes=("", f"_prev_{lag}"),
        )

    if df.empty:
        # Row-wise apply on an empty frame does not yield a boolean Series,
        # so create the (empty) flag column explicitly.
        df["abnormal"] = pd.Series(False, index=df.index, dtype=bool)
    else:
        # The history is the joined frame itself; it is only read during apply.
        df["abnormal"] = df.apply(
            lambda row: find_prev_mean(row, df, outlier_window_minutes), axis=1
        )
    return df[~df["abnormal"]].reset_index(drop=True)

In [None]:
# Preprocess each loaded frame together with the file it came from, stack the
# results into one table and persist it.
processed_frames = [
    preprocessing(frame, source_name)
    for frame, source_name in zip(df_all, file_lst_used)
]
df_all = pd.concat(processed_frames, ignore_index=True)
df_all.to_csv("NMXLNT_df.csv", index=False)