## ML 

In [1]:
import os
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras import datasets, models, layers, utils, activations, losses, optimizers, metrics # DNN 系列
from tensorflow.keras.layers import Dense, LSTM,Bidirectional,SimpleRNN,GRU,Activation # RNN系列
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
import datetime
from IPython.display import clear_output
from concurrent.futures import ThreadPoolExecutor

#### 收集&整理數據路徑
##### 期望得到: 
* target_sno: 提交csv所需sno
* date_dictionary: 包含train~test範圍內的所有日期 (date)，以及各日期是否有取得資料
* sno_dictionary: 包含sno與其資料的對應路徑

In [2]:
## Determine the station numbers (sno) that have to be predicted
def collect_target_sno(_path):
    '''
    Collect the unique station numbers referenced by a sample-submission CSV.

    Every value of the CSV's ``id`` column is split on '_' and its second
    field is taken as the station number.

    > input:
        _path # path of the sample-submission CSV file

    > output
        target_sno # sorted list of the unique station numbers (strings)

    '''
    sample_csv = pd.read_csv(_path)
    # Sort the de-duplicated values: iterating a set of strings yields an order
    # that can change between interpreter runs, which made the result unreproducible.
    target_sno = sorted({each.split('_')[1] for each in sample_csv['id']})
    print('finish target_sno: ', len(target_sno)) # expected count
    return target_sno

## Collect & organize the raw data files
def collect_organize_data(release_path, date_train):
    '''
    Walk the release folder and index the raw JSON files by date and by station.

    input:
    release_path # root data folder; a sub-folder directly below it is treated as a date
    date_train # date strings that are expected to have data

    output:
    date_dictionary # {date: bool} for every entry of date_train; True when a
                    # sub-folder with that name exists directly under release_path
    sno_dictionary # {sno: [file paths]} for every '.json' file found, grouped by
                   # the file name without its extension (the station number)
    '''
    date_train = list(date_train)
    expected_dates = set(date_train)  # O(1) membership tests instead of list scans
    dates_found = set()
    sno_dictionary = {}  # insertion-ordered: stations appear in first-seen order

    for root, dirs, files in os.walk(release_path):
        # relpath gives the folder name relative to the release root whether or not
        # release_path ends with a separator (plain string replacement did not).
        relative = os.path.relpath(root, release_path)
        if relative != os.curdir and relative in expected_dates:
            dates_found.add(relative)
        for file in files:
            if file.endswith('.json'):
                sno = file[:-len('.json')]  # station number = file name without extension
                sno_dictionary.setdefault(sno, []).append(os.path.join(root, file))

    # Flag every expected date with whether its folder was found.
    date_dictionary = {each_date: each_date in dates_found for each_date in date_train}

    print('Complete collection & organization of data', flush=True)
    return date_dictionary, sno_dictionary

In [4]:
# Scratch calculation: the quotient is only displayed, it is never assigned.
5264 / 4

1316.0

In [3]:
# Input locations (machine-specific absolute path)
project_path = 'D:/在職進修/修課/機器學習/Final_project/'
release_path = f'{project_path}html.2023.final.data-release/release/'
sample_csv_list = [f'sample_submission_stage{stage}' for stage in range(1, 4)]

# Station numbers listed in the first sample-submission file
target_sno = collect_target_sno(f'{project_path}{sample_csv_list[0]}.csv')

# Candidate dates written as YYYYMMDD integers:
# 2023-10-02..10-31, 2023-11-01..11-30 and 2023-12-01..12-25
date_ranges = [range(20231002, 20231032), range(20231101, 20231131), range(20231201, 20231226)]
date_train = [str(day) for span in date_ranges for day in span]
date_dictionary, sno_dictionary = collect_organize_data(release_path, date_train)

finish target_sno:  112
Complete collection & organization of data


#### 讀取 & 清理數據-缺失值處理

In [4]:
def replace_negative_ones(df, columns):
    """Treat -1 as a missing value in ``columns`` and fill it from neighbouring rows.

    Each listed column has its -1 entries replaced by NaN, which are then
    forward-filled and finally backward-filled (so leading gaps take the first
    later value). A column that contains only -1 ends up entirely NaN.

    Args:
        df: DataFrame to clean; modified in place.
        columns: names of the columns to clean.

    Returns:
        None.
    """
    for col in columns:
        # Series.ffill()/bfill() are the supported spelling; fillna(method=...)
        # is deprecated in pandas 2.x.
        df[col] = df[col].replace(-1, np.nan).ffill().bfill()

def day_of_week(year_month_day):
    """Return the weekday name (``strftime("%A")``) for a ``YYYYMMDD`` date string."""
    pieces = (year_month_day[:4], year_month_day[4:6], year_month_day[6:])
    calendar_day = datetime.date(*(int(piece) for piece in pieces))
    return calendar_day.strftime("%A")

def time_feature(date, timestamp_each, holidays=("20231010", "20231225")):
    """Derive the calendar/time features of one sample.

    Args:
        date: date string in ``YYYYMMDD`` form.
        timestamp_each: time of day as ``"HH:MM"``.
        holidays: ``YYYYMMDD`` strings that count as holidays even when they
            fall on a weekday.

    Returns:
        Tuple ``(hour, minute, minute_accumulation, day_code, holiday)`` where
        ``minute_accumulation`` is the number of minutes since midnight,
        ``day_code`` runs from 1 (Monday) to 7 (Sunday) and ``holiday`` is 1 on
        Saturday, Sunday or a listed holiday and -1 otherwise.

    Raises:
        ValueError: if ``date`` or ``timestamp_each`` cannot be parsed.
    """
    hour, minute = map(int, timestamp_each.split(':'))
    minute_accumulation = hour * 60 + minute
    # isoweekday() already yields Monday=1 ... Sunday=7, so no lookup of
    # locale-dependent weekday names is needed.
    day_code = datetime.date(int(date[:4]), int(date[4:6]), int(date[6:])).isoweekday()
    holiday = 1 if day_code > 5 or date in holidays else -1
    return hour, minute, minute_accumulation, day_code, holiday

def process_file(sno_name, file_list, save_path, station_line=500101001):
    """Flatten one station's raw JSON files into a single feature table saved as JSON.

    Every file in ``file_list`` is read as a JSON object keyed by ``"HH:MM"``
    timestamps. One row is produced per timestamp with the calendar/time
    features from ``time_feature`` plus the ``tot``/``act``/``sbi`` readings.
    A reading missing from an entry takes the previous entry's value within the
    same file, or -1 if there is none yet; remaining -1 values are then filled
    by ``replace_negative_ones``.

    Args:
        sno_name: station identifier; only used to name the output file.
        file_list: paths shaped ``.../<date>/<sno>.json`` ('/' or '\\' separators).
        save_path: folder that receives ``<sno_name>.json``.
        station_line: divisor applied to the numeric station number.

    Returns:
        None. The table is written to ``<save_path>/<sno_name>.json``.
    """
    feature_names = ('date', 'sno', 'hour', 'minute', 'minute_accumulation', 'day_code', 'holiday')
    reading_names = ('tot_acc', 'act_acc', 'sbi_acc')
    columns = {name: [] for name in feature_names + reading_names}
    index_time = []

    for file_path in file_list:
        # Accept either separator style: the date is the parent folder name and
        # the station number is the file name up to its first dot.
        path_parts = file_path.replace('\\', '/').split('/')
        date = path_parts[-2]
        sno = path_parts[-1].split('.')[0]
        station_code = int(sno) / station_line

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # -1 marks "no earlier reading in this file"; it is cleaned up after the loop.
        pre_tot = pre_act = pre_sbi = -1
        for timestamp_each, reading in data.items():
            hour, minute, minute_accumulation, day_code, holiday = time_feature(str(date), timestamp_each)

            columns['date'].append(date)
            columns['sno'].append(station_code)
            columns['hour'].append(hour)
            columns['minute'].append(minute)
            columns['minute_accumulation'].append(minute_accumulation)
            columns['day_code'].append(day_code)
            columns['holiday'].append(holiday)
            index_time.append(f"Time:{date}{minute_accumulation}")

            # Carry the previous reading forward when a key is absent.
            tot = reading.get('tot', pre_tot)
            act = reading.get('act', pre_act)
            sbi = reading.get('sbi', pre_sbi)
            pre_tot, pre_act, pre_sbi = tot, act, sbi

            columns['tot_acc'].append(tot)
            columns['act_acc'].append(act)
            columns['sbi_acc'].append(sbi)

    train_feature = pd.DataFrame(columns)
    replace_negative_ones(train_feature, list(reading_names))
    train_feature.index = index_time
    train_feature.to_json(os.path.join(save_path, f'{sno_name}.json'))

def process_raw_data_into_minute_by_minute_features(sno_dictionary, save_path, station_line=500101001):
    """Run ``process_file`` for every station that has no output file yet.

    Stations whose ``<save_path>/<sno_name>.json`` already exists are skipped,
    so the call can be repeated to resume an interrupted run. The work is
    spread over a pool of 5 threads.

    Args:
        sno_dictionary: mapping of station name to its list of raw file paths.
        save_path: output folder; created when it does not exist.
        station_line: forwarded unchanged to ``process_file``.

    Returns:
        None.

    Raises:
        RuntimeError: after all tasks have finished, if any station failed.
            The first failure is chained as the cause.
    """
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    futures = {}
    with ThreadPoolExecutor(max_workers=5) as executor:
        for sno_name, file_list in sno_dictionary.items():
            if not os.path.exists(os.path.join(save_path, f'{sno_name}.json')):
                futures[sno_name] = executor.submit(process_file, sno_name, file_list, save_path, station_line)

    # The pool has shut down here, so every future is done. Inspect them:
    # a discarded future would otherwise hide any exception raised in a worker.
    failures = {}
    for sno_name, future in futures.items():
        error = future.exception()
        if error is not None:
            failures[sno_name] = error

    if failures:
        failed_names = ', '.join(str(name) for name in failures)
        first_error = next(iter(failures.values()))
        raise RuntimeError(f'{len(failures)} station(s) failed to process: {failed_names}') from first_error


In [5]:
# Folder used for the per-station feature JSON files (phase 1)
save_path_phase1 = f'{project_path}data_science_phase1_data/'

In [8]:
# Write one feature JSON per station into the phase-1 folder
process_raw_data_into_minute_by_minute_features(
    sno_dictionary,
    save_path_phase1,
    station_line=500101001,
)

#### 特徵工程

###### feature1_day
* feature
    * Station characteristics * 1
    * Station upper bound * 1
    * date characteristics * 2 (星期、放假)
    * time characteristics * 3 (小時、分鐘、累積)
    * sbi ratio (sbi / tot) over 1 day, sampled every 20 minutes → 72 time steps * 1
* --> input 72 * 8
* --> pred 72 * 1

In [1]:
# Scratch calculation (24 * 60 = 1440, presumably minutes per day); displayed only, never assigned.
24*60

1440

In [46]:
# Folder used for the phase-2 feature arrays (.npy files)
save_path_phase2 = f'{project_path}data_science_phase2_feature/feature1_day/'

In [51]:
# Displays the current value of `sno`.
# NOTE(review): in the visible cells `sno` is only assigned at top level by the
# feature-building loop, so this depends on leftover kernel state — confirm or remove.
sno

'500106024'

In [64]:
# Build (or reload from the per-station cache) the day -> next-day samples of
# every station, then concatenate them and save the combined arrays.

# Columns stacked along the feature axis of every sample, in this order.
feature_columns = ['sno', 'tot_acc', 'normalize_day_code', 'holiday', 'normalize_hour',
                   'normalize_minute', 'normalize_minute_accumulation', 'normalize_sbi']
# Per-station cache files are named <sno>_<split>.npy.
split_names = ('X_train', 'y_train', 'X_test', 'y_test')


def create_date_pairs(date_dict, date_range):
    """Map each date to its successor within ``date_range`` when both are flagged True.

    ``date_dict`` is {date: bool} in chronological insertion order and
    ``date_range`` is a slice applied to its list of keys.
    """
    dates = list(date_dict.keys())[date_range]
    return {
        current: following
        for current, following in zip(dates, dates[1:])
        if date_dict[current] and date_dict[following]
    }


def create_date_sets(X_date, step=20):
    """Add normalized columns to each per-day frame and split it into ``step`` interleaved series.

    For every frame, series ``k`` holds rows k, k + step, k + 2 * step, ...
    Returns one list of ``step`` frames per input frame.
    """
    date_sets_all = []
    for X_date_each in X_date:
        now_focus_df = X_date_each.copy()
        now_focus_df['normalize_day_code'] = now_focus_df['day_code'] / 7
        now_focus_df['normalize_hour'] = now_focus_df['hour'] / 23
        now_focus_df['normalize_minute'] = now_focus_df['minute'] / 59
        now_focus_df['normalize_minute_accumulation'] = now_focus_df['minute_accumulation'] / 1439
        # NOTE(review): a zero tot_acc yields inf/NaN here — confirm it cannot occur.
        now_focus_df['normalize_sbi'] = now_focus_df['sbi_acc'] / now_focus_df['tot_acc']
        date_sets_all.append([now_focus_df.iloc[each_start::step] for each_start in range(step)])
    return date_sets_all


def _frames_for_dates(features_df, dates):
    """Return one sub-frame of ``features_df`` per date, in the order given."""
    return [features_df[features_df['date'] == each_date] for each_date in dates]


def _to_sample_array(date_sets_all):
    """Stack every series into an array of shape (samples, time steps, features).

    Raises ValueError when the series do not all have the same length (or when
    there are none), because no regular 3-D array can be built in that case.
    """
    stacked = np.array([[df[col].values for col in feature_columns]
                        for dataset in date_sets_all for df in dataset])
    return np.transpose(stacked, (0, 2, 1))


# The date pairs depend only on date_dictionary, so compute them once.
train_date = create_date_pairs(date_dictionary, slice(None, -8))
test_date = create_date_pairs(date_dictionary, slice(-9, -1))
target_sno_lookup = set(target_sno)

X_train_all, y_train_all = [], []
X_test_all, y_test_all = [], []
X_test_all_target_sno, y_test_all_target_sno = [], []
non_all_date_sno = []  # stations skipped because their days are not all the same length

# NOTE(review): the last 10 stations are left out; the reason is not visible here.
for sno in tqdm(list(sno_dictionary.keys())[:-10]):
    split_paths = [save_path_phase2 + sno + '_' + name + '.npy' for name in split_names]

    # Reuse the cached arrays when all four can be read; otherwise rebuild them.
    try:
        station_arrays = [np.load(path) for path in split_paths]
    except (OSError, ValueError, EOFError):
        station_arrays = None

    if station_arrays is None:
        load_path = save_path_phase1 + sno + '.json'
        with open(load_path, 'r') as f:
            data = json.load(f)
        sorted_features_df = pd.DataFrame(data)

        # X uses the first date of each pair, y the following date.
        date_sets_per_split = [
            create_date_sets(_frames_for_dates(sorted_features_df, train_date.keys())),
            create_date_sets(_frames_for_dates(sorted_features_df, train_date.values())),
            create_date_sets(_frames_for_dates(sorted_features_df, test_date.keys())),
            create_date_sets(_frames_for_dates(sorted_features_df, test_date.values())),
        ]
        try:
            # Array creation is inside the guard as well: recent NumPy raises on
            # ragged input in np.array itself, before the transpose is reached.
            station_arrays = [_to_sample_array(date_sets) for date_sets in date_sets_per_split]
        except ValueError:
            non_all_date_sno.append(sno)
            continue

        for path, samples in zip(split_paths, station_arrays):
            np.save(path, samples)

    X_train, y_train, X_test, y_test = station_arrays
    X_train_all.extend(X_train)
    y_train_all.extend(y_train)
    X_test_all.extend(X_test)
    y_test_all.extend(y_test)
    if sno in target_sno_lookup: # only the stations of interest form the target testing set
        X_test_all_target_sno.extend(X_test)
        y_test_all_target_sno.extend(y_test)

np.save(save_path_phase2+'X_train.npy', np.array(X_train_all))
np.save(save_path_phase2+'y_train.npy', np.array(y_train_all))
np.save(save_path_phase2+'X_test.npy', np.array(X_test_all))
np.save(save_path_phase2+'y_test.npy', np.array(y_test_all))
np.save(save_path_phase2+'X_test_target_sno.npy', np.array(X_test_all_target_sno))
np.save(save_path_phase2+'y_test_target_sno.npy', np.array(y_test_all_target_sno))

100%|████████████████████████████████████████████████████████████████████████████| 1317/1317 [1:44:43<00:00,  4.77s/it]


In [203]:
# NOTE(review): these '_'-prefixed files are not written by the visible
# feature-building cell, which saves 'X_train.npy', 'X_test_target_sno.npy', ...
# without the underscore. Presumably another cell produces them — confirm.
X_train = np.load(f'{save_path_phase2}_X_train.npy')
y_train = np.load(f'{save_path_phase2}_y_train.npy')
X_test = np.load(f'{save_path_phase2}_X_test_target_sno.npy')
y_test = np.load(f'{save_path_phase2}_y_test_target_sno.npy')

In [204]:
# Shapes of the loaded train/test arrays, in the order X_train, y_train, X_test, y_test
tuple(loaded.shape for loaded in (X_train, y_train, X_test, y_test))

((1764780, 72), (1764780, 72), (15680, 72), (15680, 72))