# 전체 데이터 통합 및 로드

In [15]:
import os
import json
import pandas as pd
from datetime import datetime
import import_ipynb
import utility

In [16]:
# NOTE: get_df_from_txt_files (defined below) extracts only the log records from every txt file in dir_path and returns them combined into one DataFrame
def add_name_column(df):
    """Label every row with the user name of the login session it falls in.

    Every row whose ``action`` is ``"Login"`` starts a session. The user name
    is taken from the ``userName`` entry of that row's ``option`` value
    (parsed with ``option_to_json``) and written to a new ``name`` column on
    all rows whose ``time`` lies between that login and the next one.

    Rows whose ``time`` is earlier than every login belong to no session and
    are left out of the result.

    Args:
        df: Log DataFrame with ``action``, ``time`` and ``option`` columns.
            Assumes the login rows appear in chronological order and that
            ``time`` values compare chronologically -- TODO confirm for
            inputs other than the fixed-width timestamp strings.

    Returns:
        A new DataFrame with the additional ``name`` column. Sessions are
        concatenated from the last login to the first; within a session the
        rows keep their original order. If ``df`` contains no login row, an
        empty frame with the original columns plus ``name`` is returned.
    """
    login_logs = df[df["action"] == "Login"]
    if login_logs.empty:
        # Without a login there is no session to attach rows to, and
        # pd.concat([]) would raise ValueError further down.
        unlabeled = df.iloc[0:0].copy()
        unlabeled["name"] = pd.Series(dtype="object")
        return unlabeled

    login_times = list(login_logs["time"])
    user_names = [option_to_json(option)["userName"] for option in login_logs["option"]]

    sessions = []
    remaining = df
    # Walk the logins from last to first: everything at or after a login is
    # that user's session; what is left over is handed to the previous login.
    for login_time, user_name in zip(reversed(login_times), reversed(user_names)):
        session = remaining[login_time <= remaining["time"]].copy()
        remaining = remaining[remaining["time"] < login_time]
        session["name"] = user_name
        sessions.append(session)

    return pd.concat(sessions)

def get_df_from_txt_files(dir_path):
    """Collect the ``"log"`` records of every file directly inside ``dir_path``.

    Each file is read as UTF-8 text with one JSON document per line. A line
    contributes a row only when its parsed value contains a ``"log"`` key;
    the value stored under that key becomes the row. Other lines (the
    pretest records) are ignored.

    Args:
        dir_path: Directory holding the raw log files.

    Returns:
        A DataFrame with one row per ``"log"`` record, in file-name order and
        then line order. It has no rows and no columns when nothing matched.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_list = []
    # Sorted so the row order does not depend on the file system's listing order.
    for file_name in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be opened as log files.
            continue
        with open(file_path, "r", encoding="UTF-8") as f:
            for line in f:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not JSON.
                    continue
                record = json.loads(line)
                # Keep regular log records only; pretest records are skipped.
                if "log" in record:
                    data_list.append(record["log"])
    return pd.DataFrame(data_list)

# Merge all data under root into a single df and return it
# 1) For each directory inside root_path, build a df with a "dir" column holding the directory name
# 2) Concat all of those dfs and return the result
def get_df_from_data(root_path):
    """Load every sub-directory of ``root_path`` and merge the logs into one DataFrame.

    For each sub-directory the log rows are loaded with
    ``get_df_from_txt_files``, tagged with the directory name in a ``dir``
    column and labelled with user names by ``add_name_column``.

    Args:
        root_path: Directory whose sub-directories each hold raw log files.

    Returns:
        The concatenation of the per-directory DataFrames, in directory-name
        order.

    Raises:
        ValueError: From ``pd.concat`` when no sub-directory produced any rows.
    """
    df_list = []
    # Sorted so the result does not depend on the file system's listing order.
    for dir_name in sorted(os.listdir(root_path)):
        dir_path = os.path.join(root_path, dir_name)
        if not os.path.isdir(dir_path):
            # Stray files next to the data directories are not log folders.
            continue
        print("Loading From Direcoty \"%s\""%(dir_name))
        df = get_df_from_txt_files(dir_path)
        if df.empty:
            # A directory without log rows yields a frame without columns,
            # so there is no "action" column for add_name_column to filter on.
            continue
        df["dir"] = dir_name
        df = add_name_column(df)
        df_list.append(df)

    return pd.concat(df_list)

# Return a df containing the entire dataset
# Provides caching: the processed result is saved to disk and reused on later calls
def get_entire_df():
    """Return the complete log DataFrame, using a CSV file as a cache.

    If ``data/processed/total_df.csv`` exists it is read (cp949 encoding,
    first column as index) and returned as is. Otherwise the raw data under
    ``data/raw`` is loaded, reduced to the relevant columns, sorted, written
    to that CSV file and returned.

    Returns:
        A DataFrame with the columns ``time``, ``where``, ``action``,
        ``option``, ``teacher`` and ``student`` when freshly built; on a
        cache hit, whatever columns the saved file contains.
    """
    raw_dir_path = "data/raw"
    processed_file_path = "data/processed/total_df.csv"

    print("[1/4] Checking cache")
    # Cache hit: return the saved file without touching the raw data.
    if os.path.isfile(processed_file_path):
        print("Loading saved file")
        return pd.read_csv(processed_file_path, encoding="cp949", index_col=0)

    # Cache miss: build, tidy up, save, then return.
    print("[2/4] Loading...")
    df = get_df_from_data(raw_dir_path)

    print("[3/4] Pre processing...")
    df = df[["dir", "name", "time", "where", "action", "option"]]
    df = df.sort_values(["dir", "name", "time"])
    # Add the renamed copies first and drop the originals afterwards so that
    # "teacher" and "student" end up as the last two columns.
    df["teacher"], df["student"] = df["dir"], df["name"]
    del df["dir"], df["name"]

    print("[4/4] Saving...")
    # Create the output directory so the save cannot fail after the long load.
    os.makedirs(os.path.dirname(processed_file_path), exist_ok=True)
    df.to_csv(processed_file_path, encoding="cp949")
    return df

In [14]:
# entire_df = get_entire_df()
# entire_df

[1/4] Checking cache
Loading saved file


Unnamed: 0,time,where,action,option,teacher,student
0,2022-11-29-20-05-15-524,AccountMenu,Login,"{userId:0, userName:Test, fromPitch:C3, toPitc...",선우-이승희,Test
1,2022-11-29-20-05-43-321,PuppetfaceExplore,EnterScene,{},선우-이승희,Test
2,2022-11-29-20-05-48-334,PuppetfaceExplore,MakeSound,"{pitch:REST, volume:75.77242}",선우-이승희,Test
3,2022-11-29-20-05-48-336,PuppetfaceExplore,MakeSound,"{pitch:REST, volume:75.77242}",선우-이승희,Test
4,2022-11-29-20-05-48-337,PuppetfaceExplore,MakeSound,"{pitch:REST, volume:75.77242}",선우-이승희,Test
...,...,...,...,...,...,...
868992,2023-01-03-13-15-03-006,Test,Microphone,"{pitch:F2, volume:67.22172}",충주성심학교(초4),정다찬
868993,2023-01-03-13-15-03-010,Test,ClearNote,{},충주성심학교(초4),정다찬
868994,2023-01-03-13-15-03-057,Test,ExitScene,{},충주성심학교(초4),정다찬
868995,2023-01-03-13-15-09-179,MainMenu,Logout,"{userId:5, userName:정다찬}",충주성심학교(초4),정다찬
