# Test모드 로그 처리 (추출, 정리, 저장)

In [11]:
import import_ipynb
from entire_logs import *
from utility import *
import pandas as pd
import os

# Column(try, note_idx) 추가

In [20]:
# Keep only the rows that were logged in Test mode.
def extract_test_df(df):
    """Return the rows of ``df`` whose ``where`` column equals ``"Test"``."""
    is_test_row = df["where"] == "Test"
    return df[is_test_row]

# Split the frame into one sub-frame per teacher.
def divide_per_teacher(df):
    """Return a list holding one sub-frame of ``df`` per distinct ``teacher``.

    The distinct values are collected in a ``set``, so the order of the
    returned list follows set iteration order and is not guaranteed.
    """
    teachers = set(df["teacher"])
    return [df[df["teacher"] == name] for name in teachers]

# Split every frame of df_list into one sub-frame per student.
def divide_per_student(df_list):
    """Return a flat list with one sub-frame per (input frame, ``student``) pair.

    Frames are processed in the order given; within one frame the students
    come out in set iteration order, which is not guaranteed.
    """
    per_student = []
    for frame in df_list:
        names = set(frame["student"])
        per_student.extend(frame[frame["student"] == name] for name in names)
    return per_student

# Add a "try" column to a single frame.
def marking_try_num(df):
    """Label every row of ``df`` with the attempt ("try") it belongs to.

    The rows are sorted by ``time`` and cut after each ``ExitScene`` row; the
    i-th segment (counting from 1) gets ``try = i``.  Inside a segment, rows
    that come before the last ``EnterScene`` row are treated as an incomplete
    attempt and get ``try = -i`` instead.  Rows later than the last
    ``ExitScene`` belong to no segment and are dropped.

    Args:
        df: frame with at least the columns ``time`` and ``action``.

    Returns:
        A new frame with the added ``try`` column.  For each segment the
        complete rows come first, followed by the incomplete ones (if any).

    Raises:
        ValueError: if ``df`` contains no ``ExitScene`` row.
    """
    df = df.sort_values(by=["time"])
    exit_times = df[df["action"] == "ExitScene"]["time"]
    if exit_times.empty:
        # pd.concat([]) would raise the same exception type with a less
        # helpful message ("No objects to concatenate").
        raise ValueError("no ExitScene row found; cannot number the tries")

    pieces = []
    for i, exit_time in enumerate(exit_times):
        # Copy explicitly: the column assignment below must not write into a
        # slice of the parent frame (SettingWithCopyWarning).
        sub_df = df[df["time"] <= exit_time].copy()
        df = df[exit_time < df["time"]]
        sub_df["try"] = i + 1

        enter_rows = sub_df[sub_df["action"] == "EnterScene"]
        if len(enter_rows) == 0:
            pieces.append(sub_df)
            continue

        # The attempt proper starts at the last EnterScene of the segment.
        start_time = enter_rows.iloc[-1]["time"]
        incomplete = sub_df[sub_df["time"] < start_time].copy()
        complete = sub_df[start_time <= sub_df["time"]]
        pieces.append(complete)

        if len(incomplete) > 0:
            incomplete["try"] = -(i + 1)
            pieces.append(incomplete)

    return pd.concat(pieces)

# Add a "try" column to every frame of df_list.
def mark_try(df_list):
    """Apply ``marking_try_num`` to each frame and collect the results.

    This is best effort: a frame for which ``marking_try_num`` raises (for
    example because it has no ``ExitScene`` row, which makes ``pd.concat``
    fail on an empty list) is skipped without any report.

    Args:
        df_list: iterable of frames.

    Returns:
        List of the successfully marked frames, in input order.
    """
    marked = []
    for df in df_list:
        try:
            marked_df = marking_try_num(df)
        except Exception:
            # Skip frames that cannot be numbered.  ``Exception`` rather than
            # a bare ``except`` so KeyboardInterrupt/SystemExit still stop
            # the loop.
            continue
        marked.append(marked_df)
    return marked

# Split every frame of df_list into one sub-frame per try number.
def divide_per_try(df_list):
    """Return a flat list with one sub-frame per (input frame, ``try``) pair.

    Frames are processed in the order given; the try values of one frame are
    visited in set iteration order.
    """
    return [
        frame[frame["try"] == try_num]
        for frame in df_list
        for try_num in set(frame["try"])
    ]

# Add a "note_idx" column to a single frame.
def marking_note_num(df):
    """Label every row of ``df`` with the index of the note it belongs to.

    The rows are sorted by ``time`` and cut after each ``ClearNote`` row; the
    i-th segment (counting from 1) gets ``note_idx = i``.  Rows later than
    the last ``ClearNote`` belong to no segment and are dropped.

    Args:
        df: frame with at least the columns ``time`` and ``action``.

    Returns:
        A new frame, sorted by ``time``, with the added ``note_idx`` column.

    Raises:
        ValueError: if ``df`` contains no ``ClearNote`` row.
    """
    df = df.sort_values(by=["time"])
    split_times = df[df["action"] == "ClearNote"]["time"]
    if split_times.empty:
        # pd.concat([]) would raise the same exception type with a less
        # helpful message ("No objects to concatenate").
        raise ValueError("no ClearNote row found; cannot number the notes")

    pieces = []
    for i, split_time in enumerate(split_times):
        # Copy explicitly: the column assignment below must not write into a
        # slice of the parent frame (SettingWithCopyWarning).
        sub_df = df[df["time"] <= split_time].copy()
        df = df[split_time < df["time"]]
        sub_df["note_idx"] = i + 1
        pieces.append(sub_df)

    return pd.concat(pieces)

def mark_note(df_list):
    """Apply ``marking_note_num`` to each frame and collect the results.

    This is best effort: a frame for which ``marking_note_num`` raises (for
    example because it has no ``ClearNote`` row, which makes ``pd.concat``
    fail on an empty list) is skipped without any report.

    Args:
        df_list: iterable of frames.

    Returns:
        List of the successfully marked frames, in input order.
    """
    marked = []
    for df in df_list:
        try:
            marked_df = marking_note_num(df)
        except Exception:
            # Skip frames that cannot be numbered.  ``Exception`` rather than
            # a bare ``except`` so KeyboardInterrupt/SystemExit still stop
            # the loop.
            continue
        marked.append(marked_df)
    return marked

def extract_and_process_test_df(df):
    """Extract the Test-mode rows of ``df`` and add ``try`` / ``note_idx``.

    Steps: keep the Test rows, split them per teacher and then per student,
    number the tries of each piece, split per try, number the notes of each
    try, and concatenate everything back into one frame.
    """
    test_rows = extract_test_df(df)
    per_student = divide_per_student(divide_per_teacher(test_rows))
    per_try = divide_per_try(mark_try(per_student))
    per_note = mark_note(per_try)
    return pd.concat(per_note)


# Column(answer pitch, pitch, volume) 추가

In [17]:
def divide_df_per_column(df, column):
    """Split a frame, or every frame of a list, by the values of ``column``.

    Args:
        df: a single DataFrame or a list (including list subclasses) of
            DataFrames.
        column: name of the column whose distinct values define the pieces.

    Returns:
        A flat list of copies, one per (frame, distinct value) pair.  The
        values of one frame are visited in set iteration order.
    """
    # isinstance rather than ``type(df) == list`` so list subclasses are
    # handled as lists too instead of being indexed like a DataFrame.
    if isinstance(df, list):
        pieces = []
        for element_df in df:
            pieces.extend(divide_df_per_column(element_df, column))
        return pieces

    return [df[df[column] == value].copy() for value in set(df[column])]

def add_answer_pitch(df):
    """Add an ``answer_pitch`` column to a frame or to every frame of a list.

    The value is the ``"pitch"`` entry of ``option_to_json(option)`` for the
    first ``StartNote`` row of the frame; frames without a ``StartNote`` row
    get the placeholder ``'x'``.
    NOTE(review): ``option_to_json`` is defined outside this section;
    presumably it parses the ``option`` string into a dict - confirm.

    Args:
        df: a single DataFrame or a list (including list subclasses) of
            DataFrames.  Each frame is modified in place.

    Returns:
        The same frame object with the new column, or a list of them.
    """
    # isinstance rather than ``type(df) == list`` so list subclasses are
    # handled as lists too instead of being indexed like a DataFrame.
    if isinstance(df, list):
        return [add_answer_pitch(element_df) for element_df in df]

    start_rows = df[df["action"] == "StartNote"]
    if len(start_rows) > 0:
        option = start_rows.iloc[0]["option"]
        answer_pitch = option_to_json(option)["pitch"]
    else:
        answer_pitch = 'x'
    df["answer_pitch"] = answer_pitch
    return df

def option_to_column(df, column):
    """Return a copy of ``df`` with ``column`` filled from the ``option`` column.

    Each ``option`` value is passed through ``option_to_json`` and the entry
    stored under ``column`` becomes the new cell value.  ``df`` itself is
    left untouched.
    """
    def _pick(raw_option):
        return option_to_json(raw_option)[column]

    result = df.copy()
    result[column] = result["option"].apply(_pick)
    return result

def extract_columns(df):
    """Add ``answer_pitch``, ``pitch`` and ``volume`` to the Microphone rows.

    ``df`` is split per teacher, student, try and note_idx.  Every group gets
    its ``answer_pitch`` via ``add_answer_pitch``; the group's ``Microphone``
    rows then get ``pitch`` and ``volume`` extracted from their ``option``.
    Microphone rows of groups whose answer pitch is the placeholder ``"x"``
    are discarded.  All non-Microphone rows of ``df`` are appended without
    the new columns, and the result is sorted by teacher, student and time.
    """
    groups = [df]
    for key in ("teacher", "student", "try", "note_idx"):
        groups = divide_df_per_column(groups, key)

    groups = add_answer_pitch(groups)

    frames = []
    for group in groups:
        mic_rows = group[group["action"] == "Microphone"]
        mic_rows = option_to_column(mic_rows, "pitch")
        mic_rows = option_to_column(mic_rows, "volume")

        # Exception handling: drop rows whose answer_pitch is "x".
        mic_rows = mic_rows[mic_rows["answer_pitch"] != "x"]
        if len(mic_rows) > 0:
            frames.append(mic_rows)

    # Everything that is not a Microphone row is kept as it is.
    frames.append(df[df["action"] != "Microphone"])

    combined = pd.concat(frames)
    return combined.sort_values(["teacher", "student", "time"])

In [18]:
# Extract the Test-mode log from the original frame, process it, save it
# and return it.  The result is cached on disk.
def get_test_df(cache_file_path="data/processed/processed_test_df.csv"):
    """Return the processed Test-mode log, using a CSV file as cache.

    If ``cache_file_path`` exists it is read (cp949 encoding, first column as
    index) and returned.  Otherwise the frame is built from
    ``get_entire_df()`` with ``extract_and_process_test_df`` and
    ``extract_columns``, written to ``cache_file_path`` and returned.
    NOTE(review): ``get_entire_df`` is defined outside this section.

    Args:
        cache_file_path: location of the cache file; the default is the path
            that used to be hard-coded.

    Returns:
        The processed DataFrame.
    """
    if os.path.isfile(cache_file_path):
        return pd.read_csv(cache_file_path, encoding="cp949", index_col=0)

    df = get_entire_df()
    df = extract_and_process_test_df(df)
    df = extract_columns(df)

    # to_csv does not create missing directories, so the very first run
    # would fail if the cache directory has not been created yet.
    cache_dir = os.path.dirname(cache_file_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    df.to_csv(cache_file_path, encoding="cp949")
    return df

# 수행

In [22]:
# Load the processed Test-mode log (read from the cache file when it exists,
# built otherwise) and display it as the cell output.
test_df = get_test_df()
test_df

[1/4] Checking cache
Loading saved file


Unnamed: 0,time,where,action,option,teacher,student,try,note_idx,answer_pitch,pitch,volume
30382,2022-11-29-20-08-34-223,Test,EnterScene,"{musicName:여행을 떠나요, pitchAdjust:0, AnswerRange...",선우-이승희,Test,1,1,,,
30383,2022-11-29-20-08-37-260,Test,StartNote,"{pitch:REST, length:Note8}",선우-이승희,Test,1,1,,,
30384,2022-11-29-20-08-37-261,Test,Microphone,"{pitch:REST, volume:55.54912}",선우-이승희,Test,1,1,REST,REST,55.54912
30385,2022-11-29-20-08-37-264,Test,Microphone,"{pitch:REST, volume:55.54912}",선우-이승희,Test,1,1,REST,REST,55.54912
30386,2022-11-29-20-08-37-266,Test,Microphone,"{pitch:REST, volume:55.54912}",선우-이승희,Test,1,1,REST,REST,55.54912
...,...,...,...,...,...,...,...,...,...,...,...
868989,2023-01-03-13-15-02-914,Test,Microphone,"{pitch:F2, volume:67.22172}",충주성심학교(초4),정다찬,32,54,A3,F2,67.22172
868990,2023-01-03-13-15-02-939,Test,Microphone,"{pitch:F2, volume:67.22172}",충주성심학교(초4),정다찬,32,54,A3,F2,67.22172
868991,2023-01-03-13-15-02-969,Test,Microphone,"{pitch:F2, volume:67.22172}",충주성심학교(초4),정다찬,32,54,A3,F2,67.22172
868992,2023-01-03-13-15-03-006,Test,Microphone,"{pitch:F2, volume:67.22172}",충주성심학교(초4),정다찬,32,54,A3,F2,67.22172


# 확인

In [24]:
# Spot check: show the rows of one teacher / student / try / note.
teacher = "성심-김원석(로그)"
student = "윤예준"
try_num = 1
note_idx = 8

sub_df = test_df.loc[
    (test_df["teacher"] == teacher)
    & (test_df["student"] == student)
    & (test_df["try"] == try_num)
    & (test_df["note_idx"] == note_idx)
]

sub_df

Unnamed: 0,time,where,action,option,teacher,student,try,note_idx,answer_pitch,pitch,volume
68941,2022-12-15-14-09-41-471,Test,StartNote,"{pitch:C4, length:Note8}",성심-김원석(로그),윤예준,1,8,,,
68942,2022-12-15-14-09-41-472,Test,Microphone,"{pitch:G3s, volume:76.12896}",성심-김원석(로그),윤예준,1,8,C4,G3s,76.12896
68943,2022-12-15-14-09-41-480,Test,Microphone,"{pitch:G3s, volume:76.12896}",성심-김원석(로그),윤예준,1,8,C4,G3s,76.12896
68944,2022-12-15-14-09-41-493,Test,Microphone,"{pitch:G3s, volume:76.12896}",성심-김원석(로그),윤예준,1,8,C4,G3s,76.12896
68945,2022-12-15-14-09-41-510,Test,Microphone,"{pitch:G3s, volume:76.12896}",성심-김원석(로그),윤예준,1,8,C4,G3s,76.12896
...,...,...,...,...,...,...,...,...,...,...,...
68998,2022-12-15-14-09-42-394,Test,Microphone,"{pitch:D2s, volume:100.6985}",성심-김원석(로그),윤예준,1,8,C4,D2s,100.6985
68999,2022-12-15-14-09-42-410,Test,Microphone,"{pitch:D2s, volume:100.6985}",성심-김원석(로그),윤예준,1,8,C4,D2s,100.6985
69000,2022-12-15-14-09-42-427,Test,Microphone,"{pitch:D2s, volume:100.6985}",성심-김원석(로그),윤예준,1,8,C4,D2s,100.6985
69001,2022-12-15-14-09-42-445,Test,Microphone,"{pitch:D2s, volume:100.6985}",성심-김원석(로그),윤예준,1,8,C4,D2s,100.6985
