## 1. Setup

In [4]:
import os
PROJECT_PATH = "./.."
# Switch the working directory to PROJECT_PATH before the project imports below
# (presumably so the `lib` package and relative data paths resolve — confirm).
# The original referenced an undefined lowercase `project_path` (NameError).
os.chdir(PROJECT_PATH)

import datetime
import requests
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from lib.config import *
import lib.inputManager as IM
import lib.statistic as STAT
import lib.jeju as JEJU
from lib.jeju import *

# Korean (Hangul) font setup: register the font stored in malgun.ttf as the
# default matplotlib font family.
from matplotlib import font_manager, rc
font_props = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf")
font_name = font_props.get_name()
rc("font", family=font_name)

## 2. Data Loading

In [28]:
# User-level records read from the CSV at USER_DF_PATH.
user_df = pd.read_csv(USER_DF_PATH, encoding=DEFAULT_ENCODING)

# Jeju statistical data: tourist counts, plus a "total" row holding the
# column-wise sums of the rows above it.
jeju_user_num_df = JEJU.get_jeju_tourist_num_df(
    TOURIST_NUM_STATS_DATA,
    DOMESTIC_BUS_USAGE_RATIO,
    FOREIGN_BUS_USAGE_RATIO,
)
column_totals = jeju_user_num_df.sum()
jeju_user_num_df.loc["total"] = column_totals

# Stay-period statistics, built from the stay-period data and the tourist counts.
jeju_stay_period_df = JEJU.get_jeju_stay_period_df(
    STAY_PERIOD_STATS_DATA,
    jeju_user_num_df,
)

In [32]:
# Display the stay-period statistics frame built in the previous cell.
jeju_stay_period_df

Unnamed: 0,period,domestic_tourist,foreign_tourist,total_tourist
0,1,0.003,0.006,0.004418
1,2,0.057,0.066,0.061253
2,3,0.618,0.28,0.458272
3,4,0.238,0.296,0.265409
4,5,0.045,0.217,0.126282
5,6,0.013,0.055,0.032848
6,7,0.015,0.035,0.024451
7,8,0.003,0.019,0.010561
8,9,0.001,0.006,0.003363
9,10,0.001,0.004,0.002418


## 3. Tuning

#### 3.1 Data generation

In [33]:
# Called by get_tuning_df() once for every parameter combination.
# Returns the similarity to the Jeju stay-period statistics.
def func(df):
    """Score one filtered user frame against the Jeju stay-period statistics.

    The frame is converted to a ratio table keyed on "period" by
    ``STAT.get_ratio_df_from_one_df``, inner-merged with the module-level
    ``jeju_stay_period_df`` on "period", and the merged frame is passed to
    ``STAT.how_much_overlap`` comparing the "total_tourist" and "ratio" columns.

    Args:
        df: Subset of the user data to evaluate.

    Returns:
        The value returned by ``STAT.how_much_overlap``, or 0 when that call
        raises an ordinary exception (best-effort fallback).
    """
    ratio_df = STAT.get_ratio_df_from_one_df(df, "period", "ratio")
    merged_stay_period_df = pd.merge(ratio_df, jeju_stay_period_df, on="period", how="inner")
    try:
        return STAT.how_much_overlap(merged_stay_period_df, "total_tourist", "ratio")
    except Exception:
        # Keep the original best-effort behaviour (score 0 on failure), but no
        # longer via `return` inside `finally`, which also swallowed
        # KeyboardInterrupt/SystemExit and hid every error unconditionally.
        return 0

def _split_by_order(df, order):
    """Yield ``(label, subset)`` pairs for each value or range listed in ``order``."""
    column = order["column"]
    if "values" in order:
        for value in order["values"]:
            yield value, df[df[column] == value]
    else:
        for r in order["ranges"]:
            # Half-open interval [r[0], r[1]).
            # NOTE: "%d" truncates non-integer bounds in the label text.
            label = "(%d, %d)" % (r[0], r[1])
            yield label, df[(r[0] <= df[column]) & (df[column] < r[1])]


def get_tuning_df(df, orders, func, result_column):
    """Evaluate ``func`` on every combination of the filters described by ``orders``.

    Args:
        df: DataFrame to filter.
        orders: Sequence of dicts, each with a "column" key and either a
            "values" list (equality filter) or a "ranges" list of
            ``(low, high)`` pairs (``low <= x < high`` filter). Filters are
            nested in the given order.
        func: Callable invoked with each fully filtered DataFrame; its return
            value is stored in ``result_column``.
        result_column: Name of the column that receives ``func``'s results.

    Returns:
        DataFrame with one column per order (in ``orders`` order) followed by
        ``result_column``, one row per combination, indexed 0..n-1. Range
        filters are labelled with the string "(low, high)".

    Raises:
        IndexError: If ``orders`` is empty.
    """
    column = orders[0]["column"]
    remaining_orders = orders[1:]

    result_df_list = []
    for label, target_df in _split_by_order(df, orders[0]):
        if remaining_orders:
            # Recurse on the remaining filters, then tag the rows with this level's label.
            result_df = get_tuning_df(target_df, remaining_orders, func, result_column)
            result_df[column] = label
        else:
            result_df = pd.DataFrame(columns=[column, result_column])
            result_df.loc[0] = [label, func(target_df)]
        result_df_list.append(result_df)

    # Merge the partial results. ignore_index renumbers rows without creating a
    # temporary "index" column, so a filter column that is itself named "index"
    # no longer breaks (reset_index raised ValueError in that case).
    result_df = pd.concat(result_df_list, ignore_index=True)

    # Reorder columns so this level's column comes first.
    columns = [column] + [c for c in result_df.columns if c != column]
    return result_df[columns]

# Candidate tour_count windows: the lower bound sweeps 0..19 while the upper
# bound stays fixed at 20, i.e. (0, 20), (1, 20), ..., (19, 20).
tour_count_upper = 20
ranges = [(lower, tour_count_upper) for lower in range(tour_count_upper)]

# Filters applied in order: first by "case" value, then by "tour_count" window.
orders = [{"column":"case", "values":["both", "first", "last", "neither"]},
          {"column":"tour_count", "ranges":ranges}]

result_column = "similarity"

tuning_df = get_tuning_df(user_df, orders, func, result_column)

#### 3.2 Tuning data analysis

In [34]:
# Display the tuning rows whose "case" is "neither".
tuning_df[tuning_df["case"] == "neither"]

Unnamed: 0,case,tour_count,similarity
60,neither,"(0, 20)",0.173146
61,neither,"(1, 20)",0.189464
62,neither,"(2, 20)",0.209542
63,neither,"(3, 20)",0.22423
64,neither,"(4, 20)",0.239066
65,neither,"(5, 20)",0.247682
66,neither,"(6, 20)",0.245802
67,neither,"(7, 20)",0.244959
68,neither,"(8, 20)",0.239519
69,neither,"(9, 20)",0.231897


#### 3.3 Result

In [35]:
case_list = ["both", "first", "last", "neither"]

# Start from an empty frame keyed on tour_count; each case is outer-merged in
# below as its own column of formatted similarity scores.
result_df = pd.DataFrame(columns=["tour_count"])

for case in case_list:
    # Take an explicit copy of the filtered rows: assigning a column on a slice
    # of tuning_df is chained assignment and triggers SettingWithCopyWarning.
    df = tuning_df.loc[tuning_df["case"] == case, ["tour_count", "similarity"]].copy()
    df["similarity"] = df["similarity"].apply(lambda x: "%.2f" % x)
    df = df.rename(columns={"similarity": case})
    result_df = pd.merge(result_df, df, on="tour_count", how="outer")
result_df

Unnamed: 0,tour_count,both,first,last,neither
0,"(0, 20)",0.69,0.33,0.32,0.17
1,"(1, 20)",0.75,0.46,0.39,0.19
2,"(2, 20)",0.78,0.6,0.55,0.21
3,"(3, 20)",0.81,0.67,0.62,0.22
4,"(4, 20)",0.79,0.71,0.67,0.24
5,"(5, 20)",0.76,0.73,0.7,0.25
6,"(6, 20)",0.74,0.72,0.69,0.25
7,"(7, 20)",0.7,0.68,0.67,0.24
8,"(8, 20)",0.67,0.64,0.63,0.24
9,"(9, 20)",0.64,0.59,0.59,0.23
