# Data Load


In [42]:
import numpy as np
import pandas as pd

In [43]:
# Load the raw CSV into a DataFrame; the path is relative to the working directory.
data = pd.read_csv("../input/kwproja/kwproja_data.csv")

# Show the loaded frame as the cell output.
data

# EDA

153395 rows X 31 columns

- longitude : 경도, latitude : 위도 
- weekday0~6 : 일 ~ 월
- time -> 05_10 / 10_14 / 14_18 / 18_20 / 20_22 / 22_24 / 24_05 => 시간대가 동일하지 않음
- female/male : 20/30/40/50 -> 8 columns
- shop_type_big -> 14 category
- shop_type_small -> 367 category
- date -> 12 category, 201702~ 201801 까지의 data
- shop_code -> 식별자 feature, input feature로는 사용하지 않지만 분류를 위해서는 사용할 수 있을 것 같음  
- shop_name -> input feature로는 사용하지 않음 
- monthly_gain / average_sale_price = 한달 총 판매수

##### monthly_gain과 average_sale_price 중 어느 것을 y값으로 둘 것인가? : y_label 비교해보아야 할 것 같음 
##### shop_code와 관련해서 input feature에 분류해서 넣어야 하는가? 상관없는가? 

In [44]:
# Preview the first rows of the raw frame.
data.head()

In [45]:
# List the column names of the raw frame.
data.columns

In [46]:
# Show the dtype pandas assigned to each column.
data.dtypes

In [47]:
# Recorded output of the first two counts (presumably in print order): 13633, 13352.
# If shop_name and shop_code always went together the two numbers would match,
# but they differ by roughly 300.
# A shop_code can stay the same (same store) while its business type or shop
# name changes, so shop_code is the one used as the identifier feature.
for column in ('shop_name', 'shop_code', 'longitude', 'latitude',
               'shop_type_big', 'shop_type_small'):
    print(data[column].nunique())

# For date, print the distinct values themselves rather than their count.
print(data['date'].unique())

In [48]:
# Check a shop_code whose shop_name changes while the code stays the same.
# Shops usually have 13 (or 12) rows; when the name changes like this, the
# number of rows appears to grow to as many as 36.
data.loc[data['shop_code'].eq(68608110104)]

In [49]:
# Count how often each distinct monthly_gain value occurs.
data['monthly_gain'].value_counts()

## Graph

In [50]:
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib import rc
# %matplotlib inline
# plt.rcParams['axes.unicode_minus'] = False
# font_path = "C:/Windows/Fonts/NGULIM.TTF"
# font = fm.FontProperties(fname=font_path).get_name()
# rc('font', family=font)


In [51]:
def view_price_hist(label):
    """Draw a histogram of one column of the global ``data`` frame.

    Args:
        label: Name of the column to read from ``data``.
    """
    values = data[label]
    plt.hist(values)
    plt.title('price hist')
    plt.show()

def view_price_graph(label):
    """Line-plot one column of the global ``data`` frame.

    The curve is drawn in the first cell of a 2-row, 1-column subplot grid.

    Args:
        label: Name of the column to read from ``data``.
    """
    plt.subplot(2, 1, 1)
    series = data[label]
    plt.plot(series)
    plt.show()

In [52]:
# Line plot of monthly_gain in row order.
view_price_graph('monthly_gain')

In [53]:
# Histogram of monthly_gain.
view_price_hist('monthly_gain')

In [54]:
# Kernel density estimate of the raw monthly_gain target.
# plt.subplots() returns a (figure, axes) pair, so unpack it and draw on that
# axes explicitly.
fig, ax = plt.subplots()
# kdeplot draws the density curve that distplot(..., hist=False) produced;
# distplot itself is deprecated in seaborn.
sns.kdeplot(data['monthly_gain'], ax=ax)
ax.set_title('Total Gain Density')
ax.set_xlabel('Monthly Gain')
ax.set_ylabel('Unit Probability')
plt.show()

In [55]:
# Box plot of the raw monthly_gain values on a 12x8 figure.
plt.figure(figsize=(12,8))
sns.boxplot(data=data['monthly_gain'], color='red')
plt.show()

# Preprocessing
- shop_code, shop_name : 식별자 feature 이므로 drop 
- shop_type_big, shop_type_small : label encoding
- longitude, latitude : 일단은 input_feature에 넣지만 중복값이 많아 보이므로 추후에 제거해보려 함 
- monthly_gain : 결측치 제거 (0값, 3605개로 계산됨)
- MinMaxScaler 정규화 -> 정규화 column의 범위는??
- date : drop, (그러나 RNN, LSTM과 같은 DL 모델에서는 넣어야 할지도..? 시계열 데이터이므로 쓸 수도 없을지도)


In [56]:
# Keep the original frame separate: ``data`` stays raw, ``input_data`` is the
# copy that preprocessing works on.
# Drop the columns that are not used as model inputs.
input_data = data.copy().drop(columns=['date', 'shop_code', 'shop_name'])
input_data

In [57]:
# Find the columns whose dtype is object.
is_object_col = input_data.dtypes == object
cat_cols = input_data.columns[is_object_col]
cat_cols

In [58]:
from sklearn.preprocessing import LabelEncoder

# Convert every object-dtype column to integer codes.
# A separate encoder is fitted for each column and stored in ``label_encoders``
# so that each column's category <-> code mapping stays available afterwards
# (one shared encoder only remembers the last column it was fitted on).
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    input_data[col] = le.fit_transform(list(input_data[col]))
    label_encoders[col] = le

# ``le`` remains bound to the encoder fitted on the last column in cat_cols.
input_data

In [59]:
# Row count per encoded shop_type_big value.
input_data['shop_type_big'].value_counts()

In [60]:
# Remove missing values: rows whose monthly_gain is exactly 0 are treated as
# missing and filtered out.
has_gain = input_data['monthly_gain'] != 0
input_data = input_data[has_gain]
input_data

In [61]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the scaler is fitted on the whole frame, target (monthly_gain)
# included, before the train/test split. Test rows therefore influence the
# scaling, and downstream errors are in scaled units. Confirm this is intended.
scaler = MinMaxScaler()

scale_cols = input_data.columns.tolist()
# fit_transform returns a bare array, so rebuild a frame with the original
# column names; the row index restarts from 0.
scaled_data = pd.DataFrame(
    scaler.fit_transform(input_data[scale_cols]),
    columns=scale_cols,
)

input_data = scaled_data
input_data

In [62]:
# Kernel density estimate of monthly_gain after preprocessing.
# plt.subplots() returns a (figure, axes) pair, so unpack it and draw on that
# axes explicitly.
fig, ax = plt.subplots()
# kdeplot draws the density curve that distplot(..., hist=False) produced;
# distplot itself is deprecated in seaborn.
sns.kdeplot(input_data['monthly_gain'], ax=ax)
ax.set_title('Total Gain Density')
ax.set_xlabel('Monthly Gain')
ax.set_ylabel('Unit Probability')
# Also print how often each distinct value occurs.
print(input_data['monthly_gain'].value_counts())
plt.show()

In [63]:
# Box plot of monthly_gain after preprocessing, on a 12x8 figure.
plt.figure(figsize=(12,8))
sns.boxplot(data=input_data['monthly_gain'], color='red')
plt.show()

# Data Split

In [64]:
# Separate the feature columns from the target column (monthly_gain).
input_data_X = input_data.drop(columns=['monthly_gain'])
input_data_y = input_data['monthly_gain'].copy()

In [65]:
from sklearn.model_selection import train_test_split

# Both splits share the same settings: hold out 20 %, shuffle, fixed seed.
_split_options = dict(test_size=0.2, shuffle=True, random_state=42)

# First split: train+validation vs. test.
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    input_data_X, input_data_y, **_split_options)

# Second split: carve the validation set out of the train+validation part.
train_X, valid_X, train_y, valid_y = train_test_split(
    tr_val_X, tr_val_y, **_split_options)

# Modeling
- XGB
- LGBM

In [66]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold, TimeSeriesSplit,GridSearchCV
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor
from lightgbm import plot_importance 
from xgboost import XGBRegressor
from xgboost import plot_importance

from keras.callbacks import EarlyStopping, ModelCheckpoint

In [67]:
# XGBoost regressor: up to 500 trees at learning rate 0.05.
model_xgb = XGBRegressor(n_estimators=500, learning_rate=0.05)

# monthly_gain is a continuous target, so early stopping monitors RMSE on the
# validation set; "logloss" is a classification metric and does not measure
# regression quality.
# NOTE(review): recent XGBoost releases expect early_stopping_rounds and
# eval_metric on the constructor rather than on fit(); confirm against the
# installed version.
model_xgb.fit(train_X, train_y, early_stopping_rounds=100, eval_metric="rmse", eval_set=[(valid_X, valid_y)], verbose=True)
pred_xgb = model_xgb.predict(test_X)

In [68]:
# LightGBM regressor: up to 500 trees at learning rate 0.05.
model_lgbm = LGBMRegressor(n_estimators=500, learning_rate=0.05)

# monthly_gain is a continuous target, so early stopping monitors RMSE on the
# validation set; "logloss" is a classification metric and does not measure
# regression quality.
# NOTE(review): recent LightGBM releases configure early stopping and logging
# through callbacks instead of fit() keyword arguments; confirm against the
# installed version.
model_lgbm.fit(train_X, train_y, early_stopping_rounds=100, eval_metric="rmse", eval_set=[(valid_X, valid_y)], verbose=True)
pred_lgbm = model_lgbm.predict(test_X)

## Prediction

In [69]:
def plot_history(history):
    """Plot training and validation MAE and MSE per epoch.

    Args:
        history: Object exposing ``history`` (a mapping with the keys ``mae``,
            ``val_mae``, ``mse`` and ``val_mse``, one value per epoch) and
            ``epoch`` (the epoch numbers) — presumably a Keras ``History``.
    """
    # Tabulate the per-epoch metrics; every plot below reads from ``hist``.
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    plt.figure(figsize=(8,12))

    # Mean absolute error: the average of the absolute differences between
    # the predicted and the actual values.
    plt.subplot(2,1,1)
    plt.xlabel('Epoch')
    plt.ylabel('Mean Abs Error')
    plt.plot(hist['epoch'], hist['mae'],
           label='Train Error')
    plt.plot(hist['epoch'], hist['val_mae'],
           label = 'Val Error')
    plt.legend()

    # Mean squared error: the average of the squared errors.
    plt.subplot(2,1,2)
    plt.xlabel('Epoch')
    plt.ylabel('Mean Square Error')
    plt.plot(hist['epoch'], hist['mse'],
           label='Train Error')
    plt.plot(hist['epoch'], hist['val_mse'],
           label = 'Val Error')
    plt.legend()
    plt.show()
    
def show_pred(test_y, pred):
    """Pair each true value with its prediction in a two-column DataFrame.

    Args:
        test_y: True target values; must provide ``to_numpy()``.
        pred: Predicted values, in the same order as ``test_y``.

    Returns:
        DataFrame with the columns ``true_y`` and ``prediction``.
    """
    actual = np.ravel(test_y.to_numpy())
    rows = [(truth, guess) for truth, guess in zip(actual, pred)]
    return pd.DataFrame(rows, columns=['true_y', 'prediction'])

def show_mse_rmse(test_y, pred):
    """Print the mean squared error and its square root for a set of predictions.

    Args:
        test_y: True target values.
        pred: Predicted values, in the same order as ``test_y``.
    """
    mse = mean_squared_error(test_y, pred)
    rmse = np.sqrt(mse)

    print("mse : %f" % mse)
    print("rmse: %f \n" % rmse)
    
def show_prediction_error(test_y, pred) :
    """Plot a 25-bin histogram of the prediction errors (``pred - test_y``).

    Args:
        test_y: True target values; must provide ``to_numpy()``.
        pred: Predicted values, in the same order as ``test_y``.
    """
    true_y = np.ravel(test_y.to_numpy())
    error = pred - true_y
    plt.hist(error, bins=25)
    plt.xlabel("Prediction Error")
    plt.ylabel("Count")


def feature_importance(model_xgb) :
    """Draw the feature importances of a fitted model on a 10x12 figure.

    Kept as its own function so that plotting prediction errors does not also
    plot the importances of whatever global model happens to be defined.

    Args:
        model_xgb: Fitted model accepted by ``plot_importance``.
    """
    fig, ax = plt.subplots(figsize=(10,12))
    plot_importance(model_xgb, ax=ax)
    
def graph(pred, test_label):
    """Overlay the actual and the predicted values on one 16x9 line chart.

    Args:
        pred: Predicted values.
        test_label: Actual values, plotted first.
    """
    plt.figure(figsize=(16, 9))
    for series, tag in ((test_label, 'actual'), (pred, 'prediction')):
        plt.plot(series, label=tag)
    plt.legend()
    plt.show()

In [70]:
# Table of true vs. XGBoost-predicted values on the test split.
show_pred(test_y, pred_xgb)

In [71]:
# Table of true vs. LightGBM-predicted values on the test split.
show_pred(test_y, pred_lgbm)

In [72]:
# Histogram of the XGBoost prediction errors on the test split.
show_prediction_error(test_y, pred_xgb)

In [73]:
# Histogram of the LightGBM prediction errors on the test split.
show_prediction_error(test_y, pred_lgbm)

In [74]:
# Print MSE and RMSE of the XGBoost predictions on the test split.
show_mse_rmse(test_y, pred_xgb)

In [75]:
# Print MSE and RMSE of the LightGBM predictions on the test split.
show_mse_rmse(test_y, pred_lgbm)

In [76]:
# Convert the test target to a plain array, then overlay it with the XGBoost predictions.
true_y = test_y.to_numpy()
graph(pred_xgb, true_y)

In [77]:
# Convert the test target to a plain array, then overlay it with the LightGBM predictions.
true_y = test_y.to_numpy()
graph(pred_lgbm, true_y)

In [78]:
# Rank the XGBoost feature importances (largest first) and show them as a bar chart.
feature_series = pd.Series(
    data=model_xgb.feature_importances_,
    index=train_X.columns,
).sort_values(ascending=False)
sns.barplot(x=feature_series, y=feature_series.index)

In [79]:
# Rank the LightGBM feature importances (largest first) and show them as a bar chart.
feature_series = pd.Series(
    data=model_lgbm.feature_importances_,
    index=train_X.columns,
).sort_values(ascending=False)
sns.barplot(x=feature_series, y=feature_series.index)