# 1. Data Load

In [1]:
import numpy as np
import pandas as pd
import math

from scipy import stats #Analysis 
from scipy.stats import norm 

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline

In [2]:
# Run this cell if Korean glyphs are rendered incorrectly in the plots.
# NOTE(review): `font_path` is a hard-coded Windows location, so this cell is
# presumably only usable on a Windows machine that has this font installed.
import matplotlib.font_manager as fm
# List of system TTF font files; not referenced again in this cell.
fontlist = fm.findSystemFonts(fontpaths = None, fontext='ttf')

from matplotlib import font_manager, rc

font_path="C:/Windows/Fonts/H2HDRM.TTF"
# Look up the family name stored in the font file and make it the default font.
font=font_manager.FontProperties(fname=font_path).get_name()
rc('font',family=font)

In [3]:
# Load the raw monthly shop data (UTF-8 encoded CSV) into `data`.
# NOTE(review): this is an absolute path on one user's machine; the notebook
# cannot be re-run elsewhere without editing it.
data = pd.read_csv("C:/Users/sohee/Desktop/KW/산학연계(졸작)/산학졸작_openUP_Data/kwproja_data_big.csv",encoding='utf-8')

# 2. EDA 

2,927,739 rows X 9 columns

- 매장 속성 정보
  - shop_code -> 식별자 feature, input feature로는 사용하지 않지만 분류를 위해서는 사용할 수 있을 것 같음  
  - shop_name -> input feature로는 사용하지 않음 (NLP deep learning 가능성 있음)
  - longitude : 경도, latitude : 위도 -> 매장 위치 (회사 근처, 학교 근처 등 매출 영향성 있음) -> 군집화, labeling 필요
  - shop_type_big -> 15 category  -> 업종 (매출 영향성 있음)
  - shop_type_small -> 61 category

- 매출 정보
  - date -> 24 category, 201606~ 201805 까지의 data
  - monthly_gain / average_sale_price = 한달 총 판매수

- 매출 통계 정보-> X


##### monthly_gain과 average_sale_price 중 어느 것을 y값으로 둘 것인가? 
- 월매출 예측 문제로 가정하고 monthly_gain 을 y값으로 예측하는 모델 만들기

##### shop_code는 input feature에 넣어야 하는가?
- 특별한 브랜드가 y값을 결정하는 과적합 요소가 될 수 있으므로
- X 에서 shop code, shop name 제외하는 것도 방법
- 어느 위치에 어떤 업종으로 어떤 객단가인 매장을 오픈하면 월매출이 어떻게 될까? 문제
 - X: shop type big, shop type small, longitude, latitude, avg_sale_price, 
 - y: monthly_gain

In [None]:
# Column names of the raw frame.
data.columns

In [None]:
# Dtypes, non-null counts and memory usage of the raw frame.
data.info()

# 3. Preprocessing
data -> processed_data
- 9 columns : **shop_code | date | shop_name | shop_type_big | shop_type_small | longitude | latitude | average_sale_price | monthly_gain**
- shop_code : 식별자 feature 이므로 drop 
- date : 아직은 쓸 수 없으므로 drop
- missing value 제거 : A/N
- shop_type_big(15), shop_type_small(61) 
    - ML : label encoding
    - DL : NLP
- longitude, latitude : 
    - ML : k-means clustering -> geo column 
    - DL : NLP, reverse geo encoder(행정동, 법정동, 지번주소, 도로명주소) -> 지번주소 가져오세요(for web) 
    - 행정동admcode, 법정동legalcode -> area1, area2, area3, area4
    - 지번 주소addr -> area1, area2, area3, area4 (x), land -> number1, number2
    - 도로명 주소roadaddr -> area1, area2, area3, area4(x), land -> number1, number2, name  
- average_sale_price 
    - log transformation 
- MinMaxScaler 정규화 -> 정규화 column의 범위는?? 실험필요 요인 

In [None]:
# Keep the raw frame (`data`) untouched; all preprocessing happens on `processed_data`.
# Dropped features: date, shop_code
unused_columns = ['date', 'shop_code']
processed_data = data.drop(columns=unused_columns).copy()

In [None]:
# Remove rows with unusable values, one column at a time.
# A value of 0 in monthly_gain / average_sale_price is treated as missing;
# for the two shop-type columns a real null (NaN) is treated as missing.

# missing value drop - monthly_gain
null_index = processed_data[processed_data['monthly_gain'] == 0].index
print("monthly gain null : ", len(null_index))
processed_data = processed_data.drop(null_index)

# missing value drop - average_sale_price
null_index = processed_data[processed_data['average_sale_price'] == 0].index
print("average sale price null : ", len(null_index))
processed_data = processed_data.drop(null_index)

# missing value drop - shop_type_big
null_index = processed_data[processed_data['shop_type_big'].isnull()].index
print("shop type big null : ", len(null_index))
print("shop type big unique : ", processed_data['shop_type_big'].nunique())
processed_data = processed_data.drop(null_index)

# missing value drop - shop_type_small
null_index = processed_data[processed_data['shop_type_small'].isnull()].index
print("shop type small null : ", len(null_index))
print("shop type small unique : ", processed_data['shop_type_small'].nunique())
# The original cell counted these rows but never removed them, unlike the
# three steps above; drop them so later string concatenation and label
# encoding do not see NaN.
processed_data = processed_data.drop(null_index)

In [None]:
# Integer-encode the two shop-type columns with LabelEncoder.
from sklearn.preprocessing import LabelEncoder

# fit_transform fits the encoder and encodes the column in one step.
le = LabelEncoder()
processed_data['shop_type_big_label'] = le.fit_transform(list(processed_data['shop_type_big']))   
print(le.classes_)

# `le` is rebound here, so the encoder fitted on shop_type_big above is no
# longer reachable after this line.
le = LabelEncoder()
processed_data['shop_type_small_label'] = le.fit_transform(list(processed_data['shop_type_small'])) 
print(le.classes_)

# Preprocessing for the NLP model:
# concat_text = shop_name + " " + shop_type_big + " " + shop_type_small
processed_data['concat_text'] = processed_data['shop_name'] + " " + processed_data['shop_type_big'] + " " + processed_data['shop_type_small']

In [None]:
# Binary-encode the two categorical shop-type columns.
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=["shop_type_big", "shop_type_small"])
# `df` holds only the encoder output for the two shop-type columns.
df = encoder.fit_transform(processed_data[["shop_type_big", "shop_type_small"]])

# Append the encoded columns next to the existing ones (column-wise concat).
processed_data = pd.concat([processed_data, df], axis=1)

In [None]:
# Location labelling with KMeans clustering:
# (latitude, longitude) -> cluster id stored in the `geo` column.
from sklearn.cluster import KMeans

# Fix the seed so the cluster ids written to `geo` are the same on every run;
# without it the feature changes between kernel restarts while the later
# train/test splits are seeded. 42 matches the seed used for those splits.
kmeans = KMeans(n_clusters=9, random_state=42).fit(processed_data[['latitude', 'longitude']])
print(kmeans.cluster_centers_)
print(kmeans.labels_)

processed_data['geo'] = kmeans.labels_

# Plot the shops coloured by their cluster id.
sns.scatterplot(x='longitude' , y='latitude', hue="geo", data=processed_data, palette="Paired")
plt.title('k-mean')
plt.show()

# longitude / latitude are intentionally kept here because later steps still
# need them; the drop below is left disabled.
#processed_data = processed_data.drop(['longitude', 'latitude'], axis=1)

In [None]:
# average_sale_price is skewed, so it is replaced by log(1 + x).
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, Normalizer

scale_cols = ['average_sale_price']
# np.log1p is applied element-wise to the selected column(s).
processed_data[scale_cols] = np.log1p(processed_data[scale_cols])

processed_data

In [None]:
# Outlier removal.
# Rows whose monthly_gain lies above the per-category upper fence
# (Q3 + 1.5 * IQR) are removed. The lower fence is computed and returned by
# get_pricelist but is not used for filtering.
def get_pricelist(i, data) :
    """Return box-plot statistics of ``monthly_gain`` for one shop category.

    Parameters
    ----------
    i : label value to select in the ``shop_type_big_label`` column.
    data : DataFrame with ``shop_type_big_label`` and ``monthly_gain`` columns.

    Returns
    -------
    tuple ``(lower_fence, Q1, Q2, Q3, upper_fence)`` where the fences are
    ``Q1 - 1.5 * IQR`` (clipped at 0) and ``Q3 + 1.5 * IQR``.

    Raises
    ------
    KeyError if no row carries label ``i``.
    """
    gains = data.loc[data['shop_type_big_label'] == i, 'monthly_gain']
    if gains.empty :
        # Same exception type the original groupby(...).get_group(i) raised.
        raise KeyError(i)

    Q1 = gains.quantile(0.25)
    Q2 = gains.quantile(0.5)
    Q3 = gains.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - (1.5 * IQR)
    upper_fence = Q3 + (1.5 * IQR)
    if lower_fence <= 0 : lower_fence = 0

    return lower_fence, Q1, Q2, Q3, upper_fence

def remove_outlier(data) :
    """Return a copy of ``data`` without per-category upper outliers.

    For every label present in ``shop_type_big_label``, rows of that category
    whose ``monthly_gain`` exceeds the category's upper fence are removed.
    One progress line per category is printed.

    Rows are selected with a boolean mask. The original code passed index
    *labels* to ``iloc`` (which is positional), so on a frame whose index is
    no longer 0..n-1 (e.g. after rows were dropped) it picked the wrong rows
    or raised IndexError.
    """
    # Iterate over the labels that actually exist instead of a fixed 0..14,
    # so a missing category does not raise.
    labels = sorted(data['shop_type_big_label'].dropna().unique())

    keep = pd.Series(True, index=data.index)
    for i in labels :
        upper_fence = get_pricelist(i, data)[4]
        is_outlier = (data['shop_type_big_label'] == i) & (data['monthly_gain'] > upper_fence)
        print("removed index in shop_type_big" , i, ": ", int(is_outlier.sum()))
        keep &= ~is_outlier
    return data[keep.to_numpy()].copy()

# Apply the outlier filter. `processed_data` is rebound to the filtered frame,
# so re-running this cell filters the already-filtered data again.
processed_data = remove_outlier(processed_data)
processed_data

# 4. 타겟변수 확인
why(or when) to use log transform in ML? 
- target variable이 non-negative values 일때만 
- outlier 값들도 사용해야 하는 경우, outliers that can't be filtered out as they are important to the model.
- 현재 주어진 data도 좌측으로 치우쳐진 (right skewed) 형태, 굉장히 극소수의 업종들만이 굉장히 큰 매출을 만들어낼 수 있는 것으로 보임 
- 어떤 column, feature가 가장 monthly_gain과 상관관계가 높을까요? 
- kaggle house price prediction 대회에서도 RMSE가 아닌 RMSLE를 사용함 -> log를 씌운 형태인데 target variable인 집값의 범위가 넓기 때문

- Skewness (왜도): the longer the right tail, the larger (more positive) the skewness.
- Kurtosis (첨도): if the kurtosis value K is close to 3, the distribution is close to the normal distribution. If K < 3, the distribution is flatter than the normal distribution; if K > 3, the distribution is more sharply peaked than the normal distribution.

In [None]:
# pandas display option.
# The active line shows floats with two decimals, which makes monthly_gain
# easier to read than the default float rendering.
# To go back to the pandas default, comment out the first line and
# uncomment the reset_option line below it.
pd.options.display.float_format = '{:.2f}'.format
#pd.reset_option('display.float_format')

# 5. Data Split 
전처리 완료, 필요한 column을 input으로 넣고 train / valid / test data split 

In [None]:
# Keep the preprocessed frame (`processed_data`) separate from the model input (`input_data`).
input_data = processed_data.copy()

# Target: monthly_gain. Features: every remaining column.
input_data_y = input_data['monthly_gain'].copy()
input_data_X = input_data.drop(columns=['monthly_gain'])

In [None]:
# Display the feature frame (all columns except monthly_gain).
input_data_X

In [None]:
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV

# First split: 80% train+valid / 20% test (shuffled, seeded).
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    input_data_X, 
    input_data_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True
)

# Second split: 20% of the train+valid part becomes the validation set,
# i.e. roughly 64% train / 16% valid / 20% test of the full data.
train_X, valid_X, train_y, valid_y = train_test_split(
    tr_val_X, 
    tr_val_y, 
    test_size = 0.2, 
    random_state = 42,
    shuffle=True
)

# 6. Modeling

In [4]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from keras.callbacks import EarlyStopping, ModelCheckpoint

from lightgbm import LGBMRegressor
from lightgbm import plot_importance 
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.linear_model import Ridge,Lasso

In [7]:
# Frames used by the classical ML models carry the `_ml` suffix.
# The same columns are removed from all three splits: the raw text and
# coordinate columns plus the integer label columns. Keeping the list in one
# place guarantees train / valid / test end up with identical feature columns.
ML_DROP_COLS = ['shop_name', 'shop_type_big', 'shop_type_small', 'longitude',
                'latitude', 'concat_text', 'shop_type_big_label', 'shop_type_small_label']

train_X_ml = train_X.drop(ML_DROP_COLS, axis=1).copy()
valid_X_ml = valid_X.drop(ML_DROP_COLS, axis=1).copy()
test_X_ml = test_X.drop(ML_DROP_COLS, axis=1).copy()

#### XGB

In [None]:
# XGBoost regressor with library-default hyperparameters.
model_xgb = XGBRegressor()

In [None]:
# Train on the training split; the validation split is passed as eval_set.
model_xgb.fit(train_X_ml,train_y,eval_set=[(valid_X_ml,valid_y)])

In [None]:
# Predict monthly_gain for the held-out test split.
pred_xgb = model_xgb.predict(test_X_ml)
pred_xgb

#### LGBM

In [None]:
# LightGBM regressor with library-default hyperparameters.
model_lgbm = LGBMRegressor()

In [None]:
# Train on the training split; the validation split is passed as eval_set.
model_lgbm.fit(train_X_ml,train_y,eval_set=[(valid_X_ml,valid_y)])

In [None]:
# Predict monthly_gain for the held-out test split.
pred_lgbm = model_lgbm.predict(test_X_ml)
pred_lgbm

#### Ridge

In [None]:
# Ridge model with a cross-validated search over alpha.
from sklearn.linear_model import RidgeCV # differs from Ridge in that it takes a list of candidate alphas

# NOTE(review): alpha=0 means no regularisation at all - confirm it is
# meant to be one of the candidates.
alphas = [0, 0.001, 0.01, 0.1, 1]

# RidgeCV receives the candidate alphas as a list and selects the best one internally.
# NOTE(review): the `normalize` argument was deprecated and later removed in
# newer scikit-learn releases - confirm the installed version accepts it.
ridgecv = RidgeCV(alphas=alphas, normalize=True, cv=5)
# cv : cross-validation -> the data is split into k folds and each is used for validation
# The alpha with the best validation score is kept.
ridgecv.fit(train_X_ml, train_y)
ridgecv_pred = ridgecv.predict(test_X_ml)

# Error metrics of the selected model on the test split.
mae = mean_absolute_error(test_y, ridgecv_pred)
r2 = r2_score(test_y, ridgecv_pred)
print(f'Test MAE: ${mae:,.0f}')
print(f'R2 Score: {r2:,.4f}\n')

print(f'alpha: {ridgecv.alpha_}') # the alpha that was finally selected
print(f'cv best score: {ridgecv.best_score_}') # score obtained with the selected alpha

In [None]:
# Ridge with a fixed alpha, entered by hand after the RidgeCV search above.
# NOTE(review): 0.01 is hard-coded rather than read from ridgecv.alpha_.
model_ridge=Ridge(alpha=0.01)

In [None]:
# Fit Ridge on the training split.
# The original line had an unbalanced ")" (SyntaxError) and passed eval_set,
# which the scikit-learn linear models' fit() does not accept.
model_ridge.fit(train_X_ml, train_y)

In [None]:
pred_ridge = model_ridge.predict(test_X_ml)
pred_ridge

#### Lasso

In [None]:
# Parameter 튜닝시도

In [None]:
# Baseline Lasso with default hyperparameters.
model_lasso = Lasso()
# The original call had an unbalanced ")" (SyntaxError) and passed eval_set,
# which the scikit-learn linear models' fit() does not accept.
model_lasso.fit(train_X_ml, train_y)
# R^2 on the training split, then on the test split.
print(model_lasso.score(train_X_ml,train_y))
print(model_lasso.score(test_X_ml,test_y))

In [None]:
# Sweep the Lasso regularisation strength and record R^2 on train and test.
# NOTE(review): the comparison uses the test split; if this curve is used to
# pick alpha, the validation split (valid_X_ml / valid_y) would be the
# cleaner choice.
train_score = []
test_score = []
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    model_lasso = Lasso(alpha=alpha, max_iter=10000)
    model_lasso.fit(train_X_ml, train_y)
    train_score.append(model_lasso.score(train_X_ml, train_y))
    test_score.append(model_lasso.score(test_X_ml, test_y))

# Label both curves and the axes; otherwise the two lines cannot be told apart.
log_alphas = np.log10(alpha_list)
plt.plot(log_alphas, train_score, label='train')
plt.plot(log_alphas, test_score, label='test')
plt.xlabel('log10(alpha)')
plt.ylabel('R^2')
plt.title('Lasso: R^2 vs. alpha')
plt.legend()
plt.show()

In [None]:
# Experiment 1: Lasso with alpha=0.01 and a larger iteration budget.
lasso1= Lasso(alpha=0.01, max_iter=100000).fit(train_X_ml, train_y)

# The values printed below come from .score(), i.e. R^2 for a regressor.
print("훈련 세트의 정확도 : {:.2f}".format(lasso1.score(train_X_ml, train_y)))

print("테스트 세트의 정확도 : {:.2f}".format(lasso1.score(test_X_ml, test_y)))

# Number of features with a non-zero coefficient.
print("사용한 특성의 수 : {}".format(np.sum(lasso1.coef_ != 0)))

# Number of iterations the solver actually ran.
print("사용한 max_iter : {}".format(lasso1.n_iter_))

In [None]:
# --> no visible difference between the settings tried above, so the default values are used
model_lasso=Lasso()

In [None]:
# Fit the default Lasso on the training split.
# The original line had an unbalanced ")" (SyntaxError) and passed eval_set,
# which the scikit-learn linear models' fit() does not accept.
model_lasso.fit(train_X_ml, train_y)

In [None]:
# Predict monthly_gain for the held-out test split.
pred_lasso = model_lasso.predict(test_X_ml)
pred_lasso

## 7. NLP

In [None]:
from keras.preprocessing.text import Tokenizer

# Build the vocabulary from the concatenated shop text.
# NOTE(review): the tokenizer is fitted on all of input_data, which includes
# the rows that later form the validation and test splits.
tk = Tokenizer()
tk.fit_on_texts(input_data['concat_text'])

# First 20 (word, index) pairs and the vocabulary size.
print(list(tk.word_index.items())[:20])
print("\nvocab words 개수 : ", len(tk.word_index.items()))

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Convert every text to a list of word indices.
seq_data = tk.texts_to_sequences(input_data['concat_text'])
print("seq_data[0]: ", seq_data[0])

# Pad all sequences to a common length.
pad_seq_data = pad_sequences(seq_data)
print("pad_seq_data.shpae: ", pad_seq_data.shape)

# Length of one padded sequence; reused as the model's input length.
nlp_input_length = pad_seq_data[0].shape[0]
print("nlp_input_length", nlp_input_length)

In [None]:
def word_embedding(df, nlp_input_length) :
    """Turn texts into padded index sequences.

    `df` is passed to the notebook-level tokenizer `tk`; the resulting index
    sequences are padded with `nlp_input_length` as the second positional
    argument of `pad_sequences`, and the padded array is returned.
    """
    index_sequences = tk.texts_to_sequences(df)
    return pad_sequences(index_sequences, nlp_input_length)

# Padded index sequences for each split; DL inputs carry the `_dl` suffix.
train_X_dl = word_embedding(train_X['concat_text'], nlp_input_length)
valid_X_dl = word_embedding(valid_X['concat_text'], nlp_input_length)
test_X_dl = word_embedding(test_X['concat_text'], nlp_input_length)

# Sanity check: each pair of lines should print the same row count.
print(len(train_X))
print(len(train_X_dl))
print(len(valid_X))
print(len(valid_X_dl))
print(len(test_X))
print(len(test_X_dl))   

In [None]:
from keras import Sequential
from keras.layers import *
import tensorflow as tf
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [None]:
def create_model(input_dim, output_dim, input_length=nlp_input_length, hidden_units=64) :
    """Build and compile the text-embedding regression network.

    Parameters
    ----------
    input_dim : vocabulary size passed to the Embedding layer.
    output_dim : embedding vector size.
    input_length : length of each padded input sequence. The default is the
        value of the notebook-level `nlp_input_length` at definition time.
    hidden_units : width of the hidden dense layer.

    Returns
    -------
    A compiled Sequential model with a single linear output unit.

    Changes relative to the previous version:
    - `input_length` is now actually used (the global was read instead).
    - The target (monthly_gain) is one continuous value per row, so the
      output layer has exactly one linear unit instead of one unit per
      distinct target value.
    - Loss and metric are regression ones (MSE / MAE) instead of
      binary_crossentropy / accuracy.
    - The hidden layer is no longer a single ReLU unit, which squeezed the
      whole embedding through one number.
    """
    model = Sequential()
    model.add(Embedding(input_dim, output_dim, input_length = input_length))
    model.add(Flatten())
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dense(1, activation='linear'))

    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model

In [None]:
# Vocabulary size plus one extra index.
# NOTE(review): presumably the +1 accounts for the padding index 0 - confirm.
input_dim = len(tk.word_index) + 1
# Size of each embedding vector.
output_dim = 10

model_dl = create_model(input_dim, output_dim)
model_dl.summary()

In [None]:
# Train for 10 epochs with batch size 64; the validation split is evaluated
# after every epoch and the training history is kept in `hist`.
hist = model_dl.fit(train_X_dl, train_y, validation_data=(valid_X_dl, valid_y), epochs=10, batch_size=64)

In [None]:
# Predictions of the deep-learning model for the test split.
pred_dl= model_dl.predict(test_X_dl)
pred_dl

## Evaluation

#### prediction을 확인 후 코드 재작성이 필요

In [None]:
# `regression_report` is not provided by sklearn.metrics, so importing it
# raised ImportError before any of the helper functions below were defined.
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

def show_pred(test_y, pred) :
    """Return a two-column frame pairing true targets with predictions.

    Parameters
    ----------
    test_y : pandas object with a ``to_numpy()`` method holding the true values.
    pred : predictions, either 1-D or a column vector of shape (n, 1).

    Returns
    -------
    DataFrame with columns ``true_y`` and ``prediction``.
    """
    true_y = np.ravel(test_y.to_numpy())

    # A column vector (n, 1) would otherwise put a 1-element array into every
    # cell of the `prediction` column (object dtype); flatten it to scalars.
    pred = np.asarray(pred)
    if pred.ndim == 2 and pred.shape[1] == 1 :
        pred = pred.ravel()

    df_result = pd.DataFrame(list(zip(true_y, pred)), columns=['true_y', 'prediction'])
    return df_result

def show_mse_rmse(test_y, pred) :
    """Print the mean squared error and its square root for the predictions."""
    mse = mean_squared_error(test_y, pred)
    rmse = np.sqrt(mse)

    print("mse : %f" % mse)
    print("rmse: %f \n" %rmse)
    
def show_mae(test_y,pred):
    """Print the mean absolute error of the predictions."""
    print("mae : %f" % mean_absolute_error(test_y, pred))

def show_r2_score(test_y, pred, test_X_ml) : 
    """Print R^2 and adjusted R^2 of the predictions.

    Parameters
    ----------
    test_y : true target values.
    pred : predicted values.
    test_X_ml : feature matrix the predictions were made from; only its
        shape (rows, columns) is used for the adjustment.
    """
    # r2_score expects (y_true, y_pred); the score is not symmetric, so the
    # previously swapped order reported a different number.
    r2 = r2_score(test_y, pred)
    print("r2 : %f " % r2)

    # Use the passed-in matrix for both n and p instead of the notebook-level
    # `test_X`, so the function depends only on its arguments.
    n_samples, n_features = test_X_ml.shape[0], test_X_ml.shape[1]
    adj_r2 = 1 - (1-r2)*(n_samples-1)/(n_samples-n_features-1)
    print("adj_r2_score : %f \n" % adj_r2)


In [None]:
# True vs. predicted values for the deep-learning model (pred_dl).
# NOTE(review): this cell was labelled "XGB" but it passes pred_dl; use
# pred_xgb here if the XGBoost result is what should be shown.
show_pred(test_y, pred_dl)

In [None]:
# Error metrics for the deep-learning predictions.
# NOTE(review): test_X_ml is passed for the adjusted R^2 although pred_dl was
# produced from test_X_dl - confirm which feature count is intended.
show_mse_rmse(test_y, pred_dl)
show_mae(test_y,pred_dl)
show_r2_score(test_y, pred_dl, test_X_ml)