# 1. Data Load

In [None]:
import numpy as np
import pandas as pd

from scipy import stats #Analysis 
from scipy.stats import norm # Analysis
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline

In [None]:
# Make the font file below matplotlib's default font family
# (presumably so Hangul labels render correctly - confirm).
import matplotlib.font_manager as fm
from matplotlib import font_manager, rc

# Paths of every TrueType font matplotlib can locate on this machine.
fontlist = fm.findSystemFonts(fontpaths=None, fontext='ttf')

# Resolve the family name stored inside the font file, then register it.
font_path = "C:/Windows/Fonts/H2HDRM.TTF"
font = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font)

In [None]:
# Read the raw CSV; `data` stays untouched as the original copy for the rest
# of the notebook.
data = pd.read_csv(
    "C:/Users/sohee/Desktop/KW/산학연계(졸작)/산학졸작_openUP_Data/kwproja_data_big.csv",
    encoding='utf-8',
)
data.head()

# EDA 

2,927,739 rows X 9 columns

- 매장 속성 정보
  - shop_code -> 식별자 feature, input feature로는 사용하지 않지만 분류를 위해서는 사용할 수 있을 것 같음  
  - shop_name -> input feature로는 사용하지 않음 (NLP deep learning 가능성 있음)
  - longitude : 경도, latitude : 위도 -> 매장 위치 (회사 근처, 학교 근처 등 매출 영향성 있음) -> 군집화, labeling 필요
  - shop_type_big -> 15 category  -> 업종 (매출 영향성 있음)
  - shop_type_small -> 61 category

- 매출 정보
  - date -> 24 category, 201606~ 201805 까지의 data
  - monthly_gain / average_sale_price = 한달 총 판매수

- 매출 통계 정보-> X


##### monthly_gain과 average_sale_price 중 어느 것을 y값으로 둘 것인가? 
- 월매출 예측 문제로 가정하고 monthly_gain 을 y값으로 예측하는 모델 만들기

##### shop_code는 input feature에 넣어야 하는가?
- 특별한 브랜드가 y값을 결정하는 과적합 요소가 될 수 있으므로
- X 에서 shop code, shop name 제외하는 것도 방법
- 어느 위치에 어떤 업종으로 어떤 객단가인 매장을 오픈하면 월매출이 어떻게 될까? 문제
 - X: shop type big, shop type small, longitude, latitude, avg_sale_price, 
 - y: monthly_gain

In [None]:
# Show the column labels of the raw frame.
data.columns

In [None]:
# Print the per-column dtype / non-null summary of the raw frame.
data.info()

# 3. Preprocessing
- missing value 제거 
    - monthly_gain : 3,605 제거 -> 149,790
    - shop_type_big : 4,303 제거 -> 145,487 
    - gender feature : 1,187 제거 -> 144,300
- shop_code, shop_name : 식별자 feature 이므로 drop 
- date : 아직은 쓸 수 없으므로 drop
    - 여기까지 총 144,300 X 27
- shop_type_big(13), shop_type_small(367) : label encoding
- longitude, latitude : clustering을 통해 labeling 후 해당 두 열은 drop 
- MinMaxScaler 정규화 -> 정규화 column의 범위는??

In [None]:
# The raw frame (`data`) is managed separately from the working frame
# (`input_data`): original data = data, model input = input_data.
# `DataFrame.drop` returns a new frame, so `data` itself is left unchanged.
input_data = data.drop(columns=['date', 'shop_code'])

In [None]:
# Remove rows that cannot be used for modelling.
# Zero values in the numeric columns are treated as missing, and rows whose
# shop-type columns are null are removed as well. `null_index` always holds
# the index labels removed by the most recent step.

# monthly_gain == 0 -> drop
null_index = input_data.index[input_data['monthly_gain'] == 0]
print("monthly gain null : ", len(null_index))
input_data = input_data.drop(null_index)

# average_sale_price == 0 -> drop
null_index = input_data.index[input_data['average_sale_price'] == 0]
print("average sale price null : ", len(null_index))
input_data = input_data.drop(null_index)

# shop_type_big is null -> drop
null_index = input_data.index[input_data['shop_type_big'].isnull()]
print("shop type big null : ", len(null_index))
print("shop type big unique : ", input_data['shop_type_big'].nunique())
input_data = input_data.drop(null_index)

# shop_type_small is null -> drop
# These rows used to be counted but kept; a remaining NaN would later end up
# in the label encoding and in the concatenated text column.
null_index = input_data.index[input_data['shop_type_small'].isnull()]
print("shop type small null : ", len(null_index))
print("shop type small unique : ", input_data['shop_type_small'].nunique())
input_data = input_data.drop(null_index)

In [None]:
# LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Integer-encode both shop-type columns into '<column>_label'.
# A fresh encoder is fitted per column (fit_transform fits and transforms in
# one call) and its classes are printed; after the loop `le` is the encoder
# that was fitted on 'shop_type_small'.
for col in ('shop_type_big', 'shop_type_small'):
    le = LabelEncoder()
    input_data[col + '_label'] = le.fit_transform(input_data[col].tolist())
    print(le.classes_)

# Preprocessing for the NLP model:
# concat_text = shop_name + " " + shop_type_big + " " + shop_type_small
input_data['concat_text'] = (
    input_data['shop_name']
    + " " + input_data['shop_type_big']
    + " " + input_data['shop_type_small']
)

In [None]:
# Binary-encode the two categorical shop-type columns and append the encoded
# columns to the working frame (the original text columns are kept).
import category_encoders as ce

binary_cols = ["shop_type_big", "shop_type_small"]
encoder = ce.BinaryEncoder(cols=binary_cols)
df = encoder.fit_transform(input_data[binary_cols])

input_data = pd.concat([input_data, df], axis=1)

In [None]:
# Labeling - KMeans clustering
# Collapse (latitude, longitude) into one categorical location feature, `geo`.
from sklearn.cluster import KMeans

# random_state is fixed so the cluster ids written to `geo` are reproducible;
# without it every run may number (and place) the clusters differently.
kmeans = KMeans(n_clusters=9, random_state=42).fit(input_data[['latitude', 'longitude']])
print(kmeans.cluster_centers_)
print(kmeans.labels_)

input_data['geo'] = kmeans.labels_

# Plot the shops coloured by their cluster id.
sns.scatterplot(x='longitude' , y='latitude', hue="geo", data=input_data, palette="Paired")
plt.title('k-mean')

# The coordinate columns are still needed later in the notebook, so dropping
# them stays disabled here.
#input_data = input_data.drop(['longitude', 'latitude'], axis=1)

In [None]:
# average_sale_price is skewed, so it is replaced by log1p(x) = log(1 + x).
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, Normalizer

scale_cols = ['average_sale_price']
# np.log1p is applied element-wise to the selected column(s) in one call.
input_data[scale_cols] = np.log1p(input_data[scale_cols])

input_data

In [None]:
# rcParams['figure.figsize'] = 15,8
# sns.boxplot(x='shop_type_big', y='average_sale_price', data=input_data)

In [None]:
# ax = plt.subplots()
# ax = sns.distplot(input_data['monthly_gain'], hist=False)
# ax.set_title('Total Gain Density')
# ax.set_xlabel('Monthly Gain')
# ax.set_ylabel('Unit Probability')
# print(input_data['monthly_gain'].value_counts())

# 4. 타겟변수 확인
why(or when) to use log transform in ML? 
- target variable이 non-negative values 일때만 
- outlier 값들도 사용해야 하는 경우, outliers that can't be filtered out as they are important to the model.
- 현재 주어진 data도 값이 좌측에 몰려 있고 오른쪽 꼬리가 긴 (right skewed) 형태, 굉장히 극소수의 업종들만이 굉장히 큰 매출을 만들어낼 수 있는 것으로 보임 
- 어떤 column, feature가 가장 monthly_gain과 상관관계가 높을까요? 
- kaggle house price prediction 대회에서도 RMSE가 아닌 RMSLE를 사용함 -> log를 씌운 형태인데 target variable인 집값의 범위가 넓기 때문

- Skewness: The longer the right tail, the more positive the skewness; a longer left tail gives a negative skewness.
- Kurtosis: If the kurtosis value (K) is close to 3, the distribution is close to the normal distribution. If K < 3, the distribution is flatter than the normal distribution; if K > 3, it is more sharply peaked than the normal distribution. Note: pandas `Series.kurt()` returns excess kurtosis (Fisher's definition, normal = 0), so compare its output with 0 rather than 3.

In [None]:
# print("Skewness: %f" % input_data['monthly_gain'].skew())
# print("Kurtosis: %f" % input_data['monthly_gain'].kurt())

In [None]:
# plots=pd.DataFrame()
# plots['original']=input_data['monthly_gain']
# plots['transformed']=np.log1p(input_data['monthly_gain'])
# plots['backToOriginal']=np.expm1(np.log1p(input_data['monthly_gain']))

# fig, ax = plt.subplots(1,3,figsize=(15,5))
# sns.distplot(plots['original'], ax=ax[0]);
# sns.distplot(plots['transformed'], ax=ax[1]);
# sns.distplot(plots['backToOriginal'], ax=ax[2]);

In [None]:
# fig = plt.figure(figsize = (15,10))

# fig.add_subplot(1,2,1)
# res = stats.probplot(input_data['monthly_gain'], plot=plt)

# fig.add_subplot(1,2,2)
# res = stats.probplot(np.log1p(input_data['monthly_gain']), plot=plt)

# 5. Data split

In [None]:
# pandas display option.
# When monthly_gain is hard to read because of the default float formatting,
# keep the set_option line active and the reset_option line commented out;
# to restore the default formatting, swap which of the two lines is commented.
pd.set_option('display.float_format', '{:.2f}'.format)
#pd.reset_option('display.float_format')

input_data.head()

In [None]:
# Summary statistics of monthly_gain per encoded big shop type.
gain_by_big_type = input_data[['shop_type_big_label', 'monthly_gain']]
group_big_gain = gain_by_big_type.groupby('shop_type_big_label')
# describe() yields a two-level column index; the outer 'monthly_gain' level is
# removed so the statistics can be addressed by name ('min', 'mean', ...).
df_group_big_gain = group_big_gain.describe().droplevel(level=0, axis=1)

# Print the shop-type labels ranked (descending) by min, mean and max gain.
print("내림차순 기준 분야별로 정렬하여 출력합니다 - monthly_gain")
for label, stat in (("min: ", 'min'), ("mean:", 'mean'), ("max: ", 'max')):
    ranked = df_group_big_gain.sort_values(by=[stat], ascending=False)
    print(label, list(ranked.index))

df_group_big_gain

In [None]:
# Separate the features (everything except monthly_gain) from the target.
input_data_X = input_data.drop(columns=['monthly_gain'])
input_data_y = input_data['monthly_gain'].copy()

In [None]:
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV

# Both splits use the same settings: hold out 20 %, shuffle, fixed seed.
split_options = {'test_size': 0.2, 'random_state': 42, 'shuffle': True}

# First split: (train + valid) vs. test.
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    input_data_X, input_data_y, **split_options
)

# Second split: train vs. valid, taken from the (train + valid) part, which
# leaves roughly 64 % / 16 % / 20 % of the rows for train / valid / test.
train_X, valid_X, train_y, valid_y = train_test_split(
    tr_val_X, tr_val_y, **split_options
)

# 6. Modeling
- XGB
    - 타겟변수 정규화 실행 
- LGBM
    - loss parameter : tweedie 
- Lasso
- Ridge

In [None]:
from sklearn.model_selection import train_test_split, cross_validate, KFold, TimeSeriesSplit,GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from keras.callbacks import EarlyStopping, ModelCheckpoint

from lightgbm import LGBMRegressor
from lightgbm import plot_importance 
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.linear_model import Ridge

In [None]:
# Frames used by the classical ML models carry the `_ml` suffix.
# Raw text, raw coordinates and the integer label columns are removed from them.
non_ml_columns = ['shop_name', 'shop_type_big', 'shop_type_small', 'longitude',
                  'latitude', 'concat_text', 'shop_type_big_label', 'shop_type_small_label']

train_X_ml = train_X.drop(columns=non_ml_columns).copy()
valid_X_ml = valid_X.drop(columns=non_ml_columns).copy()
test_X_ml = test_X.drop(columns=non_ml_columns).copy()

# Sanity check: each `_ml` frame must have as many rows as its source frame.
for full_frame, ml_frame in ((train_X, train_X_ml), (valid_X, valid_X_ml), (test_X, test_X_ml)):
    print(len(full_frame))
    print(len(ml_frame))

In [None]:
# Display the training features prepared for the ML models.
train_X_ml

In [None]:
# XGB, LGBM
# Create the XGBoost regressor with the library's default hyperparameters;
# the LightGBM counterpart is currently disabled.
model_xgb = XGBRegressor()
#model_lgbm = LGBMRegressor()

In [None]:
# Ridge model with built-in search over the regularisation strength.
from sklearn.linear_model import RidgeCV  # unlike Ridge, takes a list of candidate alphas
# mean_absolute_error is used below but is not imported anywhere else in the notebook.
from sklearn.metrics import mean_absolute_error

alphas = [0, 0.001, 0.01, 0.1, 1]

# RidgeCV receives the candidate alpha values as a list and selects the best
# one internally. cv=5: 5-fold cross-validation; the alpha with the best
# cross-validated score is kept.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 - on newer versions scale the features in a Pipeline instead.
ridgecv = RidgeCV(alphas=alphas, normalize=True, cv=5)
ridgecv.fit(train_X_ml, train_y)
ridgecv_pred = ridgecv.predict(test_X_ml)

# Hold-out evaluation on the test split.
mae = mean_absolute_error(test_y, ridgecv_pred)
r2 = r2_score(test_y, ridgecv_pred)
print(f'Test MAE: ${mae:,.0f}')
print(f'R2 Score: {r2:,.4f}\n')

print(f'alpha: {ridgecv.alpha_}')  # alpha that was finally selected
print(f'cv best score: {ridgecv.best_score_}')  # score reached with that alpha

In [None]:
# #위의 alpha값 넣어준 후 학습 진행하기
# model_ridge=Ridge(alpha=0.01)

In [None]:
# model_ridge.fit(train_X_ml,train_y),eval_set=[(valid_X_ml,valid_y)])

In [None]:
# Fit XGBoost on the training split; the validation split is supplied as
# eval_set so it is evaluated while the model trains.
model_xgb.fit(train_X_ml,train_y,eval_set=[(valid_X_ml,valid_y)])

In [None]:
# Predict monthly_gain for the held-out test split and display the result.
# The LightGBM / Ridge predictions are disabled together with their models.
pred_xgb = model_xgb.predict(test_X_ml)
pred_xgb
#pred_lgbm = model_lgbm.predict(test_X_ml)
#pred_ridge=model_ridge.predict(test_X_ml)

## DL, NLP

In [None]:
from keras.preprocessing.text import Tokenizer

# Build the word -> index vocabulary from every concatenated shop text.
tk = Tokenizer()
tk.fit_on_texts(input_data['concat_text'])

# Peek at the first 20 vocabulary entries and report the vocabulary size.
vocab = tk.word_index
print(list(vocab.items())[:20])
print("\nvocab words 개수 : ", len(vocab))

In [None]:
# Determine the NLP input length: the longest token sequence produced from
# concat_text (shop_name + shop_type_big + shop_type_small).
from keras.preprocessing.sequence import pad_sequences

seq_data = tk.texts_to_sequences(input_data['concat_text'])
print("seq_data[0]: ", seq_data[0])

# Without an explicit maxlen every sequence is padded to the longest one.
pad_seq_data = pad_sequences(seq_data)
print("pad_seq_data.shpae: ", pad_seq_data.shape)

# The second axis of the padded matrix is the per-sample sequence length.
nlp_input_length = pad_seq_data.shape[1]
print("nlp_input_length", nlp_input_length)

In [None]:
#pad_seq_data

In [None]:
def word_embedding(df, nlp_input_length):
    """Convert texts into padded integer sequences.

    Args:
        df: iterable of texts (the notebook passes the 'concat_text' column).
        nlp_input_length: length every sequence is padded / truncated to.

    Returns:
        The padded sequence matrix produced by ``pad_sequences``. The texts
        are indexed with the notebook-level tokenizer ``tk``.
    """
    return pad_sequences(tk.texts_to_sequences(df), maxlen=nlp_input_length)

# Padded token sequences for each split (inputs of the deep-learning model).
train_X_dl = word_embedding(train_X['concat_text'], nlp_input_length)
valid_X_dl = word_embedding(valid_X['concat_text'], nlp_input_length)
test_X_dl = word_embedding(test_X['concat_text'], nlp_input_length)

# Sanity check: each sequence matrix must have as many rows as its source frame.
for source_frame, sequences in ((train_X, train_X_dl), (valid_X, valid_X_dl), (test_X, test_X_dl)):
    print(len(source_frame))
    print(len(sequences))

In [None]:
from keras import Sequential
from keras.layers import *
import tensorflow as tf
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

def create_model(input_dim, output_dim, input_length=nlp_input_length):
    """Build and compile the embedding-based regression network.

    Args:
        input_dim: vocabulary size handed to the Embedding layer.
        output_dim: size of each learned embedding vector.
        input_length: length of the padded token sequences fed to the model.

    Returns:
        A compiled ``Sequential`` model with one linear output unit,
        trained with mean squared error and reporting mean absolute error.
    """
    model = Sequential()

    # 1. Learn the word embedding. The `input_length` argument is honoured
    #    (it used to be ignored in favour of the notebook-level global).
    model.add(Embedding(input_dim, output_dim, input_length=input_length))

    # 2. Regression head. The target (monthly_gain) is a continuous value, so
    #    the network ends in a single linear unit.
    model.add(Flatten())
    # NOTE(review): a one-unit hidden layer is a severe bottleneck; kept as in
    # the original architecture - consider widening it.
    model.add(Dense(1, activation='relu'))
    model.add(Dense(1, activation='linear'))

    # A regression loss/metric pair replaces binary_crossentropy/accuracy,
    # which are only meaningful for classification targets.
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model


# Embedding sizes: one row per vocabulary word plus one extra index (see the
# inline note), and 10 dimensions per embedding vector.
input_dim = len(tk.word_index) + 1 # vocab size + padding index =114,537 + 1 = 114,538
output_dim = 10

# Build the network and print its layer summary.
model_dl = create_model(input_dim, output_dim)
model_dl.summary()

In [None]:
# Train for 10 epochs with batches of 64, evaluating on the validation split;
# the object returned by fit() is kept in `hist`.
hist = model_dl.fit(train_X_dl, train_y, validation_data=(valid_X_dl, valid_y), epochs=10, batch_size=64)

In [None]:
# import matplotlib.pyplot as plt
# plt.plot(hist.history['loss'], label='loss')
# plt.plot(hist.history['val_loss'], label='test loss')
# plt.xticks(range(len(hist.history['loss'])))
# plt.xlabel('epoch')
# plt.ylabel('precision') 
# plt.legend(loc='lower right')
# plt.grid(True)
# plt.show()

In [None]:
# Predict on the test-split sequences and display the raw model output.
pred_dl= model_dl.predict(test_X_dl)
pred_dl

# Evaluation

In [None]:
# `regression_report` does not exist in sklearn.metrics and made this import
# fail; the regression metrics are imported in its place.
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, mean_absolute_error, mean_squared_error, r2_score

def show_pred(test_y, pred):
    """Return a frame pairing each true target with its prediction.

    Args:
        test_y: pandas object holding the true targets (flattened to 1-D).
        pred: iterable of predictions, in the same order as ``test_y``.

    Returns:
        DataFrame with the columns 'true_y' and 'prediction'; pairing stops
        at the shorter of the two inputs.
    """
    actual = np.ravel(test_y.to_numpy())
    paired = [(truth, guess) for truth, guess in zip(actual, pred)]
    return pd.DataFrame(paired, columns=['true_y', 'prediction'])
    
def show_prediction_error(test_y, pred):
    """Draw a 25-bin histogram of the errors ``pred - test_y``.

    Args:
        test_y: pandas object holding the true targets (flattened to 1-D).
        pred: predictions, subtracted element-wise from the true targets.
    """
    residuals = pred - np.ravel(test_y.to_numpy())
    plt.hist(residuals, bins=25)
    plt.xlabel("Prediction Error")
    plt.ylabel("Count")
    
def feature_importance(model_xgb):
    """Plot the feature importances of a fitted model on a 10x12 figure.

    Args:
        model_xgb: fitted model handed to ``plot_importance``.

    The `%matplotlib inline` magic that used to sit here was removed: it is
    IPython syntax rather than Python, and the notebook already enables the
    inline backend in its first cell.
    """
    # Keep the minus sign renderable after switching to a non-default font.
    plt.rcParams['axes.unicode_minus'] = False
    font_path = "C:/Windows/Fonts/NGULIM.TTF"
    font = fm.FontProperties(fname=font_path).get_name()
    rc('font', family=font)

    fig, ax = plt.subplots(figsize=(10, 12))
    # NOTE(review): `plot_importance` is imported from both lightgbm and
    # xgboost in the modelling cell; the xgboost import comes last, so that is
    # the one bound here.
    plot_importance(model_xgb, ax=ax)
    
def graph(pred, test_label):
    """Plot the actual values and the predictions as two lines on one figure.

    Args:
        pred: predicted values, drawn with the label 'prediction'.
        test_label: true values, drawn first with the label 'actual'.
    """
    plt.figure(figsize=(16, 9))
    for series, name in ((test_label, 'actual'), (pred, 'prediction')):
        plt.plot(series, label=name)
    plt.legend()
    plt.show()

In [None]:
# sklearn.metrics provides no `regression_report`; the standard regression
# metrics are reported for the deep-learning predictions instead.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Flatten an (n, 1) column of predictions to 1-D so it lines up with test_y.
# A model that emits more than one value per row must be reduced first.
pred_dl_flat = np.ravel(pred_dl)
print(f'MAE : {mean_absolute_error(test_y, pred_dl_flat):,.2f}')
print(f'RMSE: {np.sqrt(mean_squared_error(test_y, pred_dl_flat)):,.2f}')
print(f'R2  : {r2_score(test_y, pred_dl_flat):,.4f}')

In [None]:
#show_pred(test_y, pred_xgb)

In [None]:
#show_pred(test_y, pred_lgbm)

In [None]:
#show_pred(test_y, pred_ridge)

In [None]:
# show_mse_rmse(test_y, pred_xgb)
# show_r2_score(test_y, pred_xgb, test_X_ml)
# show_mae(test_y,pred_xgb)

In [None]:
# show_mse_rmse(test_y, pred_lgbm)
# show_r2_score(test_y, pred_lgbm, test_X_ml)
# show_mae(test_y,pred_lgbm)

In [None]:
# show_mse_rmse(test_y, pred_ridge)
# show_r2_score(test_y, pred_ridge, test_X_ml)
# show_mae(test_y,pred_ridge)