# KNN_SYVM_프로모션_효율예측분석

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns

In [67]:
mem = pd.read_csv("./data/member.csv")
tran = pd.read_csv("./data/transaction.csv")

* 쇼핑몰 데이터, 프로모션 쿠폰을 발행하고 사용 여부 데이터 수집
* mem: 고객 id, 최근 방문일, 사는 지역, 추천여부, 주요접속채널, 쿠폰 사용 여부(target)
* tran : 고객id, 구매수량, 총 구매 금액
* 전통적 마케팅 분석방법 RFM 기법을 활용해 고객 데이터에서 파생변수 생성 후 분석
* R: Recency: 현재일 - 최근 구매일
* F: Frequency : 구매빈도
* M: Monetary: 구매 금액
* 종속변수: conversion => 고객이 프로모션에 반응 했는가? 1=yes, 0=no

In [30]:
mem.head(2)

Unnamed: 0,id,recency,zip_code,is_referral,channel,conversion
0,906145,10,Surburban,0,Phone,0
1,184478,6,Rural,1,Web,0


In [31]:
tran.head(20)

Unnamed: 0,id,num_item,total_amount
0,906145,5,34000
1,906145,1,27000
2,906145,4,33000
3,184478,4,29000
4,394235,4,33000
5,394235,4,8000
6,130152,2,11000
7,130152,1,22000
8,130152,1,23000
9,130152,3,27000


In [5]:
mem.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64000 entries, 0 to 63999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           64000 non-null  int64 
 1   recency      64000 non-null  int64 
 2   zip_code     64000 non-null  object
 3   is_referral  64000 non-null  int64 
 4   channel      64000 non-null  object
 5   conversion   64000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 2.9+ MB


In [6]:
tran.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196836 entries, 0 to 196835
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   id            196836 non-null  int64
 1   num_item      196836 non-null  int64
 2   total_amount  196836 non-null  int64
dtypes: int64(3)
memory usage: 4.5 MB


In [32]:
tran.loc[tran['id'] == 100001]

Unnamed: 0,id,num_item,total_amount
7456,100001,3,24000
7457,100001,4,28000


In [53]:
tran.value_counts()

id      num_item  total_amount
388503  1         16000           3
362633  2         25000           3
837062  1         29000           3
138844  4         17000           3
776672  1         23000           3
                                 ..
550832  3         35000           1
100113  5         26000           1
550832  5         27000           1
999881  4         24000           1
550835  1         28000           1
Name: count, Length: 194954, dtype: int64

In [64]:
total = tran.groupby(['id'])['total_amount'].agg(['sum','count']).reset_index()

In [68]:
total.columns = ['id', 'amt_sum', 'amt_count']

In [70]:
mem = pd.merge(mem, total, how='left', on= 'id')

In [26]:
mem['total_amount'] = 

In [87]:
mem

Unnamed: 0,id,recency,zip_code,is_referral,channel,conversion,amt_sum,amt_count
0,906145,10,Surburban,0,Phone,0,94000,3
1,184478,6,Rural,1,Web,0,29000,1
2,394235,7,Surburban,1,Web,0,41000,2
3,130152,9,Rural,1,Web,0,83000,4
4,940352,2,Urban,0,Web,0,31000,1
...,...,...,...,...,...,...,...,...
63995,838295,10,Urban,0,Web,0,104000,4
63996,547316,5,Urban,1,Phone,0,89000,5
63997,131575,6,Urban,1,Phone,0,61000,2
63998,603659,1,Surburban,1,Multichannel,0,108000,5


In [72]:
mem.loc[mem['id'] == 100001]

Unnamed: 0,id,recency,zip_code,is_referral,channel,conversion,amt_sum,amt_count
2896,100001,6,Urban,1,Phone,0,52000,2


In [89]:
obj_cols = mem[['zip_code','channel']]
num_cols = mem.select_dtypes(exclude=['object', 'category'])
obj_cols

Unnamed: 0,zip_code,channel
0,Surburban,Phone
1,Rural,Web
2,Surburban,Web
3,Rural,Web
4,Urban,Web
...,...,...
63995,Urban,Web
63996,Urban,Phone
63997,Urban,Phone
63998,Surburban,Multichannel


In [90]:
num_cols.columns

Index(['id', 'recency', 'is_referral', 'conversion', 'amt_sum', 'amt_count'], dtype='object')

In [91]:
mem.columns

Index(['id', 'recency', 'zip_code', 'is_referral', 'channel', 'conversion',
       'amt_sum', 'amt_count'],
      dtype='object')

In [86]:
for i in obj_cols:
    print('='*30,i,'='*30)
    print(mem[i].nunique(), mem[i].unique())
    print()
    print(mem[i].value_counts())
    print()
    print(mem.groupby(i)['conversion'].mean().sort_values(ascending=False))

3 ['Surburban' 'Rural' 'Urban']

zip_code
Surburban    28776
Urban        25661
Rural         9563
Name: count, dtype: int64

zip_code
Rural        0.188121
Surburban    0.139943
Urban        0.139044
Name: conversion, dtype: float64
3 ['Phone' 'Web' 'Multichannel']

channel
Web             28217
Phone           28021
Multichannel     7762
Name: count, dtype: int64

channel
Multichannel    0.171734
Web             0.159407
Phone           0.127155
Name: conversion, dtype: float64


In [92]:
for i in num_cols:
    print('='*30,i,'='*30)
    print(mem[i].describe())
    print()
    print(mem[i].value_counts())
    print()
    print(mem.groupby(i)['conversion'].mean().sort_values(ascending=False))

count     64000.000000
mean     550694.137797
std      259105.689773
min      100001.000000
25%      326772.000000
50%      551300.000000
75%      774914.500000
max      999997.000000
Name: id, dtype: float64

id
197686    1
331595    1
448925    1
260497    1
561234    1
         ..
940352    1
130152    1
394235    1
184478    1
906145    1
Name: count, Length: 64000, dtype: int64

id
539439    1.0
941253    1.0
539390    1.0
539371    1.0
539340    1.0
         ... 
551242    0.0
999990    0.0
551265    0.0
999997    0.0
551294    0.0
Name: conversion, Length: 64000, dtype: float64
count    64000.000000
mean         5.763734
std          3.507592
min          1.000000
25%          2.000000
50%          6.000000
75%          9.000000
max         12.000000
Name: recency, dtype: float64

recency
1     8952
10    7565
2     7537
9     6441
3     5904
4     5077
6     4605
5     4510
7     4078
11    3504
8     3495
12    2332
Name: count, dtype: int64

recency
1     0.193029
2     0.177

In [75]:
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


In [76]:
profile = ProfileReport(mem, title="mem_EDA")

In [77]:
profile.to_file("./mem_EDA.html")

Summarize dataset:   0%| | 0/13 [00:00<?, ?it/s, Describe variable: is_referral]
Summarize dataset:  38%|▍| 5/13 [00:00<00:01,  7.42it/s, Describe variable: amt_[A
100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 31.20it/s][A
Summarize dataset: 100%|█████████████| 26/26 [00:02<00:00,  9.63it/s, Completed]
Generate report structure: 100%|██████████████████| 1/1 [00:02<00:00,  2.90s/it]
Render HTML: 100%|████████████████████████████████| 1/1 [00:00<00:00,  1.65it/s]
Export report to file: 100%|█████████████████████| 1/1 [00:00<00:00, 283.38it/s]


In [93]:
mem

Unnamed: 0,id,recency,zip_code,is_referral,channel,conversion,amt_sum,amt_count
0,906145,10,Surburban,0,Phone,0,94000,3
1,184478,6,Rural,1,Web,0,29000,1
2,394235,7,Surburban,1,Web,0,41000,2
3,130152,9,Rural,1,Web,0,83000,4
4,940352,2,Urban,0,Web,0,31000,1
...,...,...,...,...,...,...,...,...
63995,838295,10,Urban,0,Web,0,104000,4
63996,547316,5,Urban,1,Phone,0,89000,5
63997,131575,6,Urban,1,Phone,0,61000,2
63998,603659,1,Surburban,1,Multichannel,0,108000,5


In [94]:
mem.columns

Index(['id', 'recency', 'zip_code', 'is_referral', 'channel', 'conversion',
       'amt_sum', 'amt_count'],
      dtype='object')

In [96]:
mem = pd.get_dummies(mem, drop_first= True)

In [98]:
X = mem.drop('conversion', axis=1)
y= mem['conversion']

In [99]:
X

Unnamed: 0,id,recency,is_referral,amt_sum,amt_count,zip_code_Surburban,zip_code_Urban,channel_Phone,channel_Web
0,906145,10,0,94000,3,True,False,True,False
1,184478,6,1,29000,1,False,False,False,True
2,394235,7,1,41000,2,True,False,False,True
3,130152,9,1,83000,4,False,False,False,True
4,940352,2,0,31000,1,False,True,False,True
...,...,...,...,...,...,...,...,...,...
63995,838295,10,0,104000,4,False,True,False,True
63996,547316,5,1,89000,5,False,True,True,False
63997,131575,6,1,61000,2,False,True,True,False
63998,603659,1,1,108000,5,True,False,False,False


In [101]:
y.value_counts()

conversion
0    54606
1     9394
Name: count, dtype: int64

In [102]:
9394/(9394+54606)*100

14.678125

In [104]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score,classification_report

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4 , stratify=y , random_state=10)

In [110]:
dtc = DecisionTreeClassifier( class_weight='balanced', random_state=10 )
dtc.fit(X_train, y_train)
pred = dtc.predict(X_test)
print("=" * 30, "valid_result", "=" * 30)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.7743359375
              precision    recall  f1-score   support

           0       0.86      0.87      0.87     21842
           1       0.22      0.21      0.21      3758

    accuracy                           0.77     25600
   macro avg       0.54      0.54      0.54     25600
weighted avg       0.77      0.77      0.77     25600



In [111]:
lxgb = LGBMClassifier( class_weight='balanced',n_jobs=10, random_state=10 )
lxgb.fit(X_train, y_train)
pred = lxgb.predict(X_test)
print("=" * 30, "valid_result", "=" * 30)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

[LightGBM] [Info] Number of positive: 5636, number of negative: 32764
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 38400, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
0.6187109375
              precision    recall  f1-score   support

           0       0.92      0.61      0.73     21842
           1       0.23      0.69      0.35      3758

    accuracy                           0.62     25600
   macro avg       0.58      0.65      0.54     25600
weighted avg       0.82      0.62      0.67     25600



In [None]:
lxgb = LGBMClassifier( class_weight='balanced',n_jobs=10, random_state=42 )
rfc.fit(X_train, y_train)
pred = rfc.predict(X_valid)
print("=" * 30, "valid_result", "=" * 30)
print(classification_report(y_valid, pred))
print("=" * 30, "valid_result", "=" * 30)
test_pred = dtc.predict(X_test)
print(classification_report(y_test, test_pred))

# KNN, SVN

In [73]:
from sklearn.impute import KNNImputer
from 