# 어떤 고객들이 떠나가고 있는가?

## 1. Loading data

#### 1) 필요한 라이브러리들을 호출

In [1]:
#데이터 분석에 필요한 패키지들을 파이썬으로 호출
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#### 2) 필요 데이터 경로 지정

In [2]:
# Build the paths of the two input CSV files (the DAU file and the user-info
# file). Nothing is read here. expanduser() only rewrites a leading "~", so
# these relative paths come back unchanged.
dauPath, userInfo = (
    os.path.expanduser(relative_path)
    for relative_path in (
        "Week10Data/week10-dau.csv",
        "Week10Data/week10-user_info.csv",
    )
)

#### 3) 각 변수별로 데이터 할당 

In [3]:
# Load each CSV file into its own DataFrame. Both files are comma-separated,
# which is read_csv's default delimiter.
dau = pd.read_csv(dauPath)
user = pd.read_csv(userInfo)

#### 4) 호출된 데이터 정보 확인

In [None]:
# Print the column / dtype / non-null summary of the DAU table, then display
# its first five rows as the cell output.
dau.info()
dau.head(5)

In [None]:
# Print the column / dtype / non-null summary of the user table, then display
# its first five rows as the cell output.
user.info()
user.head(5)

## 2. Data cleaning

#### 1) 각 데이터들을 결합

In [4]:
# Attach the user attributes to every DAU row by joining on the
# (user_id, app_name) pair. This is an inner join (the pandas default), so
# rows whose key is missing from either table are dropped.
DauUser = dau.merge(
    user,
    how="inner",
    on=["user_id", "app_name"],
)
DauUser.info()
DauUser.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170398 entries, 0 to 170397
Data columns (total 7 columns):
log_date        170398 non-null object
app_name        170398 non-null object
user_id         170398 non-null int64
install_date    170398 non-null object
gender          170398 non-null object
generation      170398 non-null int64
device_type     170398 non-null object
dtypes: int64(2), object(5)
memory usage: 10.4+ MB


Unnamed: 0,log_date,app_name,user_id,install_date,gender,generation,device_type
0,2017-08-01,game-01,33754,2017-08-01,M,20,iOS
1,2017-08-02,game-01,33754,2017-08-01,M,20,iOS
2,2017-08-01,game-01,28598,2017-07-16,M,50,iOS
3,2017-08-04,game-01,28598,2017-07-16,M,50,iOS
4,2017-08-12,game-01,28598,2017-07-16,M,50,iOS


#### 2) 월별 정렬을 위한 문자 처리

In [5]:
# Keep the first seven characters of log_date (e.g. "2017-08-01" -> "2017-08")
# in a separate column so rows can be aggregated by month.
DauUser['log_date_M'] = DauUser['log_date'].str.slice(stop=7)
DauUser.head(5)

Unnamed: 0,log_date,app_name,user_id,install_date,gender,generation,device_type,log_date_M
0,2017-08-01,game-01,33754,2017-08-01,M,20,iOS,2017-08
1,2017-08-02,game-01,33754,2017-08-01,M,20,iOS,2017-08
2,2017-08-01,game-01,28598,2017-07-16,M,50,iOS,2017-08
3,2017-08-04,game-01,28598,2017-07-16,M,50,iOS,2017-08
4,2017-08-12,game-01,28598,2017-07-16,M,50,iOS,2017-08


## 3. Data analysis

#### 1) 크로스 집계를 이용한 분석

#### 성별에 따른 접속일자

In [6]:
# Count log rows for each month (table rows) and gender (table columns).
pd.crosstab(index=DauUser["log_date_M"], columns=DauUser["gender"])

gender,F,M
log_date_M,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08,47358,46849
2017-09,38041,38150


#### 연령에 따른 접속일자

In [7]:
# Count log rows for each month (table rows) and generation (table columns).
pd.crosstab(index=DauUser["log_date_M"], columns=DauUser["generation"])

generation,10,20,30,40,50
log_date_M,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-08,18786,33682,28080,8830,4829
2017-09,15392,27241,22229,7494,3835


#### 성별-연령에 따른 접속일자 - 첫 번째 방법

In [8]:
# Count log rows per month, with a two-level column index: gender on the
# outer level and generation on the inner level.
pd.crosstab(
    index=DauUser["log_date_M"],
    columns=[DauUser["gender"], DauUser["generation"]],
)

gender,F,F,F,F,F,M,M,M,M,M
generation,10,20,30,40,50,10,20,30,40,50
log_date_M,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2017-08,9091,17192,14219,4599,2257,9695,16490,13861,4231,2572
2017-09,7316,13628,11460,3856,1781,8076,13613,10769,3638,2054


#### 성별-연령에 따른 접속일자 - 두번째 방법

In [9]:
# Build the month x (gender, generation) count table, then flatten the
# two-level column index into single labels such as "F_10".
df_cross = pd.crosstab(
    DauUser['log_date_M'],
    [DauUser['gender'], DauUser['generation']],
)
df_cross.columns = ["{0}_{1}".format(*level_pair) for level_pair in df_cross.columns]
df_cross

Unnamed: 0_level_0,F_10,F_20,F_30,F_40,F_50,M_10,M_20,M_30,M_40,M_50
log_date_M,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-08,9091,17192,14219,4599,2257,9695,16490,13861,4231,2572
2017-09,7316,13628,11460,3856,1781,8076,13613,10769,3638,2054


#### 디바이스 유형에 따른 접속일자

In [10]:
# Count log rows for each month (table rows) and device type (table columns).
pd.crosstab(index=DauUser["log_date_M"], columns=DauUser["device_type"])

device_type,Android,iOS
log_date_M,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08,46990,47217
2017-09,29659,46532


## 4. Data visualization

#### 1) 디바이스 유형별 접속일자의 시각화 - 라인 그래프

In [None]:
# Daily (not monthly) counts: one row per log_date, one column per device
# type. This table feeds the plots below.
data = pd.crosstab(
    index=DauUser["log_date"],
    columns=DauUser["device_type"],
)

In [None]:
# Wrap the daily count table in a DataFrame and draw it as a line chart:
# one dashed line per column (device type), in a 15 x 5 figure.
graph = pd.DataFrame(data)
graph.plot(kind="line", figsize=(15, 5), style="--")

#### 2) 디바이스 유형별 접속일자의 시각화 - Heat Map

In [None]:
# Set the default figure size to 10 x 20 so every date row stays readable.
# NOTE: sns.set() changes matplotlib rc settings globally, so later plots are
# affected as well.
sns.set(
    rc={"figure.figsize": (10, 20)},
)
# Draw the daily device-type counts as a heat map (rows: dates, columns:
# device types).
sns.heatmap(data=data)