In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

## sample_submission_zero.csv(Testing Set)
### the test set, containing the user ids, in the format that we expect you to submit
#### is_churn: This is what you will predict. Churn is defined as whether the user did not continue the subscription within 30 days of expiration. is_churn = 1 means 流失,is_churn = 0 means renewal.

In [2]:
kk_submission01 = pd.read_csv("sample_submission_zero.csv")

In [3]:
kk_submission01.head()

Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,0
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,0
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,0
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,0
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,0


In [4]:
kk_submission01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 970960 entries, 0 to 970959
Data columns (total 2 columns):
msno        970960 non-null object
is_churn    970960 non-null int64
dtypes: int64(1), object(1)
memory usage: 14.8+ MB


In [5]:
kk_submission01["is_churn"].unique()

array([0], dtype=int64)

In [6]:
print((kk_submission01["is_churn"] == 0).sum())
print((kk_submission01["is_churn"] == 1).sum())

970960
0


## sample_submission_v2.csv
### same format as sample_submission_zero.csv, refreshed 11/06/2017, contains the test data for April, 2017.

In [7]:
kk_submission02 = pd.read_csv("sample_submission_v2.csv")

In [8]:
kk_submission02.head()

Unnamed: 0,msno,is_churn
0,4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=,0
1,aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=,0
2,rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=,0
3,WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=,0
4,aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=,0


In [9]:
kk_submission02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907471 entries, 0 to 907470
Data columns (total 2 columns):
msno        907471 non-null object
is_churn    907471 non-null int64
dtypes: int64(1), object(1)
memory usage: 13.8+ MB


In [10]:
kk_submission02["is_churn"].unique()

array([0], dtype=int64)

In [11]:
print((kk_submission02["is_churn"] == 0).sum())
print((kk_submission02["is_churn"] == 1).sum())

907471
0


### ※小結: sample_submission與sample_submissionV2沒有流失值(? 怪怪的

## 合併submission(Testing Data)

In [12]:
kk_Test = pd.concat([kk_submission01, kk_submission02], axis=0)

In [13]:
kk_Test.head()

Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,0
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,0
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,0
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,0
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,0


In [14]:
kk_Test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1878431 entries, 0 to 907470
Data columns (total 2 columns):
msno        object
is_churn    int64
dtypes: int64(1), object(1)
memory usage: 43.0+ MB


## ======================================================================

## train.csv
### the train set, containing the user ids and whether they have churned.
#### A. msno: <font color=#0000FF> user id
#### B. is_churn: <font color=#0000FF> This is the target variable. Churn is defined as whether the user did not continue the subscription within 30 days of expiration. is_churn = 1 means churn,is_churn = 0 means renewal.

In [15]:
kk_train01 = pd.read_csv("train.csv")

In [16]:
kk_train01.head()

Unnamed: 0,msno,is_churn
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1
3,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1
4,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1


In [17]:
kk_train01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992931 entries, 0 to 992930
Data columns (total 2 columns):
msno        992931 non-null object
is_churn    992931 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.2+ MB


In [18]:
kk_train01.describe()

Unnamed: 0,is_churn
count,992931.0
mean,0.063923
std,0.244616
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [19]:
kk_train01["is_churn"].unique()

array([1, 0], dtype=int64)

In [20]:
print((kk_train01["is_churn"] == 0).sum())
print((kk_train01["is_churn"] == 1).sum())

929460
63471


## train_v2.csv
### same format as train.csv, refreshed 11/06/2017, contains the churn data for March, 2017.

In [21]:
kk_train02 = pd.read_csv("train_v2.csv")

In [22]:
kk_train02.head()

Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1


In [23]:
kk_train02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 970960 entries, 0 to 970959
Data columns (total 2 columns):
msno        970960 non-null object
is_churn    970960 non-null int64
dtypes: int64(1), object(1)
memory usage: 14.8+ MB


In [24]:
kk_train02["is_churn"].unique()

array([1, 0], dtype=int64)

In [25]:
print((kk_train02["is_churn"] == 0).sum())
print((kk_train02["is_churn"] == 1).sum())

883630
87330


## ----------------------------------------------------------------------------------------------------------------------------

## Training Data(合併train與train_v2)

In [26]:
kk_train = pd.concat([kk_train01, kk_train02], axis=0)

In [27]:
kk_train.shape

(1963891, 2)

In [28]:
kk_train.head()

Unnamed: 0,msno,is_churn
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1
3,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1
4,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1


In [29]:
print((kk_train["is_churn"] == 0).sum())
print((kk_train["is_churn"] == 1).sum())

1813090
150801


### ※小結: 流失人數 vs 續約人數比例差異較大(超過1:9)

## ====================================================================

## members.csv
### user information. Note that not every user in the dataset is available.
#### A. bd: age. Note: <font color=#0000FF> this column has outlier values ranging from -7000 to 2015, please use your judgement.
#### B. registered_via:<font color=#0000FF>  registration method
#### C. registration_init_time: <font color=#0000FF> format %Y%m%d
#### D. expiration_date: <font color=#0000FF> format %Y%m%d, taken as a snapshot at which the member.csv is extracted. Not representing the actual churn behavior.<font color=#FF3333>=>Raw Data沒看到這一項特徵

In [30]:
kk_members = pd.read_csv("members_v3.csv")

In [31]:
kk_members.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,20110914
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,20110915
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,20110915
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915


In [32]:
print(kk_members.shape)  # 共6769473筆資料
print(kk_members["gender"].unique())
print(kk_members["gender"].isnull().sum())  # 性別項空值有442萬項
print((kk_members["gender"] == "female").sum())  # 女性人數1144613
print((kk_members["gender"] == "male").sum())  # 男性人數1195355

(6769473, 6)
[nan 'female' 'male']
4429505
1144613
1195355


### ※欄位特徵於下方與TrainingData合併分析

## =======================================================================

## transactions.csv(交易資料01)
### transactions of users up until 2/28/2017.
#### A. msno: <font color=#0000FF> user id
#### B. payment_method_id:<font color=#0000FF>  payment method
#### C. payment_plan_days: <font color=#0000FF> length of membership plan in days
#### D. plan_list_price: <font color=#0000FF> in New Taiwan Dollar (NTD)
#### E. actual_amount_paid: <font color=#0000FF> in New Taiwan Dollar (NTD)
#### F. transaction_date: <font color=#0000FF> format %Y%m%d
#### G. membership_expire_date: <font color=#0000FF> format %Y%m%d
#### H. is_cancel: <font color=#0000FF> whether or not the user canceled the membership in this transaction.

In [33]:
kk_transac01 = pd.read_csv("transactions.csv")

In [34]:
kk_transac01.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20150930,20151101,0
1,AZtu6Wl0gPojrEQYB8Q3vBSmE2wnZ3hi1FbK1rQQ0A4=,41,30,149,149,1,20150930,20151031,0
2,UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns=,41,30,129,129,1,20150930,20160427,0
3,M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI=,39,30,149,149,1,20150930,20151128,0
4,yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4=,39,30,149,149,1,20150930,20151121,0


## transactions_v2.csv(交易資料02)

### same format as transactions.csv, refreshed 11/06/2017, contains the transactions data until 3/31/2017.

In [35]:
kk_transac02 = pd.read_csv("transactions_v2.csv")

In [36]:
kk_transac02.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,32,90,298,298,0,20170131,20170504,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,41,30,149,149,1,20150809,20190412,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,36,30,180,180,1,20170303,20170422,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,36,30,180,180,1,20170329,20170331,1
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,41,30,99,99,1,20170323,20170423,0


## ----------------------------------------------------------------------------------------------------------------------------

## 合併Transaction01與Transaction02(交易項目)

In [37]:
kk_transaction = pd.concat([kk_transac01, kk_transac02], axis=0)

In [38]:
kk_transaction.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20150930,20151101,0
1,AZtu6Wl0gPojrEQYB8Q3vBSmE2wnZ3hi1FbK1rQQ0A4=,41,30,149,149,1,20150930,20151031,0
2,UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns=,41,30,129,129,1,20150930,20160427,0
3,M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI=,39,30,149,149,1,20150930,20151128,0
4,yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4=,39,30,149,149,1,20150930,20151121,0


In [39]:
kk_transaction.shape # 共22978755筆資料，9個特徵

(22978755, 9)

In [40]:
(kk_transaction["msno"] == "YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=").sum() # 取某個user的交易資料筆數有27筆

27

In [41]:
kk_transaction[kk_transaction["msno"] == "YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc="].head(10) # 取此user某10筆交易資料

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20150930,20151101,0
753139,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20151031,20151201,0
1234428,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20150630,20150801,0
1917887,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20150228,20150401,0
1957479,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20161130,20170101,0
3482816,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20150731,20150901,0
5413134,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20160430,20160601,0
7106202,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20160731,20160901,0
7619099,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20150131,20150301,0
10161584,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20160831,20161001,0


#### ※每次交易價格有以下多種，可能為月租，可能為年租，或是有部分是優惠(0元?)

In [42]:
kk_transaction["plan_list_price"].unique()

array([ 129,  149,    0,   99,  100,  119,  150, 1788,  180,  894,   50,
        120,  500, 1599,  536,  799,  480,  477,  596, 1200,  300,  930,
        699,  450,   35,  447,  105,  298,  134, 1000,  350,  124,  400,
          1,  131,  126,   10, 1150,   70, 1520,  265,  143, 2000,  800,
         15,   30,  210,  760,  600, 1825,   44, 1299, 1399, 1260, 1300],
      dtype=int64)

#### ※確認每次交易的付費天數有如下多種，其中30與31應為月租，另外短租包含7天、10天等等，還有其他種不同天數付費交易

In [43]:
kk_transaction["payment_plan_days"].unique()

array([ 30,   0,  10,  31,   7, 410, 195, 100, 395,   1, 180, 120,  60,
       400,  14, 360, 200,  35,  90,  21, 240, 450,  70,  80,  45, 110,
       365,  66, 270,  99,   2, 230,   3,  15, 425, 415,  95], dtype=int64)

#### ※30天(月租)套餐數量高達2017萬筆

In [44]:
kk_transaction["payment_plan_days"].value_counts()

30     20174288
0        872342
31       766612
7        589807
410      162236
195      138802
180       76172
10        38632
90        31440
100       28252
395       20543
120       13619
60        10301
360       10144
200        8946
14         6447
1          5435
240        4528
365        4150
400        3673
415        3299
450        3033
270        1187
45         1123
21          880
35          854
70          636
80          385
99          339
110         179
66          175
230         138
3            68
2            67
15           21
425           1
95            1
Name: payment_plan_days, dtype: int64

#### ※交易方法(?)有如下多種

In [45]:
kk_transaction["payment_method_id"].unique()

array([41, 39, 21, 37, 40, 34, 33, 31, 23, 38, 19, 24, 14, 36, 27, 35, 30,
       29, 28, 22, 26, 32, 25, 20, 17,  5, 18, 11,  7, 13, 12, 16, 10, 15,
        8,  6,  3,  2,  4,  1], dtype=int64)

#### ※特徵"is_auto_renew"沒有說明是什麼，不過分成1與0，可能是分為自動續約與不續約

In [46]:
kk_transaction["is_auto_renew"].value_counts()

1    19481725
0     3497030
Name: is_auto_renew, dtype: int64

## 小結: <font color=#FF3333> Transaction的msno有重複的，表示同一個人有很多次交易紀錄

## =======================================================================

## user_logs.csv
### daily user logs describing listening behaviors of a user. Data collected until 2/28/2017.
#### A. date: <font color=#0000FF> format %Y%m%d
#### B. num_25: <font color=#0000FF> # of songs played less than 25% of the song length
#### C. num_985: <font color=#0000FF> # of songs played between 75% to 98.5% of the song length
#### D. total_secs: <font color=#0000FF> total seconds played

In [47]:
# 檔案太大，先不取
"""
kk_userlog = pd.read_csv("user_logs.csv", sep=',',engine = 'python',iterator=True)
loop = True
chunkSize = 1000
chunks = []
index=0
while loop:
    try:
        print(index)
        chunk = kk_userlog.get_chunk(chunkSize)
        chunks.append(chunk)
        index+=1

    except StopIteration:
        loop = False
        print("Iteration is stopped.")
print('Start')
kk_userlog = pd.concat(chunks, ignore_index= True)
"""

'\nkk_userlog = pd.read_csv("user_logs.csv", sep=\',\',engine = \'python\',iterator=True)\nloop = True\nchunkSize = 1000\nchunks = []\nindex=0\nwhile loop:\n    try:\n        print(index)\n        chunk = kk_userlog.get_chunk(chunkSize)\n        chunks.append(chunk)\n        index+=1\n\n    except StopIteration:\n        loop = False\n        print("Iteration is stopped.")\nprint(\'Start\')\nkk_userlog = pd.concat(chunks, ignore_index= True)\n'

In [48]:
kk_userlog = pd.read_csv("user_logs.csv", iterator=True)

In [49]:
# 歷遍(僅能觀察資料，無法進行資料整併)
chunk = kk_userlog.get_chunk(5)
chunk

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,rxIP2f2aN0rYNp+toI0Obt/N/FYQX8hcO1fTmmy2h34=,20150513,0,0,0,0,1,1,280.335
1,rxIP2f2aN0rYNp+toI0Obt/N/FYQX8hcO1fTmmy2h34=,20150709,9,1,0,0,7,11,1658.948
2,yxiEWwE9VR5utpUecLxVdQ5B7NysUPfrNtGINaM2zA8=,20150105,3,3,0,0,68,36,17364.956
3,yxiEWwE9VR5utpUecLxVdQ5B7NysUPfrNtGINaM2zA8=,20150306,1,0,1,1,97,27,24667.317
4,yxiEWwE9VR5utpUecLxVdQ5B7NysUPfrNtGINaM2zA8=,20150501,3,0,0,0,38,38,9649.029


## user_logs_v2.csv
### same format as user_logs.csv, refreshed 11/06/2017, contains the user logs data until 3/31/2017.

In [50]:
kk_userlogV2 = pd.read_csv("user_logs_v2.csv")

In [51]:
kk_userlogV2.head()

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,u9E91QDTvHLq6NXjEaWv8u4QIqhrHk72kE+w31Gnhdg=,20170331,8,4,0,1,21,18,6309.273
1,nTeWW/eOZA/UHKdD5L7DEqKKFTjaAj3ALLPoAWsU8n0=,20170330,2,2,1,0,9,11,2390.699
2,2UqkWXwZbIjs03dHLU9KHJNNEvEkZVzm69f3jCS+uLI=,20170331,52,3,5,3,84,110,23203.337
3,ycwLc+m2O0a85jSLALtr941AaZt9ai8Qwlg9n0Nql5U=,20170331,176,4,2,2,19,191,7100.454
4,EGcbTofOSOkMmQyN1NMLxHEXJ1yV3t/JdhGwQ9wXjnI=,20170331,2,1,0,1,112,93,28401.558


#### ※userlog的欄位特徵主要分成每個用戶聽一首歌的時間以及總時間，是否可以將總時間進行分類與流失率進行比較；或是將容易聽完整首歌的比例之類的進行分析(???

## ========================================================================

In [52]:
print(f'submission01: {kk_submission01.shape}')
print(f'submission02: {kk_submission02.shape}')
print(f'members: {kk_members.shape}')
print(f'train01: {kk_train01.shape}')
print(f'train02: {kk_train02.shape}')
print(f'TrainingData: {kk_train.shape}')
print(f'transaction: {kk_transaction.shape}')
print(f'userlogV2: {kk_userlogV2.shape}')

submission01: (970960, 2)
submission02: (907471, 2)
members: (6769473, 6)
train01: (992931, 2)
train02: (970960, 2)
TrainingData: (1963891, 2)
transaction: (22978755, 9)
userlogV2: (18396362, 9)


## ====================================================================

# Data Cleaning

## 合併Training與Members欄位(取相同msno)

In [53]:
TrainingData = pd.merge(kk_train, kk_members, how="left", on="msno")

In [54]:
TrainingData['registration_init_time'] = TrainingData.registration_init_time.apply(lambda x: datetime.strptime(str(int(x)), "%Y%m%d").date() if pd.notnull(x) else "NAN" )

In [55]:
TrainingData.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,18.0,36.0,female,9.0,2005-04-06
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,10.0,38.0,male,9.0,2005-04-07
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,11.0,27.0,female,9.0,2005-10-16
3,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,13.0,23.0,female,9.0,2005-11-02
4,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,3.0,27.0,male,9.0,2005-12-28


#### ※共有21個城市的客戶及各自城市的客戶數量

In [63]:
print(TrainingData["city"].unique())
print(TrainingData["city"].value_counts())

[18. 10. 11. 13.  3.  6.  4. 14. 22. 17.  5.  9.  1. 15. nan 12.  8.  7.
 21. 20. 16. 19.]
1.0     897987
13.0    195417
5.0     142005
4.0      95172
15.0     86543
22.0     84120
6.0      52088
14.0     40180
12.0     22937
9.0      19084
11.0     18174
18.0     15634
8.0      15279
10.0     13003
17.0     11022
21.0     10485
3.0      10146
7.0       5318
16.0      1900
20.0      1354
19.0       280
Name: city, dtype: int64


#### ※年齡的數據有點亂，可能要判斷一下是什麼格式?(Note: this column has outlier values ranging from -7000 to 2015, please use your judgement.)

In [64]:
TrainingData["bd"].unique()

array([ 3.600e+01,  3.800e+01,  2.700e+01,  2.300e+01,  2.900e+01,
        2.200e+01,  3.100e+01,  4.300e+01,  4.700e+01,  4.200e+01,
        3.300e+01,  2.500e+01,  2.800e+01,  2.600e+01,  2.000e+01,
        3.700e+01,  3.400e+01,  1.600e+01,  0.000e+00,  3.900e+01,
        3.500e+01,        nan,  4.600e+01,  2.100e+01,  2.400e+01,
        1.900e+01,  4.000e+01,  3.200e+01,  3.000e+01,  1.700e+01,
        1.800e+01,  4.400e+01,  5.700e+01,  4.900e+01,  5.100e+01,
        1.140e+02,  4.100e+01,  5.400e+01,  5.300e+01,  4.800e+01,
        1.400e+01,  4.500e+01,  9.800e+01,  6.000e+01,  6.200e+01,
        5.500e+01,  5.600e+01,  5.000e+01,  1.500e+01,  5.200e+01,
        6.800e+01,  5.800e+01,  8.900e+01,  6.300e+01,  6.500e+01,
        1.050e+02,  7.200e+01,  1.100e+01,  1.020e+02,  1.300e+01,
        6.600e+01,  6.100e+01,  6.000e+00,  1.010e+02,  6.400e+01,
        9.700e+01,  5.900e+01,  1.090e+02,  1.200e+01,  8.300e+01,
        1.060e+02,  1.040e+02,  9.200e+01,  9.400e+01,  8.100e

#### ※registered_via共有5種登入方式=>分別代表什麼???

In [56]:
TrainingData["registered_via"].unique()

array([ 9.,  7., nan,  3.,  4., 13.])

In [57]:
TrainingData.describe().astype(np.int64)

Unnamed: 0,is_churn,city,bd,registered_via
count,1963891,1738128,1738128,1738128
mean,0,5,13,6
std,0,6,20,1
min,0,1,-3152,3
25%,0,1,0,7
50%,0,1,0,7
75%,0,13,27,9
max,1,22,2016,13


In [58]:
TrainingData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1963891 entries, 0 to 1963890
Data columns (total 7 columns):
msno                      object
is_churn                  int64
city                      float64
bd                        float64
gender                    object
registered_via            float64
registration_init_time    object
dtypes: float64(3), int64(1), object(3)
memory usage: 119.9+ MB


In [59]:
TrainingData.shape  # 有196萬筆資料

(1963891, 7)

In [60]:
print(100*(TrainingData.isnull().sum()/len(TrainingData))) # 各特徵missing value比例

msno                       0.000000
is_churn                   0.000000
city                      11.495699
bd                        11.495699
gender                    60.252529
registered_via            11.495699
registration_init_time     0.000000
dtype: float64


### 小結1: <font color=#FF3333> 特徵"city", "bd", "gender", "registered_via", "registration_init_time"都有missing value
### 小結2: <font color=#FF3333> 其中特徵"gender"達到60%的missing value

## ※後續處理
## 1.比較欄位特徵關聯性
## 2.missing value處理...