In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from logging import getLogger

# 常に全てのカラムを表示
pd.options.display.max_columns = None

# データ取り込み

In [19]:
# DAU
dau = pd.read_csv("input/section7-dau.csv", header="infer")
all_df = pd.DataFrame(dau)

In [20]:
# データ中身確認
print(all_df.head())

  region_month  region_day app_name   user_id device
0      2013-01  2013-01-01  game-02  10061580     FP
1      2013-01  2013-01-01  game-02  10154440     FP
2      2013-01  2013-01-01  game-02  10164762     SP
3      2013-01  2013-01-01  game-02  10165615     FP
4      2013-01  2013-01-01  game-02  10321356     FP


In [7]:
all_df.head(10)

Unnamed: 0,region_month,region_day,app_name,user_id,device
0,2013-01,2013-01-01,game-02,10061580,FP
1,2013-01,2013-01-01,game-02,10154440,FP
2,2013-01,2013-01-01,game-02,10164762,SP
3,2013-01,2013-01-01,game-02,10165615,FP
4,2013-01,2013-01-01,game-02,10321356,FP
5,2013-01,2013-01-01,game-02,10406653,SP
6,2013-01,2013-01-01,game-02,10447112,FP
7,2013-01,2013-01-01,game-02,10479169,FP
8,2013-01,2013-01-01,game-02,10494712,SP
9,2013-01,2013-01-01,game-02,10513749,FP


## データ加工

In [57]:
month_1 = all_df[all_df["region_month"] == "2013-01"]
month_2 = all_df[all_df["region_month"] == "2013-02"]
# 結合
user_device = pd.merge(month_1, month_2, on=["user_id", "app_name"], how='left')
user_device = user_device[["user_id", "device_x", "device_y"]].drop_duplicates()

# ２月に、アクセスがあったかどうかのフラグ
user_device['has_access'] = 0
user_device.loc[user_device['device_y'].isnull() == False, 'has_access'] = 1

# 1月からスマホを利用している人を除外
user_device = user_device[user_device["device_x"] == "FP"]

# 2月にアクセスがあるが、FuturePhoneをそのまま使っている人を除外
user_device = user_device[user_device["device_y"] != "FP"]

user_device = user_device.drop(['device_x', 'device_y'], axis=1)
user_device

Unnamed: 0,user_id,has_access
16,10154440,0
238,10528830,0
773,1163733,1
796,11727630,0
1496,13401362,1
1637,13723962,0
2530,16557842,1
2784,16856560,0
2957,17510879,0
3109,17794699,1


In [50]:
# ユーザーごとの日別クロス表
dau = pd.crosstab(index=month_1['user_id'],columns=month_1['region_day'])
dau.head()

region_day,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06,2013-01-07,2013-01-08,2013-01-09,2013-01-10,2013-01-11,2013-01-12,2013-01-13,2013-01-14,2013-01-15,2013-01-16,2013-01-17,2013-01-18,2013-01-19,2013-01-20,2013-01-21,2013-01-22,2013-01-23,2013-01-24,2013-01-25,2013-01-26,2013-01-27,2013-01-28,2013-01-29,2013-01-30,2013-01-31
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
397286,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1
471341,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
503874,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
512250,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
513811,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1


In [63]:
df = pd.merge(dau, user_device, on=["user_id"], how='inner')
df = df.fillna(0)
df.head(100)

Unnamed: 0,user_id,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06,2013-01-07,2013-01-08,2013-01-09,2013-01-10,2013-01-11,2013-01-12,2013-01-13,2013-01-14,2013-01-15,2013-01-16,2013-01-17,2013-01-18,2013-01-19,2013-01-20,2013-01-21,2013-01-22,2013-01-23,2013-01-24,2013-01-25,2013-01-26,2013-01-27,2013-01-28,2013-01-29,2013-01-30,2013-01-31,has_access
0,471341,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,503874,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1073544,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0
3,1073864,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1163733,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,1,1,1,1,1,2,1,1,1,1
5,1454629,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1557628,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
7,2241462,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1
8,2313236,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
9,2477685,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


## 可視化

In [64]:
# EDA
import pandas_profiling as pp
import pixiedust as px
pp.ProfileReport(df)


findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


0,1
Number of variables,33
Number of observations,232
Total Missing (%),0.0%
Total size in memory,61.6 KiB
Average record size in memory,272.0 B

0,1
Numeric,32
Categorical,0
Boolean,1
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.31034
Minimum,0
Maximum,2
Zeros (%),70.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.49085
Coef of variation,1.5816
Kurtosis,0.09065
Mean,0.31034
MAD,0.43609
Skewness,1.1538
Sum,72
Variance,0.24093
Memory size,13.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,163,70.3%,
1,66,28.4%,
2,3,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,163,70.3%,
1,66,28.4%,
2,3,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,163,70.3%,
1,66,28.4%,
2,3,1.3%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.2931
Minimum,0
Maximum,2
Zeros (%),71.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.47477
Coef of variation,1.6198
Kurtosis,-0.050223
Mean,0.2931
MAD,0.41944
Skewness,1.1577
Sum,68
Variance,0.22541
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,166,71.6%,
1,64,27.6%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,166,71.6%,
1,64,27.6%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,166,71.6%,
1,64,27.6%,
2,2,0.9%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.31034
Minimum,0
Maximum,2
Zeros (%),69.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.48195
Coef of variation,1.5529
Kurtosis,-0.30638
Mean,0.31034
MAD,0.43341
Skewness,1.0575
Sum,72
Variance,0.23227
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,162,69.8%,
1,68,29.3%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,162,69.8%,
1,68,29.3%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,162,69.8%,
1,68,29.3%,
2,2,0.9%,

0,1
Distinct count,4
Unique (%),1.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.28879
Minimum,0
Maximum,3
Zeros (%),73.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,3
Range,3
Interquartile range,1

0,1
Standard deviation,0.50816
Coef of variation,1.7596
Kurtosis,3.3421
Mean,0.28879
MAD,0.42323
Skewness,1.7215
Sum,67
Variance,0.25823
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,170,73.3%,
1,58,25.0%,
2,3,1.3%,
3,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,170,73.3%,
1,58,25.0%,
2,3,1.3%,
3,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,170,73.3%,
1,58,25.0%,
2,3,1.3%,
3,1,0.4%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.28448
Minimum,0
Maximum,2
Zeros (%),72.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.48001
Coef of variation,1.6873
Kurtosis,0.54402
Mean,0.28448
MAD,0.41446
Skewness,1.3122
Sum,66
Variance,0.23041
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,169,72.8%,
1,60,25.9%,
2,3,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,169,72.8%,
1,60,25.9%,
2,3,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,169,72.8%,
1,60,25.9%,
2,3,1.3%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.30172
Minimum,0
Maximum,2
Zeros (%),71.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.49622
Coef of variation,1.6446
Kurtosis,0.57985
Mean,0.30172
MAD,0.43178
Skewness,1.2916
Sum,70
Variance,0.24623
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,166,71.6%,
1,62,26.7%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,166,71.6%,
1,62,26.7%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,166,71.6%,
1,62,26.7%,
2,4,1.7%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.28448
Minimum,0
Maximum,2
Zeros (%),72.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.4709
Coef of variation,1.6553
Kurtosis,0.093506
Mean,0.28448
MAD,0.41201
Skewness,1.2102
Sum,66
Variance,0.22175
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,168,72.4%,
1,62,26.7%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,168,72.4%,
1,62,26.7%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,168,72.4%,
1,62,26.7%,
2,2,0.9%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.27586
Minimum,0
Maximum,2
Zeros (%),73.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.46684
Coef of variation,1.6923
Kurtosis,0.24903
Mean,0.27586
MAD,0.40428
Skewness,1.2646
Sum,64
Variance,0.21794
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,170,73.3%,
1,60,25.9%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,170,73.3%,
1,60,25.9%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,170,73.3%,
1,60,25.9%,
2,2,0.9%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.27155
Minimum,0
Maximum,2
Zeros (%),74.1%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.47396
Coef of variation,1.7454
Kurtosis,0.81344
Mean,0.27155
MAD,0.40265
Skewness,1.3977
Sum,63
Variance,0.22464
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,172,74.1%,
1,57,24.6%,
2,3,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,172,74.1%,
1,57,24.6%,
2,3,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,172,74.1%,
1,57,24.6%,
2,3,1.3%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.26293
Minimum,0
Maximum,2
Zeros (%),74.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.46038
Coef of variation,1.751
Kurtosis,0.50708
Mean,0.26293
MAD,0.39213
Skewness,1.3498
Sum,61
Variance,0.21195
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,173,74.6%,
1,57,24.6%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,173,74.6%,
1,57,24.6%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,173,74.6%,
1,57,24.6%,
2,2,0.9%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.23276
Minimum,0
Maximum,2
Zeros (%),78.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.45313
Coef of variation,1.9468
Kurtosis,1.8531
Mean,0.23276
MAD,0.36318
Skewness,1.6874
Sum,54
Variance,0.20533
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,181,78.0%,
1,48,20.7%,
2,3,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,181,78.0%,
1,48,20.7%,
2,3,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,181,78.0%,
1,48,20.7%,
2,3,1.3%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.22845
Minimum,0
Maximum,2
Zeros (%),78.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.44084
Coef of variation,1.9297
Kurtosis,1.3787
Mean,0.22845
MAD,0.35646
Skewness,1.6039
Sum,53
Variance,0.19434
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,181,78.0%,
1,49,21.1%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,181,78.0%,
1,49,21.1%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,181,78.0%,
1,49,21.1%,
2,2,0.9%,

0,1
Distinct count,4
Unique (%),1.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.25
Minimum,0
Maximum,3
Zeros (%),77.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,3
Range,3
Interquartile range,0

0,1
Standard deviation,0.49016
Coef of variation,1.9607
Kurtosis,4.7752
Mean,0.25
MAD,0.38578
Skewness,2.0301
Sum,58
Variance,0.24026
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,179,77.2%,
1,49,21.1%,
2,3,1.3%,
3,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,179,77.2%,
1,49,21.1%,
2,3,1.3%,
3,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,179,77.2%,
1,49,21.1%,
2,3,1.3%,
3,1,0.4%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.25431
Minimum,0
Maximum,2
Zeros (%),75.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.45582
Coef of variation,1.7924
Kurtosis,0.69771
Mean,0.25431
MAD,0.38366
Skewness,1.4093
Sum,59
Variance,0.20777
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,175,75.4%,
1,55,23.7%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,175,75.4%,
1,55,23.7%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,175,75.4%,
1,55,23.7%,
2,2,0.9%,

0,1
Distinct count,4
Unique (%),1.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.21121
Minimum,0
Maximum,3
Zeros (%),80.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,3
Range,3
Interquartile range,0

0,1
Standard deviation,0.45892
Coef of variation,2.1729
Kurtosis,6.7362
Mean,0.21121
MAD,0.34048
Skewness,2.3487
Sum,49
Variance,0.21061
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,187,80.6%,
1,42,18.1%,
2,2,0.9%,
3,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,187,80.6%,
1,42,18.1%,
2,2,0.9%,
3,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,187,80.6%,
1,42,18.1%,
2,2,0.9%,
3,1,0.4%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.21983
Minimum,0
Maximum,2
Zeros (%),78.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.42533
Coef of variation,1.9348
Kurtosis,0.84625
Mean,0.21983
MAD,0.3449
Skewness,1.531
Sum,51
Variance,0.1809
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,49,21.1%,
2,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,49,21.1%,
2,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,49,21.1%,
2,1,0.4%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.19828
Minimum,0
Maximum,2
Zeros (%),80.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.41025
Coef of variation,2.0691
Kurtosis,1.5098
Mean,0.19828
MAD,0.31963
Skewness,1.7116
Sum,46
Variance,0.16831
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,187,80.6%,
1,44,19.0%,
2,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,187,80.6%,
1,44,19.0%,
2,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,187,80.6%,
1,44,19.0%,
2,1,0.4%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.22414
Minimum,0
Maximum,2
Zeros (%),78.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.43814
Coef of variation,1.9548
Kurtosis,1.5111
Mean,0.22414
MAD,0.35166
Skewness,1.639
Sum,52
Variance,0.19197
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,48,20.7%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,48,20.7%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,48,20.7%,
2,2,0.9%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.22845
Minimum,0
Maximum,2
Zeros (%),78.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.44084
Coef of variation,1.9297
Kurtosis,1.3787
Mean,0.22845
MAD,0.35646
Skewness,1.6039
Sum,53
Variance,0.19434
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,181,78.0%,
1,49,21.1%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,181,78.0%,
1,49,21.1%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,181,78.0%,
1,49,21.1%,
2,2,0.9%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.24569
Minimum,0
Maximum,2
Zeros (%),77.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.46985
Coef of variation,1.9124
Kurtosis,1.9202
Mean,0.24569
MAD,0.37912
Skewness,1.6828
Sum,57
Variance,0.22076
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,179,77.2%,
1,49,21.1%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,179,77.2%,
1,49,21.1%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,179,77.2%,
1,49,21.1%,
2,4,1.7%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.22414
Minimum,0
Maximum,2
Zeros (%),78.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.43814
Coef of variation,1.9548
Kurtosis,1.5111
Mean,0.22414
MAD,0.35166
Skewness,1.639
Sum,52
Variance,0.19197
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,48,20.7%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,48,20.7%,
2,2,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,48,20.7%,
2,2,0.9%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.21121
Minimum,0
Maximum,2
Zeros (%),80.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.44939
Coef of variation,2.1277
Kurtosis,3.1917
Mean,0.21121
MAD,0.34048
Skewness,1.985
Sum,49
Variance,0.20195
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,187,80.6%,
1,41,17.7%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,187,80.6%,
1,41,17.7%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,187,80.6%,
1,41,17.7%,
2,4,1.7%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.23276
Minimum,0
Maximum,2
Zeros (%),78.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.46259
Coef of variation,1.9874
Kurtosis,2.3447
Mean,0.23276
MAD,0.36519
Skewness,1.7891
Sum,54
Variance,0.21399
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,46,19.8%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,46,19.8%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,182,78.4%,
1,46,19.8%,
2,4,1.7%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.22414
Minimum,0
Maximum,2
Zeros (%),79.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.45748
Coef of variation,2.0411
Kurtosis,2.6604
Mean,0.22414
MAD,0.35553
Skewness,1.8645
Sum,52
Variance,0.20928
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,184,79.3%,
1,44,19.0%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,184,79.3%,
1,44,19.0%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,184,79.3%,
1,44,19.0%,
2,4,1.7%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.2069
Minimum,0
Maximum,2
Zeros (%),81.5%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.45617
Coef of variation,2.2048
Kurtosis,3.8422
Mean,0.2069
MAD,0.3371
Skewness,2.1211
Sum,48
Variance,0.20809
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,189,81.5%,
1,38,16.4%,
2,5,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0,189,81.5%,
1,38,16.4%,
2,5,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0,189,81.5%,
1,38,16.4%,
2,5,2.2%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.21552
Minimum,0
Maximum,2
Zeros (%),79.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.44247
Coef of variation,2.053
Kurtosis,2.4649
Mean,0.21552
MAD,0.34371
Skewness,1.8368
Sum,50
Variance,0.19578
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,185,79.7%,
1,44,19.0%,
2,3,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,185,79.7%,
1,44,19.0%,
2,3,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,185,79.7%,
1,44,19.0%,
2,3,1.3%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.21121
Minimum,0
Maximum,2
Zeros (%),81.5%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.46826
Coef of variation,2.2171
Kurtosis,3.9959
Mean,0.21121
MAD,0.34412
Skewness,2.1571
Sum,49
Variance,0.21927
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,189,81.5%,
1,37,15.9%,
2,6,2.6%,

Value,Count,Frequency (%),Unnamed: 3
0,189,81.5%,
1,37,15.9%,
2,6,2.6%,

Value,Count,Frequency (%),Unnamed: 3
0,189,81.5%,
1,37,15.9%,
2,6,2.6%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.17672
Minimum,0
Maximum,2
Zeros (%),84.5%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.43522
Coef of variation,2.4627
Kurtosis,5.5906
Mean,0.17672
MAD,0.2986
Skewness,2.4643
Sum,41
Variance,0.18941
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,196,84.5%,
1,31,13.4%,
2,5,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0,196,84.5%,
1,31,13.4%,
2,5,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0,196,84.5%,
1,31,13.4%,
2,5,2.2%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.22414
Minimum,0
Maximum,2
Zeros (%),79.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.45748
Coef of variation,2.0411
Kurtosis,2.6604
Mean,0.22414
MAD,0.35553
Skewness,1.8645
Sum,52
Variance,0.20928
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,184,79.3%,
1,44,19.0%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,184,79.3%,
1,44,19.0%,
2,4,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,184,79.3%,
1,44,19.0%,
2,4,1.7%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.19397
Minimum,0
Maximum,2
Zeros (%),82.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.44756
Coef of variation,2.3074
Kurtosis,4.5169
Mean,0.19397
MAD,0.32105
Skewness,2.2591
Sum,45
Variance,0.20031
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,192,82.8%,
1,35,15.1%,
2,5,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0,192,82.8%,
1,35,15.1%,
2,5,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0,192,82.8%,
1,35,15.1%,
2,5,2.2%,

0,1
Distinct count,3
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.20259
Minimum,0
Maximum,2
Zeros (%),81.9%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.45336
Coef of variation,2.2379
Kurtosis,4.0563
Mean,0.20259
MAD,0.33182
Skewness,2.1658
Sum,47
Variance,0.20553
Memory size,3.6 KiB

Value,Count,Frequency (%),Unnamed: 3
0,190,81.9%,
1,37,15.9%,
2,5,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0,190,81.9%,
1,37,15.9%,
2,5,2.2%,

Value,Count,Frequency (%),Unnamed: 3
0,190,81.9%,
1,37,15.9%,
2,5,2.2%,

0,1
Distinct count,2
Unique (%),0.9%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.18103

0,1
0,190
1,42

Value,Count,Frequency (%),Unnamed: 3
0,190,81.9%,
1,42,18.1%,

0,1
Distinct count,232
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,29524000
Minimum,471341
Maximum,677996362
Zeros (%),0.0%

0,1
Minimum,471341
5-th percentile,3113100
Q1,12548000
Median,23654000
Q3,41382000
95-th percentile,59193000
Maximum,677996362
Range,677525021
Interquartile range,28834000

0,1
Standard deviation,46125000
Coef of variation,1.5623
Kurtosis,170.57
Mean,29524000
MAD,18008000
Skewness,12.131
Sum,6849556726
Variance,2127500000000000
Memory size,13.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1073544,1,0.4%,
51277074,1,0.4%,
23689923,1,0.4%,
12083954,1,0.4%,
35125949,1,0.4%,
15571132,1,0.4%,
31066299,1,0.4%,
31697594,1,0.4%,
32623795,1,0.4%,
36756145,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
471341,1,0.4%,
503874,1,0.4%,
1073544,1,0.4%,
1073864,1,0.4%,
1163733,1,0.4%,

Value,Count,Frequency (%),Unnamed: 3
61132242,1,0.4%,
61540141,1,0.4%,
61570858,1,0.4%,
61613038,1,0.4%,
677996362,1,0.4%,

Unnamed: 0,user_id,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06,2013-01-07,2013-01-08,2013-01-09,2013-01-10,2013-01-11,2013-01-12,2013-01-13,2013-01-14,2013-01-15,2013-01-16,2013-01-17,2013-01-18,2013-01-19,2013-01-20,2013-01-21,2013-01-22,2013-01-23,2013-01-24,2013-01-25,2013-01-26,2013-01-27,2013-01-28,2013-01-29,2013-01-30,2013-01-31,has_access
0,471341,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,503874,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1073544,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0
3,1073864,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1163733,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,1,1,1,1,1,2,1,1,1,1


In [65]:
px.display(df)

user_id,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06,2013-01-07,2013-01-08,2013-01-09,2013-01-10,2013-01-11,2013-01-12,2013-01-13,2013-01-14,2013-01-15,2013-01-16,2013-01-17,2013-01-18,2013-01-19,2013-01-20,2013-01-21,2013-01-22,2013-01-23,2013-01-24,2013-01-25,2013-01-26,2013-01-27,2013-01-28,2013-01-29,2013-01-30,2013-01-31,has_access
19410760,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1
44905081,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0
18195952,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
25184024,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24900784,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
24609593,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27249550,0,0,0,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10528830,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
60725457,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
50850885,1,1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [66]:
from pivottablejs import pivot_ui
pivot_ui(df, outfile_path="logistic.html")

##　モデリング

In [74]:
# 交差検証
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

y = df["has_access"]
X = df.iloc[:, 1:-1]

x_train, x_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=1)

In [76]:
# ロジスティック回帰
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(penalty='l2', random_state=0)
model.fit(x_train, y_train)
scores = cross_val_score(model, x_test, y_test, scoring='accuracy', cv=3)
print("正解度：{}、標準偏差：{}".format(np.mean(scores),np.std(scores)))


正解度：0.9333333333333332、標準偏差：0.05443310539518172


