In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from logging import getLogger

# Always display every column (None removes the column limit for wide DataFrames)
pd.options.display.max_columns = None

# データ取り込み

In [3]:
# DAU: daily activity log with columns log_date, app_name, user_id.
# read_csv already returns a DataFrame and infers the header row by default,
# so no extra pd.DataFrame(...) wrapping or header argument is needed.
dau = pd.read_csv("input/section4-dau.csv")

# User attributes: install_date, app_name, user_id, gender, generation, device_type.
user_info = pd.read_csv("input/section4-user_info.csv")

In [4]:
# Check the contents of both raw tables (first rows of each, printed in load order)
for raw_frame in (dau, user_info):
    print(raw_frame.head())

     log_date app_name  user_id
0  2013-08-01  game-01    33754
1  2013-08-01  game-01    28598
2  2013-08-01  game-01    30306
3  2013-08-01  game-01      117
4  2013-08-01  game-01     6605
  install_date app_name  user_id gender  generation device_type
0   2013-04-15  game-01        1      M          40         iOS
1   2013-04-15  game-01        2      M          10     Android
2   2013-04-15  game-01        3      F          40         iOS
3   2013-04-15  game-01        4      M          10     Android
4   2013-04-15  game-01        5      M          40         iOS


In [27]:
# Join: attach each user's attributes to every DAU log row.
# how="left" keeps every row of `dau`, even when no matching user_info row exists.
# Each join key is listed exactly once.
all_df = pd.merge(dau, user_info, on=["app_name", "user_id"], how="left")

In [11]:
# Preview the first rows of the merged frame to sanity-check the join
all_df.head()

Unnamed: 0,log_date,app_name,user_id,install_date,gender,generation,device_type
0,2013-08-01,game-01,33754,2013-08-01,M,20,iOS
1,2013-08-01,game-01,28598,2013-07-16,M,50,iOS
2,2013-08-01,game-01,30306,2013-07-20,F,30,iOS
3,2013-08-01,game-01,117,2013-04-17,F,20,iOS
4,2013-08-01,game-01,6605,2013-05-02,M,20,iOS


## データ加工

In [28]:
# Date handling
# Parse the "YYYY-MM-DD" strings once; the parsed Series feed both the
# datetime columns and the "YYYY-MM" year-month labels.
log_dates = pd.to_datetime(all_df["log_date"], format="%Y-%m-%d")
install_dates = pd.to_datetime(all_df["install_date"], format="%Y-%m-%d")

# assign() overwrites a column that already exists instead of appending a
# second copy, so re-running this cell never produces duplicate column names.
all_df = all_df.assign(
    log_date_year_month=log_dates.dt.strftime("%Y-%m"),
    install_date_year_month=install_dates.dt.strftime("%Y-%m"),
    log_date_formatted=log_dates,
    install_date_formatted=install_dates,
)

## 可視化

In [29]:
# EDA
# NOTE(review): these imports live in this cell rather than the top import cell,
# so on a fresh kernel this cell must run before the later cell that calls `px.display`.
import pandas_profiling as pp
import pixiedust as px
# Last expression of the cell, so the profile report of the merged frame is rendered as the cell output.
pp.ProfileReport(all_df)


findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


0,1
Number of variables,11
Number of observations,170360
Total Missing (%),0.0%
Total size in memory,15.6 MiB
Average record size in memory,96.0 B

0,1
Numeric,2
Categorical,6
Boolean,0
Date,2
Text (Unique),0
Rejected,1
Unsupported,0

0,1
Constant value,game-01

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
iOS,93739
Android,76621

Value,Count,Frequency (%),Unnamed: 3
iOS,93739,55.0%,
Android,76621,45.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
F,85370
M,84990

Value,Count,Frequency (%),Unnamed: 3
F,85370,50.1%,
M,84990,49.9%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,24.388
Minimum,10
Maximum,50
Zeros (%),0.0%

0,1
Minimum,10
5-th percentile,10
Q1,20
Median,20
Q3,30
95-th percentile,50
Maximum,50
Range,40
Interquartile range,10

0,1
Standard deviation,10.697
Coef of variation,0.43861
Kurtosis,-0.20473
Mean,24.388
MAD,8.9103
Skewness,0.52566
Sum,4154780
Variance,114.42
Memory size,2.6 MiB

Value,Count,Frequency (%),Unnamed: 3
20,60900,35.7%,
30,50298,29.5%,
10,34176,20.1%,
40,16322,9.6%,
50,8664,5.1%,

Value,Count,Frequency (%),Unnamed: 3
10,34176,20.1%,
20,60900,35.7%,
30,50298,29.5%,
40,16322,9.6%,
50,8664,5.1%,

Value,Count,Frequency (%),Unnamed: 3
10,34176,20.1%,
20,60900,35.7%,
30,50298,29.5%,
40,16322,9.6%,
50,8664,5.1%,

0,1
Distinct count,169
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
2013-08-01,3337
2013-04-27,2903
2013-08-02,2822
Other values (166),161298

Value,Count,Frequency (%),Unnamed: 3
2013-08-01,3337,2.0%,
2013-04-27,2903,1.7%,
2013-08-02,2822,1.7%,
2013-04-17,2402,1.4%,
2013-04-18,2305,1.4%,
2013-08-05,2232,1.3%,
2013-08-03,2124,1.2%,
2013-08-14,1871,1.1%,
2013-08-07,1790,1.1%,
2013-07-17,1757,1.0%,

0,1
Distinct count,169
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,2013-04-15 00:00:00
Maximum,2013-09-30 00:00:00

0,1
Distinct count,6
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
2013-08,43858
2013-07,38737
2013-06,26390
Other values (3),61375

Value,Count,Frequency (%),Unnamed: 3
2013-08,43858,25.7%,
2013-07,38737,22.7%,
2013-06,26390,15.5%,
2013-05,22697,13.3%,
2013-09,21183,12.4%,
2013-04,17495,10.3%,

0,1
Distinct count,61
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
2013-08-05,4040
2013-08-21,3913
2013-09-11,3786
Other values (58),158621

Value,Count,Frequency (%),Unnamed: 3
2013-08-05,4040,2.4%,
2013-08-21,3913,2.3%,
2013-09-11,3786,2.2%,
2013-08-28,3785,2.2%,
2013-08-19,3762,2.2%,
2013-08-12,3686,2.2%,
2013-08-09,3671,2.2%,
2013-08-26,3665,2.2%,
2013-09-09,3639,2.1%,
2013-08-01,3589,2.1%,

0,1
Distinct count,61
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,2013-08-01 00:00:00
Maximum,2013-09-30 00:00:00

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
2013-08,94185
2013-09,76175

Value,Count,Frequency (%),Unnamed: 3
2013-08,94185,55.3%,
2013-09,76175,44.7%,

0,1
Distinct count,25588
Unique (%),15.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,26162
Minimum,1
Maximum,49525
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1340
Q1,14349
Median,28976
Q3,37611
95-th percentile,44688
Maximum,49525
Range,49524
Interquartile range,23262

0,1
Standard deviation,13729
Coef of variation,0.52479
Kurtosis,-1.1021
Mean,26162
MAD,11952
Skewness,-0.34559
Sum,4456876837
Variance,188490000
Memory size,2.6 MiB

Value,Count,Frequency (%),Unnamed: 3
28944,61,0.0%,
20543,61,0.0%,
25707,61,0.0%,
7842,61,0.0%,
10710,61,0.0%,
13015,61,0.0%,
31503,61,0.0%,
12547,61,0.0%,
9789,61,0.0%,
24249,61,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,29,0.0%,
2,8,0.0%,
3,48,0.0%,
4,6,0.0%,
6,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
49521,1,0.0%,
49522,1,0.0%,
49523,1,0.0%,
49524,1,0.0%,
49525,1,0.0%,

Unnamed: 0,log_date,app_name,user_id,install_date,gender,generation,device_type,log_date_year_month,install_date_year_month,log_date_formatted,install_date_formatted
0,2013-08-01,game-01,33754,2013-08-01,M,20,iOS,2013-08,2013-08,2013-08-01,2013-08-01
1,2013-08-01,game-01,28598,2013-07-16,M,50,iOS,2013-08,2013-07,2013-08-01,2013-07-16
2,2013-08-01,game-01,30306,2013-07-20,F,30,iOS,2013-08,2013-07,2013-08-01,2013-07-20
3,2013-08-01,game-01,117,2013-04-17,F,20,iOS,2013-08,2013-04,2013-08-01,2013-04-17
4,2013-08-01,game-01,6605,2013-05-02,M,20,iOS,2013-08,2013-05,2013-08-01,2013-05-02


In [30]:
# Show the merged frame through pixiedust's display (`px` is the pixiedust alias imported in the EDA cell)
px.display(all_df)

In [31]:
# Hand the merged frame to pivottablejs' pivot_ui; as the last expression, its result is the cell output.
# NOTE(review): the import is local to this cell rather than in the top import cell.
from pivottablejs import pivot_ui
pivot_ui(all_df)