In [1]:
import ast
import os
import warnings

import dask.dataframe as dd
import featuretools as ft
import numpy as np
import pandas as pd
import scipy.stats

warnings.filterwarnings('ignore')

In [2]:
# Input directory: its entries are enumerated with os.listdir() by the driver
# loop and each one is handed to feature_extraction_login().
# NOTE(review): absolute, machine-specific Windows path -- adjust per machine.
log_root = r'D:\issp_data\pre_log'
# Output directory: feature_extraction_login() writes one '<name>logfe.csv'
# file here per input file.
out_root = r'D:\issp_data\fe_log_new'

In [8]:
# Statistic name -> callable reducing one sequence of numbers to a scalar.
# The names become column-name suffixes in fe_statistics(), and the insertion
# order here is the column order of its result.
func_table = dict(
    max=np.max,
    mean=np.mean,
    median=np.median,
    min=np.min,
    std=np.std,
    var=np.var,
    kurtosis=scipy.stats.kurtosis,
    skew=scipy.stats.skew,
    # 'diff' is the peak-to-peak range (max - min) of the sequence.
    diff=np.ptp,
    len=len,
    # 'density' is the peak-to-peak range divided by the element count.
    density=lambda x: np.ptp(x) / len(x),
)

def fe_statistics(func_table: dict,
                  prefix: str, ds: 'pd.Series') -> 'pd.DataFrame':
    """Compute one summary-statistic column per entry of ``func_table``.

    Every element of ``ds`` is the text of a Python literal (for example
    ``'[1, 2, 3]'``). Each element is parsed once, then every callable in
    ``func_table`` is applied to the parsed value.

    SECURITY: elements are parsed with ``ast.literal_eval`` rather than
    ``eval``. ``eval`` would execute arbitrary expressions found in the input
    data; ``literal_eval`` accepts only literals (numbers, strings, tuples,
    lists, dicts, sets, booleans, ``None``).

    Args:
        func_table: Mapping of statistic name -> callable that takes one
            parsed element and returns a scalar.
        prefix: String prepended to each statistic name to build the output
            column name. The column name is also printed as progress output.
        ds: Series whose elements are strings holding Python literals.

    Returns:
        DataFrame indexed like ``ds`` with one column ``prefix + name`` per
        ``func_table`` entry, in the mapping's iteration order.

    Raises:
        ValueError, SyntaxError: If an element is not a valid Python literal.
    """
    parsed = ds.map(ast.literal_eval)

    columns = {}
    for name, func in func_table.items():
        print(prefix + name)
        columns[prefix + name] = parsed.map(func)

    # Build the frame in one step instead of inserting column by column.
    return pd.DataFrame(columns)

In [9]:
def get_datetime_feature(ds: 'pd.Series') -> 'pd.DataFrame':
    """Derive day / month / weekday columns from a date series via featuretools.

    The series is turned into a two-column frame with ``reset_index()`` and
    registered as a single entity named ``'datetime'`` whose index column is
    ``'index'``. ``ft.dfs`` is run on that entity with no relationships.

    The generated column names are printed. ``'YEAR(Date)'`` is then dropped
    and the remaining ``'DAY(Date)'``, ``'MONTH(Date)'`` and
    ``'WEEKDAY(Date)'`` columns are renamed to ``'Day'``, ``'Month'`` and
    ``'Weekday'``.

    NOTE(review): the hard-coded ``...(Date)`` names mean the series is
    expected to be named ``'Date'`` -- confirm against callers.

    Args:
        ds: Series of dates (named ``'Date'``).

    Returns:
        The feature matrix returned by ``ft.dfs`` without the year column and
        with the renamed columns.
    """
    frame = ds.reset_index()
    feature_matrix, _ = ft.dfs(
        entities={'datetime': (frame, 'index')},
        relationships=[],
        target_entity='datetime',
    )
    print(list(feature_matrix))

    short_names = {
        'DAY(Date)': 'Day',
        'MONTH(Date)': 'Month',
        'WEEKDAY(Date)': 'Weekday',
    }
    without_year = feature_matrix.drop(columns='YEAR(Date)')
    return without_year.rename(columns=short_names)

In [10]:
def feature_extraction_login(in_path: str, out_root: str) -> None:
    """Build the feature table for one login-log file and write it to disk.

    The input is read as a zip-compressed CSV. Date features are derived from
    its ``'Date'`` column, and summary statistics from its ``'Timestamp'`` and
    ``'Duration'`` columns (prefixed ``ts_`` and ``dr_``). The two raw
    sequence columns are dropped, and the remaining columns plus all derived
    features are concatenated side by side.

    The result is written, zip-compressed and without the index, to
    ``out_root/<input base name without extension>logfe.csv``. The output
    file name is printed.

    Args:
        in_path: Path of the input file; both ``\\`` and ``/`` separators are
            accepted.
        out_root: Directory that receives the output file.
    """
    # Normalise separators so the base name is found on any host OS, and strip
    # the real extension instead of assuming it is exactly four characters.
    base_nm = os.path.basename(in_path.replace('\\', '/'))
    fl_nm = os.path.splitext(base_nm)[0] + 'logfe.csv'
    print(fl_nm)

    df = dd.read_csv(in_path, compression='zip').compute()

    df_dt = get_datetime_feature(df['Date'])
    df_ts = fe_statistics(func_table, r'ts_', df['Timestamp'])
    df_dr = fe_statistics(func_table, r'dr_', df['Duration'])
    # The raw sequence columns are replaced by their statistics; drop them
    # into a new frame rather than mutating ``df`` in place.
    df_rest = df.drop(columns=['Timestamp', 'Duration'])

    features = pd.concat([df_rest, df_dt, df_ts, df_dr], axis=1)
    features.to_csv(os.path.join(out_root, fl_nm),
                    header=True, index=None, compression='zip')

In [11]:
# Keep every entry of log_root whose name does not contain the substring 'os'.
file_li = list(filter(lambda name: 'os' not in name, os.listdir(log_root)))

# Run the login feature extraction on each remaining file, echoing its path.
for fl in file_li:
    path = os.path.join(log_root, fl)
    print(path)
    feature_extraction_login(path, out_root)

D:\issp_data\pre_log\s35809_MobileApp_201701_104628_0.csv
s35809_MobileApp_201701_104628_0logfe.csv
['DAY(Date)', 'YEAR(Date)', 'MONTH(Date)', 'WEEKDAY(Date)']
ts_max
ts_mean
ts_median
ts_min
ts_std
ts_var
ts_kurtosis
ts_skew
ts_diff
ts_len
ts_density
dr_max
dr_mean
dr_median
dr_min
dr_std
dr_var
dr_kurtosis
dr_skew
dr_diff
dr_len
dr_density
D:\issp_data\pre_log\s35809_MobileApp_201701_104628_1.csv
s35809_MobileApp_201701_104628_1logfe.csv
['DAY(Date)', 'YEAR(Date)', 'MONTH(Date)', 'WEEKDAY(Date)']
ts_max
ts_mean
ts_median
ts_min
ts_std
ts_var
ts_kurtosis
ts_skew
ts_diff
ts_len
ts_density
dr_max
dr_mean
dr_median
dr_min
dr_std
dr_var
dr_kurtosis
dr_skew
dr_diff
dr_len
dr_density
D:\issp_data\pre_log\s35809_MobileApp_201701_104628_2.csv
s35809_MobileApp_201701_104628_2logfe.csv
['DAY(Date)', 'YEAR(Date)', 'MONTH(Date)', 'WEEKDAY(Date)']
ts_max
ts_mean
ts_median
ts_min
ts_std
ts_var
ts_kurtosis
ts_skew
ts_diff
ts_len
ts_density
dr_max
dr_mean
dr_median
dr_min
dr_std
dr_var
dr_kurtosis
dr

dr_std
dr_var
dr_kurtosis
dr_skew
dr_diff
dr_len
dr_density
D:\issp_data\pre_log\s35809_MobileApp_201709_104652_0.csv
s35809_MobileApp_201709_104652_0logfe.csv
['DAY(Date)', 'YEAR(Date)', 'MONTH(Date)', 'WEEKDAY(Date)']
ts_max
ts_mean
ts_median
ts_min
ts_std
ts_var
ts_kurtosis
ts_skew
ts_diff
ts_len
ts_density
dr_max
dr_mean
dr_median
dr_min
dr_std
dr_var
dr_kurtosis
dr_skew
dr_diff
dr_len
dr_density
D:\issp_data\pre_log\s35809_MobileApp_201709_104652_1.csv
s35809_MobileApp_201709_104652_1logfe.csv
['DAY(Date)', 'YEAR(Date)', 'MONTH(Date)', 'WEEKDAY(Date)']
ts_max
ts_mean
ts_median
ts_min
ts_std
ts_var
ts_kurtosis
ts_skew
ts_diff
ts_len
ts_density
dr_max
dr_mean
dr_median
dr_min
dr_std
dr_var
dr_kurtosis
dr_skew
dr_diff
dr_len
dr_density
D:\issp_data\pre_log\s35809_MobileApp_201709_104652_2.csv
s35809_MobileApp_201709_104652_2logfe.csv
['DAY(Date)', 'YEAR(Date)', 'MONTH(Date)', 'WEEKDAY(Date)']
ts_max
ts_mean
ts_median
ts_min
ts_std
ts_var
ts_kurtosis
ts_skew
ts_diff
ts_len
ts_density


In [12]:
# Read one generated feature file back (it was written zip-compressed) and
# display it as the cell output.
path = r'D:\issp_data\fe_log_new\s35809_MobileApp_201701_104628_0logfe.csv'
df = dd.read_csv(path, compression='zip').compute()
df

Unnamed: 0,Monitor_ID,Date,Day,Month,Weekday,ts_max,ts_mean,ts_median,ts_min,ts_std,...,dr_mean,dr_median,dr_min,dr_std,dr_var,dr_kurtosis,dr_skew,dr_diff,dr_len,dr_density
0,50000010367,2017-01-05,5,1,3,1483658572,1.483615e+09,1.483618e+09,1483577852,26569.735436,...,125.194690,16.0,1,599.475506,3.593709e+05,88.227162,9.181399,6158,113,54.495575
1,50000011091,2017-01-02,2,1,0,1483348718,1.483349e+09,1.483349e+09,1483348602,40.840135,...,15.166667,13.0,1,11.809836,1.394722e+02,-0.625054,0.684473,36,6,6.000000
2,50000011091,2017-01-16,16,1,0,1484552063,1.484552e+09,1.484552e+09,1484551003,358.379464,...,26.000000,7.0,1,35.021898,1.226533e+03,1.382080,1.594719,119,15,7.933333
3,50000011730,2017-01-25,25,1,2,1485381088,1.485360e+09,1.485370e+09,1485329765,19049.690746,...,55.000000,10.0,10,75.967445,5.771053e+03,5.074185,2.301367,310,19,16.315789
4,50000011763,2017-01-23,23,1,0,1485209843,1.485188e+09,1.485185e+09,1485161078,15021.548332,...,417.176471,186.0,27,581.437897,3.380700e+05,2.100940,1.860818,2020,17,118.823529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316472,50001841730,2017-01-05,5,1,3,1483651711,1.483615e+09,1.483616e+09,1483577863,22218.174164,...,501.142857,12.0,1,1312.745044,1.723300e+06,7.759890,3.039919,5097,14,364.071429
316473,50001841970,2017-01-29,29,1,6,1485729375,1.485695e+09,1.485690e+09,1485674463,14757.261257,...,27.484615,4.0,1,60.526947,3.663511e+03,18.366679,3.922053,429,130,3.300000
316474,50001842358,2017-01-23,23,1,0,1485209017,1.485176e+09,1.485178e+09,1485150603,18608.005071,...,84.131579,14.0,1,149.069954,2.222185e+04,7.461952,2.646504,806,76,10.605263
316475,50001842551,2017-01-23,23,1,0,1485208600,1.485179e+09,1.485186e+09,1485143727,23591.244338,...,323.297297,40.0,3,640.213725,4.098736e+05,13.356099,3.470953,3467,37,93.702703
