In [1]:
import os
import dask.dataframe as dd
import pandas as pd
import numpy as np

In [2]:
# Root folder of the data set. Set the ISSP_DATA_ROOT environment variable to
# run the notebook on another machine; the default keeps the original location.
data_root = os.environ.get('ISSP_DATA_ROOT', r'D:\issp_data')

# Folder whose files are fed to preprocess_login (raw input logs).
log_root = os.path.join(data_root, 'raw_tsdr')
# Folder that receives the preprocessed, zip-compressed CSV outputs.
out_root = os.path.join(data_root, 'pre_log')
# Profile CSV from which the monitor-ID list is loaded.
profile_path = os.path.join(data_root, 'zok_ssp_2017.csv')

In [3]:
# Read only the monitor-ID column of the profile CSV (Shift-JIS encoded),
# materialise it, and flatten the single-column frame into a 1-D array.
_id_column = r'モニターID(11桁)'
_profile_ids = dd.read_csv(
    profile_path,
    encoding='shift-jis',
    usecols=[_id_column],
)
id_list = _profile_ids.compute().values.reshape(-1)

In [4]:
def preprocess_login(in_path: str, out_root: str) -> None:
    """Preprocess one raw tab-separated log file into CSV summaries.

    The log at ``in_path`` is read lazily with dask, restricted to five
    columns (renamed from Japanese to English), and filtered to rows whose
    ``Monitor_ID`` is in the module-level ``id_list`` and whose ``APP_Cate``
    is one of the three categories in ``app_cata``. Rows with any missing
    value are dropped. Category and OS names are replaced by integer codes.

    Four files are written into ``out_root``, named after the input file's
    stem. They carry a ``.csv`` name but are written with zip compression:

    * ``<stem>_os.csv``: one row per ``Monitor_ID`` with its ``OS`` code.
    * ``<stem>_<code>.csv`` for each category code: one row per
      ``(Monitor_ID, Date)`` holding the list of timestamps (whole seconds
      since the Unix epoch) and the list of durations.

    Parameters
    ----------
    in_path : str
        Path of the tab-separated input log.
    out_root : str
        Directory that receives the output files.

    Raises
    ------
    KeyError
        At compute time, if a surviving row has an OS name that is not a key
        of ``mobile_os``.
    """
    usecols = [
        r'モニターCue ※納品不可',
        r'日時(yyyy-mm-dd hh:mm:ss)',
        r'接触時間（duration)',
        r'OS（Android or iOS）',
        r'アプリカテゴリ'
    ]
    columns_jp2en = {
        r'モニターCue ※納品不可': r'Monitor_ID',
        r'日時(yyyy-mm-dd hh:mm:ss)': r'Timestamp',
        r'接触時間（duration)': r'Duration',
        r'OS（Android or iOS）': r'OS',
        r'アプリカテゴリ': r'APP_Cate'
    }
    app_cata = {
        r'ツール類': 0,
        r'ソーシャルネットワーキング': 1,
        r'ゲーム': 2
    }
    mobile_os = {
        r'iOS': 0,
        r'Android': 1
    }
    nanoseconds_per_second = 10 ** 9

    def _to_list(values):
        # Collapse one group's values into a plain Python list.
        return values.tolist()

    print(f'>> {in_path}')

    # step 1: read dataframe
    # read only the needed columns and rename them from Japanese to English
    df = dd.read_csv(in_path, sep='\t', usecols=usecols)\
            .rename(columns=columns_jp2en)

    # step 2: filter/remove the data we need/don't need
    # keep only users that appear in the profile ID list
    df = df[df[r'Monitor_ID'].isin(set(id_list))]
    # keep only the top-3 categories, then drop rows with any missing value
    df = df[df[r'APP_Cate'].isin(set(app_cata.keys()))]\
            .dropna()

    # replace category name (str) with label number (int)
    df['APP_Cate'] = df['APP_Cate'].map(lambda cate: app_cata[cate],
                                        meta=('APP_Cate', int))
    # replace os name (str) with number (int); the parameter is deliberately
    # not called `os` so it does not shadow the os module
    df['OS'] = df['OS'].map(lambda os_name: mobile_os[os_name],
                            meta=('OS', int))

    # step 3: create the columns/data we need, or transform datatype, etc
    # the date is the first 10 characters ("yyyy-mm-dd") of the timestamp text
    df['Date'] = df['Timestamp'].map(lambda stamp: stamp[:10],
                                     meta=('Date', str))
    # Convert the timestamp text to whole seconds since the epoch. The cast is
    # to nanosecond resolution explicitly: pandas >= 2.0 honours the requested
    # unit, so a microsecond cast followed by // 10**9 would not give seconds.
    df['Timestamp'] = df['Timestamp']\
        .astype('M8[ns]')\
        .astype(np.int64) // nanoseconds_per_second

    # step 4: generate ret data as result
    # Derive the output stem with os.path so it works for any path separator
    # and any extension length.
    stem = os.path.splitext(os.path.basename(in_path))[0]

    print(r'user_os_list')
    df.drop_duplicates(subset=['Monitor_ID'])\
        .compute()\
        .drop(columns=[r'Timestamp', r'Duration', r'APP_Cate', r'Date'])\
        .to_csv(os.path.join(out_root, f'{stem}_os.csv'),
                header=True, index=False, compression='zip')

    # one output file per category code
    for cate_code in app_cata.values():
        grouped = df[df['APP_Cate'] == cate_code]\
            .groupby(['Monitor_ID', 'Date'])

        # NOTE(review): meta declares int although each group yields a list;
        # left unchanged to keep the existing dask behaviour -- confirm.
        print(r'timestamp', cate_code)
        ts = grouped['Timestamp']\
                .apply(_to_list, meta=('Timestamp', int))\
                .compute()
        print(r'duration', cate_code)
        dr = grouped['Duration']\
                .apply(_to_list, meta=('Duration', int))\
                .compute()

        pd.concat([ts, dr], axis=1)\
            .to_csv(os.path.join(out_root, f'{stem}_{cate_code}.csv'),
                    header=True, compression='zip')

In [5]:
# Preprocess every log file in log_root. os.listdir returns entries in
# arbitrary order, so sort them to make reruns process files in the same order.
for file_nm in sorted(os.listdir(log_root)):
    path = os.path.join(log_root, file_nm)
    # Skip subdirectories and other non-regular entries: only files are logs.
    if not os.path.isfile(path):
        continue
    preprocess_login(path, out_root)

>> D:\issp_data\raw_tsdr\s35809_MobileApp_201701_104628.tsv
user_os_list


KeyboardInterrupt: 

In [None]:
# Inspect one of the generated "_os" tables. The file has a .csv name but was
# written with zip compression, so it has to be read back the same way.
os_csv_path = r'D:\issp_data\pre_log\s35809_MobileApp_201701_104628_os.csv'
df = pd.read_csv(os_csv_path, compression='zip')
df