In [1]:
from typing import List, Optional, Generator
from datetime import datetime, timezone, timedelta
import random
import uuid
import hashlib
import os

import numpy as np
import pandas as pd
from dask.distributed import Client
from pathlib import Path
import dask.dataframe as dd
from dask import delayed


In [2]:
# Create a Dask distributed client with default arguments; every Dask computation
# triggered later in the notebook is submitted through it.
client = Client()
# Bare expression so the notebook renders the client summary (scheduler address,
# dashboard link, worker/core/memory counts) as the cell output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:41121  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 12  Memory: 33.54 GB


In [3]:
def calculate(df, count_cols, stat_cols, by):
    """Join running distinct-count and numeric-stat features back onto ``df``.

    Args:
        df: Frame whose index holds the grouping key and which has an
            ``order_id`` column (plus whatever the two helpers require).
        count_cols: Columns passed to ``count_distinct_values``.
        stat_cols: Columns passed to ``num_stats``.
        by: Name of the grouping-key column in the helper outputs; also used
            to build the suffix given to clashing feature columns.

    Returns:
        ``df`` merged with the per-order features.
    """
    distinct_counts = count_distinct_values(df, count_cols, by)
    running_stats = num_stats(df, stat_cols, by)
    # No explicit keys: pandas joins the two feature frames on the columns they share.
    features = distinct_counts.merge(running_stats)
    # The left side is keyed by df's index values plus order_id; feature columns
    # whose names collide with columns of df receive the f's_by_{by}' suffix.
    return df.merge(
        features,
        left_on=[df.index, 'order_id'],
        right_on=[by, 'order_id'],
        suffixes=('', f's_by_{by}'),
    )
        

def count_distinct_values(df, cols, by):
    """Running count of distinct values per index key, one row per order.

    Rows are grouped by (index value, ``order_date``, ``order_id``). For each
    group the values of a column are accumulated, in sorted group order, with
    all earlier groups of the same index value, and the number of distinct
    accumulated values is reported.

    Args:
        df: pandas DataFrame with ``order_date`` and ``order_id`` columns. Its
            index must be named ``by``; otherwise the final column selection
            raises ``KeyError``.
        cols: Non-empty list of column names to count distinct values for.
        by: Name of the index of ``df``.

    Returns:
        DataFrame with columns ``[by, 'order_id', *cols]`` where each ``cols``
        entry holds the running distinct count.
    """
    order_keys = [df.index, df.order_date, df.order_id]
    results = []
    for col in cols:
        # One list of raw values per (key, order_date, order_id), in sorted order.
        per_order = df.groupby(order_keys)[col].apply(list).sort_index()
        # cumsum over an object Series of lists concatenates them, giving the
        # full history of values up to each order. group_keys=False keeps the
        # original three index levels: since pandas 2.0 the default would
        # prepend the group key as a duplicate level and break reset_index().
        history = per_order.groupby(level=0, group_keys=False).apply(np.cumsum)
        results.append(history.apply(lambda seen: len(set(seen))))

    counts = results[0].to_frame()
    for col, result in zip(cols[1:], results[1:]):
        counts[col] = result

    return counts.reset_index()[[by, 'order_id', *cols]]

def num_stats(df, cols, by):
    """Running mean/std/min/max/sum per index key, one row per order.

    Rows are grouped by (index value, ``order_date``, ``order_id``). For each
    group the values of a column are accumulated, in sorted group order, with
    all earlier groups of the same index value, and summary statistics are
    computed over the accumulated values.

    Args:
        df: pandas DataFrame with ``order_date`` and ``order_id`` columns. Its
            index must be named ``by``; otherwise the column selection after
            ``reset_index`` raises ``KeyError``.
        cols: Non-empty list of numeric column names to summarise.
        by: Name of the index of ``df``; also used in the output column names.

    Returns:
        DataFrame with columns ``[by, 'order_id', *cols]`` followed, for each
        column, by ``{col}_mean_by_{by}``, ``{col}_std_by_{by}``,
        ``{col}_min_by_{by}``, ``{col}_max_by_{by}`` and ``{col}_sum_by_{by}``.
        The ``col`` column itself holds the string form of the
        (mean, std, min, max, sum) tuple.
    """
    order_keys = [df.index, df.order_date, df.order_id]
    results = []
    for col in cols:
        # One list of raw values per (key, order_date, order_id), in sorted order.
        per_order = df.groupby(order_keys)[col].apply(list).sort_index()
        # cumsum over an object Series of lists concatenates them, giving the
        # full history of values up to each order. group_keys=False keeps the
        # original three index levels: since pandas 2.0 the default would
        # prepend the group key as a duplicate level and break reset_index().
        history = per_order.groupby(level=0, group_keys=False).apply(np.cumsum)
        results.append(history.apply(
            lambda x: (np.mean(x), np.std(x), np.min(x), np.max(x), np.sum(x))))

    stats = results[0].to_frame()
    for col, result in zip(cols[1:], results[1:]):
        stats[col] = result

    stats = stats.reset_index()[[by, 'order_id', *cols]]
    # Order matches the positions in the tuple built above.
    stat_names = ('mean', 'std', 'min', 'max', 'sum')
    for col in cols:
        summaries = stats[col]
        for pos, stat_name in enumerate(stat_names):
            # pos is bound as a default argument so each lambda keeps its own position.
            stats[f'{col}_{stat_name}_by_{by}'] = summaries.apply(lambda x, pos=pos: x[pos])
        # The tuple column is kept, but as its string representation.
        stats[col] = summaries.apply(str)

    return stats


In [4]:
# Load the two stored feature sets as Dask DataFrames: one from the customer_id
# feature directory and one from the email feature directory.
df1 = dd.read_parquet(f'./data/features/customer_id/', engine='fastparquet')
df2 = dd.read_parquet(f'./data/features/email/', engine='fastparquet')


# Disabled feature-generation step, kept for reference.
# NOTE(review): it refers to `df` and `data_path`, neither of which is defined in the
# cells before this one — presumably this is how the customer_id feature set loaded
# above was produced; confirm (and define both names) before re-enabling.
#df = df.map_partitions(calculate, count_cols=['email', 'ip_addr'], stat_cols=['order_amount'], by='customer_id')
#df.to_parquet(f'{data_path}/features/customer_id/')
#df.head()

  Numpy8 = numba.jitclass(spec8)(NumpyIO)
  Numpy32 = numba.jitclass(spec32)(NumpyIO)
  from pandas.core.index import CategoricalIndex, RangeIndex, Index, MultiIndex


In [5]:
# Combine both feature sets on order_id. Columns present in both frames keep
# their name from df1, while the df2 copy gets a trailing underscore.
df = df1.merge(
    df2,
    on='order_id',
    suffixes=('', '_'),
)

In [6]:
num_folds = 10              # number of folds; labels run 0 .. num_folds - 1
subsample_not_fraud = 0.2   # probability of keeping a non-fraud row in the sample

# NOTE(review): `random` is not seeded in this cell, so labels, sample membership and
# fold assignment differ between runs; the lambdas run wherever Dask executes the
# partitions, so seeding only the notebook process would presumably not be enough.

# Rule-based candidate flag: many distinct emails + IP addresses for the customer, or an
# order amount more than 10% above the per-email mean combined with more than 5 emails.
df['is_fraud'] = ((df.emails_by_customer_id + df.ip_addrs_by_customer_id) > 12) | \
    ((df.order_amount > 1.1 * df.order_amount_mean_by_email) & (df.emails_by_customer_id > 5))
# Randomly keep about half of the candidates as positives (stored as 0/1).
df['is_fraud'] = df['is_fraud'].apply(lambda x: int(x & (random.random() > 0.5)), meta=('is_fraud', int))
# Every fraud row is sampled; non-fraud rows are sampled with probability subsample_not_fraud.
df['sample'] = df['is_fraud'].apply(lambda x: int((x > 0) | (random.random() > (1.-subsample_not_fraud))), meta=('sample', int))
# randrange(num_folds) yields exactly num_folds distinct folds. randint(0, num_folds)
# includes the upper bound and therefore produced num_folds + 1 folds.
df['fold'] = df['is_fraud'].apply(lambda x: random.randrange(num_folds), meta=('fold', int))

In [7]:
# Keep only the rows flagged for the sample, index them by fold, and write the
# result out as parquet.
df_sample = df[df['sample'] > 0].set_index('fold')
df_sample.to_parquet('./data/labeled/sampled/')