In [1]:
import numpy as np
import pandas as pd
from datetime import timedelta

## Functions

In [2]:
# Get event time of previous order from the same shopid
def get_previous(data, shopid, event_time):
    """Return the latest order time of ``shopid`` strictly before ``event_time``.

    Parameters
    ----------
    data : pandas.DataFrame
        Orders; must contain the columns ``shopid`` and ``event_time``.
    shopid
        Shop whose earlier orders are searched.
    event_time
        Reference time; only orders strictly earlier than it are considered,
        so orders sharing this exact time are ignored.

    Returns
    -------
    pandas.Timestamp or None
        The most recent earlier order time, or ``None`` when the shop has no
        earlier order.
    """
    # Get all previous orders
    earlier = data.loc[
        (data['shopid'] == shopid) & (data['event_time'] < event_time),
        'event_time'
    ]

    if earlier.empty:
        return None

    # Take the maximum instead of the last row, so the answer does not
    # depend on ``data`` already being sorted by event time
    return pd.Timestamp(earlier.max())

In [3]:
# Get the orders of one shop inside a time window
def get_subset(data, shopid, start_time, end_time):
    """Select the rows of ``data`` for ``shopid`` within a closed time window.

    Parameters
    ----------
    data : pandas.DataFrame
        Orders; must contain the columns ``shopid`` and ``event_time``.
    shopid
        Shop to keep.
    start_time, end_time
        Window bounds; both ends are inclusive.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index and column order.
    """
    same_shop = data['shopid'] == shopid
    not_before_start = data['event_time'] >= start_time
    not_after_end = data['event_time'] <= end_time

    return data.loc[same_shop & not_before_start & not_after_end]

In [4]:
# Get concentrate rate
def get_rate(data):
    """Compute the concentrate rate: number of orders per unique ``userid``.

    Parameters
    ----------
    data : pandas.DataFrame
        Orders; must contain the column ``userid``.

    Returns
    -------
    float or int
        Row count divided by the number of distinct ``userid`` values, or
        ``0`` when ``data`` has no rows.
    """
    order_count = data.shape[0]

    # No orders: report a zero rate rather than dividing by zero
    if order_count <= 0:
        return 0

    unique_users = np.unique(data['userid'])
    return order_count / len(unique_users)

In [5]:
# Get suspicious orders
def get_orders(data, output):
    """Append the orders of the user(s) with the most orders in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Orders; must contain the columns ``shopid``, ``userid`` and
        ``orderid``.
    output : list
        Orders collected so far; it is not modified in place.

    Returns
    -------
    list
        A new list: ``output`` followed by one ``[shopid, userid, orderid]``
        entry per order of every user tied for the highest order count.
    """
    # Orders per user, and the users that reach the maximum
    per_user = data.groupby('userid')['orderid'].count()
    top_users = per_user[per_user == per_user.max()].index

    # Orders placed by those users, in the row order of ``data``
    flagged = data.loc[
        data['userid'].isin(top_users),
        ['shopid', 'userid', 'orderid']
    ]

    return output + flagged.values.tolist()

In [6]:
# Find consecutive orders from the same userid
def find_consecutive(data, output, userid, previous_time):
    """Flag a leading run of at least three orders placed by ``userid``.

    Columns of ``data`` are read by position: 2 is compared against
    ``userid``, 3 is used as the order time, and positions ``[1, 2, 0]`` are
    emitted (``[shopid, userid, orderid]`` in this notebook's column order).

    Parameters
    ----------
    data : pandas.DataFrame
        Orders of one time window, in the order they should be scanned.
    output : list
        Orders collected so far; it is not modified in place.
    userid
        User whose run at the start of ``data`` is counted.
    previous_time : pandas.Timestamp, None or NaT
        Time of the order preceding the window, if any.

    Returns
    -------
    list
        ``output`` itself when nothing is flagged, otherwise a new list with
        the run's orders appended.
    """
    total_orders = data.shape[0]

    # Count the leading rows that belong to userid
    count = 0
    while count < total_orders and data.iloc[count, 2] == userid:
        count = count + 1

    # Fewer than 3 consecutive orders: nothing to flag
    if count < 3:
        return output

    # When another order follows the run and the previous order time is
    # known, flag the run only if those two orders are more than one hour
    # apart. When the run reaches the end of ``data`` there is no following
    # order to compare with (indexing it used to raise IndexError), so the
    # run is flagged.
    if count < total_orders and not pd.isnull(previous_time):
        delta = data.iloc[count, 3] - previous_time
        if not delta > timedelta(hours=1):
            return output

    # Get orders from suspicious userid
    orders = data.iloc[:count, [1, 2, 0]].values.tolist()

    return output + orders

In [7]:
# Find order brushing
def find_brushing(data, row):
    """Collect the suspicious orders found in the windows anchored at ``row``.

    Parameters
    ----------
    data : pandas.DataFrame
        Full order table, forwarded to ``get_subset``.
    row : pandas.Series
        One row of ``data``, read by position. In this notebook's column
        order the positions are: 1 shopid, 2 userid, 3 event_time,
        4 end_interval_1, 5 previous_time, 6 end_interval_2.
        ``row.name`` must be an integer; it drives the progress log.

    Returns
    -------
    list
        ``[shopid, userid, orderid]`` entries produced by ``get_orders`` and
        ``find_consecutive``.
    """
    # Read fields through .iloc: integer keys on a label-indexed Series
    # (row[1], row[3], ...) rely on a positional fallback that pandas has
    # deprecated
    shopid = row.iloc[1]
    userid = row.iloc[2]
    event_time = row.iloc[3]
    end_interval_1 = row.iloc[4]
    previous_time = row.iloc[5]
    end_interval_2 = row.iloc[6]

    # Initialize output
    output = []

    # Time period based on time event
    subset = get_subset(data, shopid, event_time, end_interval_1)
    rate = get_rate(subset)
    if rate >= 3:
        output = get_orders(subset, output)
    else:
        # Find consecutive orders
        output = find_consecutive(subset, output, userid, previous_time)

    # Time period based on time event of previous record
    if not pd.isnull(end_interval_2):
        subset = get_subset(data, shopid, event_time, end_interval_2)
        rate = get_rate(subset)
        if rate >= 3:
            output = get_orders(subset, output)

    # Show log
    if (row.name+1) % 10000 == 0:
        print(str(row.name+1), 'rows checked')

    return output

## Data Understanding

In [8]:
# Read the raw order records from the csv file
dataset = pd.read_csv(filepath_or_buffer='order_brush_order.csv')

In [9]:
# Preview the first rows of the raw dataset
dataset.head()

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


In [10]:
# Summarise the columns: dtypes, non-null counts and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222750 entries, 0 to 222749
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   orderid     222750 non-null  int64 
 1   shopid      222750 non-null  int64 
 2   userid      222750 non-null  int64 
 3   event_time  222750 non-null  object
dtypes: int64(3), object(1)
memory usage: 6.8+ MB


In [11]:
# Count the distinct shopid values
# (the submission file should contain exactly this many rows)
len(set(dataset['shopid'].tolist()))

18770

## Data Preparation

In [12]:
# Parse event_time into datetimes; values that do not match the format
# become NaT because of errors='coerce'
dataset['event_time'] = pd.to_datetime(
    dataset['event_time'],
    errors='coerce',
    format='%Y-%m-%d %H:%M:%S'
)

In [13]:
# Order the rows by shop, then chronologically, and renumber the index
dataset = (
    dataset
    .sort_values(by=['shopid', 'event_time'])
    .reset_index(drop=True)
)

In [14]:
# End of the one-hour window that starts at each order
dataset['end_interval_1'] = dataset['event_time'] + pd.Timedelta(hours=1)

In [15]:
# Add new column that indicate the next hour from previous order
# previous_time is the latest order time of the same shop that is strictly
# earlier than the row's own event_time. It is derived once per distinct
# (shopid, event_time) pair with a per-shop shift, instead of re-filtering
# the whole dataset for every row.
distinct_times = (
    dataset[['shopid', 'event_time']]
    .dropna(subset=['event_time'])
    .drop_duplicates()
    .sort_values(by=['shopid', 'event_time'])
)
distinct_times['previous_time'] = (
    distinct_times.groupby('shopid')['event_time'].shift()
)

# A left merge keeps the row order of the left frame, so the looked-up
# values can be assigned back positionally
previous_lookup = dataset[['shopid', 'event_time']].merge(
    distinct_times,
    on=['shopid', 'event_time'],
    how='left',
    validate='many_to_one'
)
dataset['previous_time'] = previous_lookup['previous_time'].to_numpy()
dataset['end_interval_2'] = dataset['previous_time'] + timedelta(hours=1)

In [16]:
# Preview the dataset with the derived time columns
dataset.head()

Unnamed: 0,orderid,shopid,userid,event_time,end_interval_1,previous_time,end_interval_2
0,31086409141107,10009,196962305,2019-12-27 03:06:50,2019-12-27 04:06:50,NaT,NaT
1,31144571933461,10051,2854032,2019-12-27 19:16:11,2019-12-27 20:16:11,NaT,NaT
2,31254979546679,10051,48600461,2019-12-29 01:56:19,2019-12-29 02:56:19,2019-12-27 19:16:11,2019-12-27 20:16:11
3,31195675919209,10061,168750452,2019-12-28 09:27:55,2019-12-28 10:27:55,NaT,NaT
4,31205132327893,10061,194819216,2019-12-28 12:05:32,2019-12-28 13:05:32,2019-12-28 09:27:55,2019-12-28 10:27:55


## Analysis

In [17]:
# Run the detection for every row, then flatten the per-row lists into a
# single list of [shopid, userid, orderid] entries
per_row_orders = dataset.apply(
    lambda row: find_brushing(dataset, row),
    axis=1
)
brushing = [
    order
    for row_orders in per_row_orders
    for order in row_orders
]

10000 rows checked
20000 rows checked
30000 rows checked
40000 rows checked
50000 rows checked
60000 rows checked
70000 rows checked
80000 rows checked
90000 rows checked
100000 rows checked
110000 rows checked
120000 rows checked
130000 rows checked
140000 rows checked
150000 rows checked
160000 rows checked
170000 rows checked
180000 rows checked
190000 rows checked
200000 rows checked
210000 rows checked
220000 rows checked


In [18]:
# Drop repeated entries, then count the flagged orders per (shopid, userid);
# after this step the orderid column holds that count
brushing = (
    pd.DataFrame(brushing, columns=['shopid', 'userid', 'orderid'])
    .drop_duplicates()
    .groupby(['shopid', 'userid'])
    .count()
    .reset_index()
)

In [19]:
# Keep, for each shop, the user(s) whose flagged-order count equals the
# shop's maximum
highest = brushing.groupby('shopid')[['orderid']].max()
suspicious = brushing.merge(highest, on=['shopid', 'orderid'], how='inner')

In [20]:
# Build one row per shop with its suspicious userids joined by '&'
suspicious = suspicious.sort_values(by=['shopid', 'orderid'])
suspicious['userid'] = suspicious['userid'].astype(str)
result = (
    suspicious
    .groupby('shopid')['userid']
    .apply(lambda userids: '&'.join(userids))
    .reset_index()
)

In [21]:
# Left-join the result onto every shopid so that shops without flagged
# orders are kept; their missing userid is filled with 0
shopid_all = list(set(dataset['shopid'].tolist()))
shopid_clean = pd.DataFrame({'shopid': shopid_all})
result = (
    shopid_clean
    .merge(result, on='shopid', how='left')
    .fillna(0)
)

In [22]:
# Write the submission file without the index column
result.to_csv(path_or_buf='submission.csv', index=False)