## Task
1. Identify all shops that are deemed to have conducted order brushing.
2. For each shop that is identified to have conducted order brushing, identify the buyers suspected to have conducted order brushing for that shop.

Definition of order brushing
- concentration_rate >= 3
- concentration_rate = num_orders_1hr / num_unique_buyer_1hr
- **suspicious buyers** are deemed to be the buyer that contributed the highest proportion of orders to a shop

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [25]:
df = pd.read_csv('order_brush_order.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222750 entries, 0 to 222749
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   orderid     222750 non-null  int64 
 1   shopid      222750 non-null  int64 
 2   userid      222750 non-null  int64 
 3   event_time  222750 non-null  object
dtypes: int64(3), object(1)
memory usage: 6.8+ MB


In [5]:
df.describe()

Unnamed: 0,orderid,shopid,userid
count,222750.0,222750.0,222750.0
mean,31300270000000.0,94331170.0,98028800.0
std,122277400000.0,56957900.0,68390480.0
min,31075200000000.0,10009.0,10007.0
25%,31203600000000.0,49802670.0,35081270.0
50%,31305610000000.0,90336360.0,93096250.0
75%,31406040000000.0,147505300.0,159061200.0
max,31507200000000.0,215435200.0,215526200.0


In [6]:
df.event_time.min(), df.event_time.max()

('2019-12-27 00:00:00', '2019-12-31 23:59:56')

In [31]:
df['event_time'] = df['event_time'].astype('datetime64[ns]')
df = df.set_index('event_time').sort_index()

### Trying to define whether order brushing has occurred

> Detect instataneous concentration rate spikes

In [79]:
df.head()

Unnamed: 0_level_0,orderid,shopid,userid
event_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-12-27 00:00:00,31075200506751,6042309,97707522
2019-12-27 00:00:00,31075200506752,104804492,97707522
2019-12-27 00:00:00,31075200506753,8715449,97707522
2019-12-27 00:00:02,31075201870570,190969466,170182475
2019-12-27 00:00:05,31075205798264,2859407,12532131


In [90]:
def nunique(arr):
    return len(set(arr))

orders_shop_1h = df.groupby('shopid').rolling('1h', min_periods=3).orderid.count()
buyers_shop_1h = df.groupby('shopid').rolling('1h', min_periods=3).userid.apply(nunique)

In [91]:
orders_shop_1h

shopid     event_time         
10009      2019-12-27 03:06:50   NaN
10051      2019-12-27 19:16:11   NaN
           2019-12-29 01:56:19   NaN
10061      2019-12-28 09:27:55   NaN
           2019-12-28 12:05:32   NaN
                                  ..
214949521  2019-12-31 20:06:43   NaN
214964814  2019-12-29 22:26:16   NaN
215175775  2019-12-31 09:06:31   NaN
           2019-12-31 14:14:37   NaN
215435223  2019-12-31 18:34:56   NaN
Name: orderid, Length: 222750, dtype: float64

In [132]:
concentration = orders_shop_1h / buyers_shop_1h
concentration_spikes = concentration >= 1.5
concentration_events = concentration_spikes.index[concentration_spikes].values

In [133]:
concentration_events

array([(10084, Timestamp('2019-12-28 22:34:49')),
       (10151, Timestamp('2019-12-29 02:29:23')),
       (10151, Timestamp('2019-12-30 21:14:46')), ...,
       (213141071, Timestamp('2019-12-27 01:42:23')),
       (213900783, Timestamp('2019-12-29 23:04:33')),
       (214432425, Timestamp('2019-12-31 23:46:35'))], dtype=object)

In [134]:
events = []

for shopid, timestamp in concentration_events:
    event_start = timestamp - pd.Timedelta('1 hours')
    event_end = timestamp
    event = df[(df['shopid'] == shopid) & (df.index >= event_start) & (df.index <= event_end)]
    events.append(event)
    
df_events = pd.concat(events)

df_events.shape

(9039, 3)

In [135]:
df_events.head()

Unnamed: 0_level_0,orderid,shopid,userid
event_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-12-28 21:55:12,31240511163713,10084,167932181
2019-12-28 22:29:57,31242597715742,10084,180772892
2019-12-28 22:34:49,31242888812765,10084,180772892
2019-12-29 02:00:32,31255231246682,10151,43528657
2019-12-29 02:25:31,31256731116451,10151,91471576


In [142]:
def extract_common_value(x):
    mode_list = x.mode()
    if len(mode_list) <= 1:
        return mode_list[0]
    else:
        return '&'.join([str(c) for c in sorted(mode_list)])

In [143]:
order_brush = df_events.groupby('shopid')['userid'].apply(extract_common_value)
order_brush = order_brush.to_frame()

In [144]:
order_brush.sample(10)

Unnamed: 0_level_0,userid
shopid,Unnamed: 1_level_1
11284464,126654586
111184668,16479121
111674507,74505142
133312433,16798544
131393410,61285147
107234749,194625392
64881881,75582248
166488971,86341361
19216692,193338089
132716330,55375223


In [145]:
order_brush.shape

(1307, 1)

### Preparing for Submission

In [140]:
def make_new_submission(num, order_brush):
    shops_all = df['shopid'].unique()

    submission = pd.DataFrame({'shopid': shops_all, 'userid': np.zeros(len(shops_all))})

    submission.userid = submission.userid.astype('int')
    submission.set_index('shopid', inplace=True)
    submission.update(order_brush)
    submission = submission.reset_index()
    
    submission.to_csv('submission{}.csv'.format(num), index=False)
    return submission

In [146]:
new_submission = make_new_submission(8, order_brush)

new_submission.shape

(18770, 2)