In [1]:
import pandas as pd

# Data Understanding

In [2]:
# Read the order log CSV into a DataFrame
ORDERS_CSV = 'order_brush_order.csv'
dataset = pd.read_csv(ORDERS_CSV)

In [3]:
# Preview the first five rows of the dataset
dataset.head(n=5)

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


In [4]:
# Show more information about the dataset:
# index range, column names, non-null counts, dtypes and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222750 entries, 0 to 222749
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   orderid     222750 non-null  int64 
 1   shopid      222750 non-null  int64 
 2   userid      222750 non-null  int64 
 3   event_time  222750 non-null  object
dtypes: int64(3), object(1)
memory usage: 6.8+ MB


In [5]:
# Count the distinct shopid values
# (expected to equal the number of rows in the submission file)
dataset['shopid'].nunique()

18770

# Data Preparation

In [6]:
# Parse the event_time strings into datetime values.
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising.
dataset['event_time'] = pd.to_datetime(
    dataset['event_time'],
    errors='coerce',
    format='%Y-%m-%d %H:%M:%S',
)

In [7]:
# Derive the calendar date and the hour component of every order timestamp
dataset = dataset.assign(
    date=lambda frame: frame['event_time'].dt.date,
    hour=lambda frame: frame['event_time'].dt.hour,
)

In [8]:
# Preview the first five rows, now including the date and hour columns
dataset.head(n=5)

Unnamed: 0,orderid,shopid,userid,event_time,date,hour
0,31076582227611,93950878,30530270,2019-12-27 00:23:03,2019-12-27,0
1,31118059853484,156423439,46057927,2019-12-27 11:54:20,2019-12-27,11
2,31123355095755,173699291,67341739,2019-12-27 13:22:35,2019-12-27,13
3,31122059872723,63674025,149380322,2019-12-27 13:01:00,2019-12-27,13
4,31117075665123,127249066,149493217,2019-12-27 11:37:55,2019-12-27,11


# Analysis

In [9]:
# Count orders per (shopid, date, hour, userid) bucket.
# Buckets follow the calendar hour, so orders placed on either side of an
# hour boundary are counted in different rows.
bucket_keys = ['shopid', 'date', 'hour', 'userid']
agg = dataset.groupby(bucket_keys)['orderid'].count().reset_index()
agg.head()

Unnamed: 0,shopid,date,hour,userid,orderid
0,10009,2019-12-27,3,196962305,1
1,10051,2019-12-27,19,2854032,1
2,10051,2019-12-29,1,48600461,1
3,10061,2019-12-28,9,168750452,1
4,10061,2019-12-28,12,194819216,1


In [10]:
# Store userid as text so the ids can be concatenated with "&".join
agg = agg.astype({'userid': str})

In [11]:
# Sorted list of every distinct shopid in the dataset
shopid_unique = sorted(set(dataset['shopid'].tolist()))

# One-column DataFrame holding those shop ids
result = pd.DataFrame({'shopid': shopid_unique})
result.head()

Unnamed: 0,shopid
0,10009
1,10051
2,10061
3,10084
4,10100


In [12]:
# Add a userid column pre-filled with the placeholder string '0'
userid = ['0' for _ in shopid_unique]
result = result.assign(userid=userid)
result.head()

Unnamed: 0,shopid,userid
0,10009,0
1,10051,0
2,10061,0
3,10084,0
4,10100,0


In [13]:
# Shops with at least one (date, hour, userid) bucket holding 3 or more orders
shopid_brushing = list(set(agg.loc[agg['orderid'].ge(3), 'shopid']))

In [14]:
# Get the suspected userid
# For every suspected shop, collect the users that placed 3 or more orders in
# a single (date, hour) bucket and join their ids with "&".
# A user flagged in several buckets of the same shop is listed only once, and
# the rows are grouped in one pass instead of re-scanning `agg` for each shop.
# NOTE(review): ids keep first-seen order; if the submission format expects a
# specific ordering, sort them here - confirm against the task description.
suspicious_rows = agg.loc[agg.orderid >= 3]
userid_by_shop = {
    # dict.fromkeys drops repeated ids while preserving their order;
    # astype(str) is a no-op when userid is already stored as text
    shop: "&".join(dict.fromkeys(ids.astype(str)))
    for shop, ids in suspicious_rows.groupby('shopid')['userid']
}
# Follow the order of shopid_brushing so both lists stay index-aligned
userid_brushing = [userid_by_shop[shop] for shop in shopid_brushing]

In [15]:
# Pair every suspected shop with its "&"-joined user ids
brushing = pd.DataFrame({'shopid': shopid_brushing, 'userid': userid_brushing})

In [16]:
# Left-join the suspects onto the full shop list so every shop keeps a row.
# Both frames carry a userid column, so pandas suffixes them _x and _y.
final = result.merge(brushing, how='left', on='shopid')
final.head()

Unnamed: 0,shopid,userid_x,userid_y
0,10009,0,
1,10051,0,
2,10061,0,
3,10084,0,
4,10100,0,


In [17]:
# Keep the merged-in user ids under the name userid and put 0 wherever a
# shop had no match in the suspects frame
final = (
    final[['shopid', 'userid_y']]
    .rename(columns={'userid_y': 'userid'})
    .fillna(0)
)

In [20]:
# Export to csv
# index=False leaves the DataFrame index out of the written file
final.to_csv('submission.csv', index=False)