In [None]:
## Import pandas as pd and numpy as np and read csv file

In [32]:
import pandas as pd
import numpy as np

# Load the raw order log; later cells read its event_time, orderid, shopid and userid columns.
dataBin = pd.read_csv('order_brush_order.csv')

**Process:** Parsing Time

In [3]:
from datetime import datetime

In [33]:
dFormat = '%Y-%m-%d %H:%M:%S'

# Parse every timestamp exactly once (vectorized) instead of running
# datetime.strptime three times per row, then split out the pieces the
# detection logic groups on. The int64 cast keeps the same column dtype
# that the row-wise int(...) conversion produced.
eventTimes = pd.to_datetime(dataBin['event_time'], format=dFormat)
dataBin['date'] = eventTimes.dt.day.astype('int64')       # day of month
dataBin['hour'] = eventTimes.dt.hour.astype('int64')      # hour of day, 0-23
dataBin['minute'] = eventTimes.dt.minute.astype('int64')  # minute of hour, 0-59

**Process:** Drop the raw event time and the orderid (it is insignificant for this analysis). The year and the month are never extracted as columns, since all records are from 2019 and all are from month 12.

In [5]:
# The day/hour/minute columns now carry the timing information, so the raw
# timestamp and the order id are removed.
dataBin = dataBin.drop(['event_time', 'orderid'], axis='columns')

In [6]:
# Inspect the frame after the timestamp split and the column drop.
dataBin

Unnamed: 0,shopid,userid,date,hour,minute
0,93950878,30530270,27,0,23
1,156423439,46057927,27,11,54
2,173699291,67341739,27,13,22
3,63674025,149380322,27,13,1
4,127249066,149493217,27,11,37
...,...,...,...,...,...
222745,110868129,193218997,28,23,17
222746,91639906,3541807,28,19,7
222747,29391773,135908070,28,8,17
222748,61556313,27306783,28,10,14


**Overview:**
Since the logic of the code is to iterate over all shops over the scope of time, we need to get a list of all the unique shops

In [29]:
# Distinct shop ids present in the data; the final output table is built from this list.
listShop = dataBin['shopid'].unique()

**Optimization:** Order brushing requires an order-to-buyer ratio of 3 or above, so a shop with fewer than 3 transactions in total cannot have conducted order brushing. We can therefore filter those shops out up front.

In [30]:
def shopFilter(listShop, data=None, minOrders=3):
    """Keep only the shops that have enough orders to be worth checking.

    Parameters
    ----------
    listShop : iterable
        Shop ids to consider. Every id must occur in ``data['shopid']``;
        an unknown id raises ``KeyError`` from the count lookup.
    data : pandas.DataFrame, optional
        Order table with a ``shopid`` column. Defaults to the notebook-level
        ``dataBin`` so existing ``shopFilter(listShop)`` calls keep working.
    minOrders : int, optional
        Minimum number of orders a shop needs in order to be kept (default 3).

    Returns
    -------
    dict
        Maps each qualifying shop id to its order count, in ``listShop`` order.
    """
    if data is None:
        # Fall back to the notebook's global order table.
        data = dataBin

    # One pass over the table: number of rows per shop id.
    shopCounter = data.groupby('shopid')['shopid'].agg('count')

    shopsQualified = dict()

    for shop in listShop:
        orderCount = shopCounter[shop]  # look the count up once per shop
        if (orderCount >= minOrders):
            shopsQualified[shop] = orderCount

    return shopsQualified

In [31]:
# Shops with at least 3 orders, mapped to their order count; only these are scanned for brushing.
listShopToCheck = shopFilter(listShop)

**Process:** Now, the real chunk of code, Order Brushing Logic

    1. Iterate over all shops
    2. Sort the selected shop's orders by date and time
    3. Iterate over each day and each hour within it
        a. Count all the transactions in that hour
        b. Count all the unique buyers in that hour
        c. If the ratio of transactions to buyers is 3 or above, mark the hour as order brushing
            i. If order brushing, get the suspect buyer(s) of the period (buyers with 3 or more orders in that hour) and put them in a list
    4. After looping through all hours, pick from the list the buyer(s) flagged most often as the shop's order brusher(s)

In [79]:
def getBestOrderBrusher(brushersList):
    """Return the most frequently listed brusher(s).

    Counts how many times each entry occurs in ``brushersList`` and returns
    every entry that reaches the highest count, ordered by first appearance.
    An empty input yields an empty list.
    """
    tally = {}
    for suspect in brushersList:
        tally[suspect] = tally.get(suspect, 0) + 1

    # Nothing was flagged, so there is no winner to report.
    if not tally:
        return []

    topCount = max(tally.values())

    # Dicts keep insertion order, so ties come out in first-seen order.
    return [suspect for suspect, hits in tally.items() if hits == topCount]
            

In [80]:
def getOrderBrusher(userNames, userCounts):
    """Return the users from ``userNames`` whose count is 3 or more.

    ``userCounts`` is indexed by user (``userCounts[name]``) and gives that
    user's number of orders. The result keeps the order of ``userNames``.
    """
    return [name for name in userNames if userCounts[name] >= 3]
            

In [81]:
def checkOrderBrushing(realtimeCheck = False):
    """Find the most prevalent suspected order brusher(s) for every shop to check.

    For each shop in the notebook-level ``listShopToCheck``, the shop's rows
    from the notebook-level ``dataBin`` are split into (date, hour) buckets.
    A bucket counts as order brushing when its number of transactions divided
    by its number of distinct buyers is 3 or more; the buyers with at least 3
    orders in that bucket (see ``getOrderBrusher``) are collected as suspects.
    The suspects of all buckets are then reduced with ``getBestOrderBrusher``.

    Parameters
    ----------
    realtimeCheck : bool, optional
        When True, print the shop id, transaction count and unique-buyer
        count of every bucket that is flagged.

    Returns
    -------
    dict
        Maps each shop id in ``listShopToCheck`` to a list of user ids
        (empty when no bucket was flagged).
    """
    orderBrushers = dict()

    # Sort once and bucket the rows per shop once. Filtering the whole table
    # with a boolean mask for every shop costs (shops x rows) comparisons;
    # a single groupby gives each shop's rows directly. Rows keep their
    # sorted order inside every group, so buyers are still seen
    # chronologically.
    ordered = dataBin.sort_values(by = ['date', 'hour', 'minute'])
    rowsByShop = ordered.groupby('shopid', sort = False)

    for shop in listShopToCheck:
        try:
            curShop = rowsByShop.get_group(shop)
        except KeyError:
            # Shop has no rows in dataBin: nothing can be flagged for it.
            orderBrushers[shop] = list()
            continue

        suspects = list()

        # Groups are yielded in ascending (date, hour) order.
        for _, hourRows in curShop.groupby(['date', 'hour']):
            transactions = hourRows.shape[0]
            unique_buyers = hourRows['userid'].unique()

            # Floor division is enough here: floor(t / b) >= 3 exactly when t / b >= 3.
            if (transactions // len(unique_buyers) >= 3):
                if (realtimeCheck):
                    print("Shopid: ", shop, " data: " , transactions, len(unique_buyers))

                # Orders per buyer within this hour.
                ordersPerBuyer = hourRows.groupby('userid')['userid'].agg('count')

                suspects = suspects + getOrderBrusher(unique_buyers, ordersPerBuyer)

        orderBrushers[shop] = getBestOrderBrusher(suspects)

    return orderBrushers
        

In [83]:
# Run the detection without per-hit printing (realtimeCheck defaults to False);
# the result maps each checked shop id to its list of suspected user ids.
brusherSearchResult = checkOrderBrushing()

**Process:** We put brusherSearchResult into a temporary DataFrame (dataBin2), then left-join it onto the list of all shops so that shops without a detected brusher are reported as 0

In [164]:
# Scratch frame that the next cells fill with one row per checked shop.
dataBin2 = pd.DataFrame()

In [165]:
# Iterating a dict yields its keys, i.e. the checked shop ids.
dataBin2['shopid'] = list(brusherSearchResult)

In [166]:
# Suspect lists in the same (insertion) order as the shop ids above.
dataBin2['userid_raw'] = list(brusherSearchResult.values())

In [167]:
def userIDFormatWriter(List):
    """Format a list of user ids for the output file.

    Returns the ids converted to strings and joined with "&" (for example
    ``[1, 22]`` becomes ``"1&22"``), or the integer ``0`` when the list is
    empty.
    """
    # No suspects: the output uses a plain 0 as the placeholder.
    if (len(List) == 0):
        return 0

    return "&".join(str(ele) for ele in List)

In [169]:
# Turn each suspect list into its "&"-joined text form (or 0 when empty).
dataBin2['userid'] = dataBin2['userid_raw'].apply(userIDFormatWriter)

In [170]:
# userid_raw was only needed to build the formatted userid column.
dataBin2 = dataBin2.drop(['userid_raw'], axis='columns')

In [171]:
# Output frame; the following cells add the shop ids and the merged user ids.
dataBinFinal = pd.DataFrame()

In [172]:
# Start from every shop in the data, not only the ones that were checked.
dataBinFinal['shopid'] = listShop

In [173]:
# Left join keeps every shop; the frames are matched on the column(s) they share.
dataBinFinal = dataBinFinal.merge(dataBin2, how='left')

In [174]:
# Shops that were never checked have no match in the left join; report them as 0
# as well, the same placeholder userIDFormatWriter uses for "no brusher".
dataBinFinal['userid'] = dataBinFinal['userid'].fillna(0)

In [175]:
# Preview the final table before writing it out.
dataBinFinal

Unnamed: 0,shopid,userid
0,93950878,0
1,156423439,0
2,173699291,0
3,63674025,0
4,127249066,0
...,...,...
18765,163337224,0
18766,772599,0
18767,50236030,0
18768,203587596,0


In [176]:
# Write the shopid/userid table to disk without the DataFrame index column.
dataBinFinal.to_csv("OrderBrushingTry1.csv", index=False)