In [1]:
# Joey's first pass
import pandas as pd
from random import randint
import numpy as np

def find_duplicates(x):
    """Mark which entries are NOT a repeated 1 directly after another 1.

    Parameters
    ----------
    x : sequence of flags (list, NumPy array or pandas Series)
        Entries are compared with ``1``. The comparison is purely positional,
        so a Series may carry any index (dates, non-contiguous integers, ...).

    Returns
    -------
    list of bool
        One entry per element of ``x``. ``False`` when the element and the one
        immediately before it both equal 1, ``True`` otherwise. The first
        element is always ``True`` because it has no predecessor.
    """
    # Work on a plain array: integer subscripts on a Series are label lookups
    # (or a deprecated positional fallback), not guaranteed positions.
    flags = np.asarray(x)
    is_one = flags == 1

    keep = np.ones(len(flags), dtype=bool)
    # Position i (i >= 1) is dropped only if both i and i-1 hold a 1.
    keep[1:] = ~(is_one[1:] & is_one[:-1])
    return keep.tolist()

# Toy example: a single customer (477) with ten daily flag values.
churn_data = {"Cust": [477] * 10, "flag": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1]}
df = pd.DataFrame(
    data=churn_data,
    index=pd.date_range(start='1/1/2018', end='1/10/2018'),
)
# Store the per-row result of find_duplicates next to the flags it was computed from.
df.loc[:, 'duplicate_flg'] = find_duplicates(df["flag"])

In [2]:
# Take 2 - scaling up! How do we deal with multiple customers? The production dataset is 2M rows.
def build_rand_dataset(max_num_custs=10000, max_rec_per_cust=200, seed=None):
    """Build a random multi-customer flag dataset for testing.

    Every customer gets between 0 and ``max_rec_per_cust`` consecutive daily
    records starting on 2021-01-01, each with a random 0/1 ``flag``. With the
    defaults the result therefore has at most 10,000 * 200 = 2,000,000 rows.

    Parameters
    ----------
    max_num_custs : int
        Number of customers to generate. Customers that draw zero records do
        not appear in the output.
    max_rec_per_cust : int
        Inclusive upper bound on the number of records per customer.
    seed : int or None
        When given, a private ``random.Random(seed)`` generator is used so the
        dataset is reproducible. When ``None`` (default) the module-level
        ``randint`` is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``cust_num`` (zero-padded string, 7 characters for fewer than
        10**7 customers), ``dates`` and ``flag``, with a fresh 0..n-1 index.
        An empty frame with these columns is returned when no records are
        generated.
    """
    columns = ['cust_num', 'dates', 'flag']

    if seed is None:
        draw = randint
    else:
        from random import Random
        draw = Random(seed).randint

    dataset = []
    for cust in range(0, max_num_custs):
        # random number of records for this customer
        num_records = draw(0, max_rec_per_cust)
        if num_records == 0:
            # Nothing to add; skipping also keeps empty frames out of the concat.
            continue
        # The loop counter is the customer number, so every generated customer
        # is unique. Drawing it at random from 0..1000 made different customers
        # share a number (and dates) as soon as more than 1001 were requested.
        cust_num = str(cust).zfill(7)
        # build fake customer data
        data = {'cust_num': [cust_num] * num_records,
                'dates': pd.date_range(periods=num_records, freq='D', start='2021-01-01'),
                'flag': [draw(0, 1) for _ in range(num_records)]}
        # build dataframe & store
        dataset.append(pd.DataFrame(data))

    if not dataset:
        # pd.concat raises on an empty list; hand back an empty, well-formed frame.
        return pd.DataFrame(columns=columns)
    # ignore_index avoids repeating each customer's 0..k-1 index in the result.
    return pd.concat(dataset, ignore_index=True)

In [None]:
# Generate the random test set with the generator's default sizes spelled out:
# at most 10,000 customers x 200 records each = 2 million rows.
test_data = build_rand_dataset(max_num_custs=10000, max_rec_per_cust=200)
test_data

In [None]:
# Lag the flag column by one row within each cust_num partition so every row can be
# compared with the row before it.
# The first row of each partition has no predecessor, so shift() leaves NaN there; it is
# filled with 0 because a first row can never be a duplicate.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on test_data['lagged_flag']: that selection can be a temporary
# object (it always is under pandas Copy-on-Write), in which case the in-place fill
# never reaches test_data and the NaNs silently remain.
test_data['lagged_flag'] = test_data.groupby(['cust_num'])['flag'].shift(1).fillna(0)
test_data

In [None]:
# Pull both columns out as NumPy arrays so the row-by-row comparison can be done as one
# vectorised operation instead of a Python loop.
flag, lagged_flag = (np.array(test_data[col]) for col in ('flag', 'lagged_flag'))
lagged_flag, flag

In [None]:
# A row is a consecutive duplicate only when its flag AND the previous row's flag
# (lagged_flag) are both 1; every other combination (0-0, 1-0, 0-1) is kept.
# The mask is built from test_data's current columns rather than from arrays captured
# earlier, so it always has the same length as the frame and this cell can be re-run
# after test_data has already been filtered.
# Testing each side against 1 (instead of checking that the sum is below 2) also keeps
# a row whose lagged_flag is NaN, since NaN is never equal to 1.
is_repeat = (test_data['flag'].to_numpy() == 1) & (test_data['lagged_flag'].to_numpy() == 1)
test_data = test_data[~is_repeat]
test_data