In [8]:

import csv 
from datetime import datetime
import re
import timeit
import bisect 
import math

# Column names, in file order, for the pipe-delimited contribution records.
# 'MEMO_TEXT' and 'SUB_ID' are two separate columns; they were previously fused
# into the single string 'MEMO_TEXT,SUB_ID', which left the list one name short
# and made csv.DictReader file the last field under its restkey instead of 'SUB_ID'.
HEADER_NAMES = [
    'CMTE_ID', 'AMNDT_IND', 'RPT_TP', 'TRANSACTION_PGI', 'IMAGE_NUM',
    'TRANSACTION_TP', 'ENTITY_TP', 'NAME', 'CITY', 'STATE',
    'ZIP_CODE', 'EMPLOYER', 'OCCUPATION', 'TRANSACTION_DT', 'TRANSACTION_AMT',
    'OTHER_ID', 'TRAN_ID', 'FILE_NUM', 'MEMO_CD', 'MEMO_TEXT', 'SUB_ID',
]

# (NAME, ZIP_CODE) -> earliest contribution date seen so far (filled by check_repeat).
REPEAT_DICT = {}

# (CMTE_ID, ZIP_CODE, year) -> [sorted amounts, running total, contribution count]
# (filled by add_to_outputs / add_to_outputs_bis).
VALUES_DICT = {}

# Time one end-to-end run of main(). main is defined in a separate cell (In [1]),
# so that cell must already have been executed when this line runs.
timeit.timeit(main, number=1)

0.008564899357953237

In [1]:
def main():
    """Read ``itcont.txt`` and write one summary line per repeat contribution.

    The percentile to report is read as an integer from ``percentile.txt``.
    Each pipe-delimited row of ``itcont.txt`` is reduced to the six fields the
    pipeline uses, validated with ``check_valid_input`` (which also normalises
    the zip code and date in place), and tested with ``check_repeat``. For every
    row identified as a repeat, the list returned by ``add_to_outputs`` is
    written as a pipe-delimited line to ``repeat_donors.txt``.

    Raises:
        OSError: if one of the three files cannot be opened.
        ValueError: if ``percentile.txt`` does not contain an integer.
    """
    # Start from empty lookup tables so that running main() again in the same
    # kernel produces the same output as a fresh run, instead of treating every
    # donor from the previous run as a repeat.
    REPEAT_DICT.clear()
    VALUES_DICT.clear()

    with open('percentile.txt') as perc:
        percentile = int(perc.read())

    # The csv module expects files opened with newline=''; without it the
    # writer's '\r\n' terminator is translated again on Windows ('\r\r\n').
    with open('itcont.txt', 'r', newline='') as records, \
            open('repeat_donors.txt', 'w', newline='') as output:
        reader = csv.DictReader(records, fieldnames=HEADER_NAMES, delimiter='|')
        writer = csv.writer(output, delimiter='|')
        for row in reader:
            raw_data = [row['CMTE_ID'], row['NAME'], row['ZIP_CODE'],
                        row['TRANSACTION_DT'], row['TRANSACTION_AMT'], row['OTHER_ID']]
            if not check_valid_input(raw_data):
                continue
            if check_repeat(raw_data):
                writer.writerow(add_to_outputs(raw_data, percentile))

In [2]:
def check_valid_input(record):
    """Validate one contribution record and normalise it in place.

    Args:
        record: mutable list laid out as
            ``[CMTE_ID, NAME, ZIP_CODE, TRANSACTION_DT, TRANSACTION_AMT, OTHER_ID]``.

    Returns:
        ``True`` if the record should be processed, ``False`` if it must be
        skipped. A record is skipped when:

        * ``OTHER_ID`` is non-empty;
        * any of the first five fields is empty;
        * the name contains a digit;
        * the amount cannot be converted with ``int()``;
        * the zip code has fewer than five characters;
        * the date is not exactly eight digits forming a valid ``MMDDYYYY`` date.

    Side effects:
        On success ``record[2]`` is truncated to its first five characters and
        ``record[3]`` is replaced by the parsed ``datetime``. A rejected record
        may have been partially normalised and should not be used further.
    """
    if record[5]:
        return False
    if not all(record[:5]):
        return False
    if re.search(r'\d', record[1]):
        return False

    # Downstream code calls int() on the amount; reject anything that would
    # make that call raise instead of letting the whole run crash later.
    try:
        int(record[4])
    except (TypeError, ValueError):
        return False

    record[2] = record[2][:5]
    if len(record[2]) < 5:
        return False

    # strptime accepts non-zero-padded month/day (e.g. '1012018'), so require
    # the fixed eight-digit layout before parsing.
    date_text = record[3]
    if len(date_text) != 8 or not date_text.isdigit():
        return False
    try:
        record[3] = datetime.strptime(date_text, '%m%d%Y')
    except ValueError:
        return False
    return True

In [3]:
def check_repeat(record):
    """Report whether this donor has already been seen on this or an earlier date.

    A donor is identified by the pair ``(record[1], record[2])``. ``REPEAT_DICT``
    holds, per donor, the earliest date recorded so far. When the donor is
    unknown, or ``record[3]`` is earlier than the stored date, the stored date
    is set to ``record[3]`` and ``False`` is returned; otherwise ``True`` is
    returned and ``REPEAT_DICT`` is left unchanged.
    """
    donor = (record[1], record[2])
    earliest = REPEAT_DICT.get(donor)
    if earliest and not record[3] < earliest:
        return True
    # Unknown donor, or a contribution dated before the one on file.
    REPEAT_DICT[donor] = record[3]
    return False

In [7]:
# Append-then-sort variant
def add_to_outputs(record, percentile):
    """Fold one repeat contribution into its group and build the output row.

    Contributions are grouped by ``(record[0], record[2], year of record[3])``.
    For each group ``VALUES_DICT`` stores
    ``[amounts sorted by numeric value, running total, count]``, where the
    amounts are kept as the original strings.

    Args:
        record: validated record; ``record[3]`` must be a ``datetime`` and
            ``record[4]`` a string accepted by ``int()``.
        percentile: percentile passed on to ``calc_and_format_output``.

    Returns:
        ``[record[0], record[2], year, percentile_amount, total, count]``.
        For the first contribution of a group ``percentile_amount`` is that
        contribution's own amount.

    Raises:
        ValueError: if ``record[4]`` is not an integer string.
    """
    key = (record[0], record[2], datetime.strftime(record[3], '%Y'))
    amount = record[4]
    entry = VALUES_DICT.get(key)

    if not entry:
        VALUES_DICT[key] = [[amount], int(amount), 1]
        return list(key) + [amount, int(amount), 1]

    amounts = entry[0]
    amounts.append(amount)
    # The amounts are strings: a plain sort() would order them lexicographically
    # ('1000' < '333' < '40') and yield a wrong percentile, so sort by value.
    amounts.sort(key=int)
    total = entry[1] + int(amount)
    count = entry[2] + 1
    VALUES_DICT[key] = [amounts, total, count]
    return list(key) + [calc_and_format_output(amounts, count, percentile), total, count]

In [5]:
def calc_and_format_output(arr, length, percentile):
    """Pick the nearest-rank percentile element from a sorted sequence.

    Args:
        arr: sequence already sorted in ascending order.
        length: number of elements of ``arr`` to consider (callers pass its
            current size).
        percentile: requested percentile.

    Returns:
        The element at 1-based rank ``ceil(percentile * length / 100)``. The
        rank is never allowed below 1, so a percentile of 0 yields the first
        (smallest) element instead of wrapping around to ``arr[-1]``.

    Raises:
        IndexError: if the computed rank lies beyond the end of ``arr``
            (for example an empty ``arr`` or a percentile above 100).
    """
    rank = max(1, math.ceil(percentile * length * 0.01))
    return arr[rank - 1]

In [6]:
# Bisect module/algorithm alternative
def add_to_outputs_bis(record):
    """Variant of ``add_to_outputs`` that keeps the amounts ordered by bisection.

    Contributions are grouped by ``(record[0], record[2], year of record[3])``
    and stored in ``VALUES_DICT`` as
    ``[amounts sorted by numeric value, running total, count]``, the amounts
    being kept as the original strings. Unlike ``add_to_outputs`` no percentile
    is computed: the returned row carries the group's whole amount list.

    Args:
        record: validated record; ``record[3]`` must be a ``datetime`` and
            ``record[4]`` a string accepted by ``int()``.

    Returns:
        ``[record[0], record[2], year, amounts, total, count]`` where
        ``amounts`` is the same list object that is stored in ``VALUES_DICT``.

    Raises:
        ValueError: if an amount is not an integer string.
    """
    key = (record[0], record[2], datetime.strftime(record[3], '%Y'))
    amount = record[4]
    entry = VALUES_DICT.get(key)

    if not entry:
        amounts = [amount]
        VALUES_DICT[key] = [amounts, int(amount), 1]
        return list(key) + [amounts, int(amount), 1]

    amounts = entry[0]
    # insort() on the raw strings would order them lexicographically
    # ('1000' < '333' < '40'); locate the slot by numeric value instead.
    # Assumes the stored list is already in numeric order.
    slot = bisect.bisect_right([int(a) for a in amounts], int(amount))
    amounts.insert(slot, amount)
    total = entry[1] + int(amount)
    count = entry[2] + 1
    VALUES_DICT[key] = [amounts, total, count]
    return list(key) + [amounts, total, count]

In [8]:
# Inspect the donor table filled by check_repeat:
# (NAME, ZIP_CODE) -> earliest contribution date recorded for that donor.
print(REPEAT_DICT)

{('DEEHAN, WILLIAM N', '30004'): datetime.datetime(2017, 1, 31, 0, 0), ('ABBOTT, JOSEPH', '02895'): datetime.datetime(2017, 1, 12, 0, 0), ('SABOURIN, JAMES', '02895'): datetime.datetime(2017, 1, 31, 0, 0), ('JEROME, CHRISTOPHER', '30750'): datetime.datetime(2017, 10, 31, 0, 0)}


In [9]:
# Inspect the per-group aggregates filled by add_to_outputs:
# keys are (CMTE_ID, ZIP_CODE, year of the contribution).
print(VALUES_DICT)

{('C00384516', '02895', '2018'): ['333', 333, 1]}
