# CXJ
## preprocess

In [2]:
import pandas as pd
import pickle

# column field definition
REC_COL_DATE = "日期"
REC_COL_PLATE = "车牌"
REC_COL_S_CODE = "S码"

# raw files
raw_files = [
    './data/CXJ_202001_03.csv',
    './data/CXJ_202004_05.csv',
    './data/CXJ_202006_07.csv',
    './data/CXJ_202008_09.csv',
]

# files to save list of SCODE groups
pkl_files = [
    # './data/CXJ_202001_03.pkl',
    './data/CXJ_202004_05.pkl',
    # './data/CXJ_202006_07.pkl',
    # './data/CXJ_202008_09.pkl',
]

rules_pkl_files = [
    # './data/rules_202001_03.pkl',
    './data/rules_202004_05.pkl',
    # './data/rules_202006_07.pkl',
    # './data/rules_202008_09.pkl',
]

rules_txt_files = [
    # './data/rules_202001_03.txt',
    './data/rules_202004_05.txt',
    # './data/rules_202006_07.txt',
    # './data/rules_202008_09.txt',
]


def save_to_pickle(file, trans, mode='wb'):
    with open(file, mode) as f:
        pickle.dump(trans, f)
    print("file %s(%d) saved" % (file, len(trans)))

def load_from_pickle(file, mode='rb'):
    with open(file, mode) as f:
        trans = pickle.load(f)
    print("file %s(%d) loaded" % (file, len(trans)))
    return trans

def batch_save(from_files, to_files):
    for from_file, to_file in zip(from_files, to_files):
        trans = []
        df = pd.read_csv(from_file, encoding="utf-8")
        df.groupby([REC_COL_DATE, REC_COL_PLATE]).apply(lambda x:trans.append(list(set(x[REC_COL_S_CODE]))))
        save_to_pickle(to_file, trans)

def batch_load(from_files):
    trans = []
    for from_file in from_files:
        trans.extend(load_from_pickle(from_file))
    return trans

In [None]:
# extract code list from raw records and save to pkl files
batch_save(raw_files, pkl_files)

In [None]:
# volumn is too big to compute
# trans = batch_load(pkl_files)

## algo: fp-growth

In [3]:
# create fp-tree
import pyfpgrowth


for from_file, to_file in zip(pkl_files, rules_pkl_files):
    trans = load_from_pickle(from_file)

    # at least 2 elements for each pair
    patterns = pyfpgrowth.find_frequent_patterns(trans, 2)

    # data mining with threholds of confidence 70%
    rules = pyfpgrowth.generate_association_rules(patterns, 0.7)

    # save rules to file
    save_to_pickle(to_file, rules)

file ./data/CXJ_202004_05.pkl(693020) loaded


MemoryError: 

## post process

In [31]:
# init service code dictionary
import pandas

DT_COL_S_CODE = "服务编码"
DT_COL_S_NAME = "服务名称"

def load_scode_dict(file):

    df = pd.read_csv(file, encoding="utf-8")
    dt = dict()

    def foo(x):
        dt[x[DT_COL_S_CODE]] = x[DT_COL_S_NAME]
    
    df.apply(foo, axis=1)
    return dt

scode_dict_file = "./data/service_info.csv"

scode_dict = load_scode_dict(scode_dict_file)

def fullname(scode):
    return "%s %s" % (scode, scode_dict.get(scode))

In [51]:
def save_rules(from_file, to_file, mode='a', l_limit=1):
    rules = load_from_pickle(from_file)
    with open(to_file, mode, encoding='utf-8') as f:
        for rule in rules:
            if len(rule) <= l_limit:
                line = "{}\t=>\t{}\t{:.0%}\n".format(list(map(fullname, rule)), \
                    list(map(fullname, rules[rule][0])), rules[rule][1])
                f.write(line)
    print("file %s saved" % to_file)


for from_file, to_file in zip(pkl_files, rules_files):
    save_rules(from_file, to_file)

file ./data/rules_on_1_3.pkl(623105) loaded
file ./data/rules_on_1_3.txt saved
