# CXJ
## Key words
association analysis, fp-growth, cxj
## settings

In [None]:
!pip install pyfpgrowth==1.0

## preprocess

In [None]:
import pandas as pd
import pickle

# Column names of the raw record sheets. batch_save groups rows by
# (REC_COL_DATE, REC_COL_PLATE) and collects the REC_COL_S_CODE values
# of each group into one transaction.
REC_COL_DATE = "日期"
REC_COL_PLATE = "车牌"
REC_COL_S_CODE = "S码"

# Raw Excel files, one per date range.
raw_files = [
    './data/CXJ_20200101-20200331.xlsx',
    './data/CXJ_20200401-20200531.xlsx',
    './data/CXJ_20200601-20200731.xlsx',
    './data/CXJ_20200801-20200930.xlsx',
    './data/CXJ_20201001-20201130.xlsx',
    './data/CXJ_20201201-20201231.xlsx',
]

# Pickle files that receive the list of S-code groups. They are paired
# positionally with raw_files, so both lists must keep the same order.
pkl_files = [
    './data/CXJ_20200101-20200331.pkl',
    './data/CXJ_20200401-20200531.pkl',
    './data/CXJ_20200601-20200731.pkl',
    './data/CXJ_20200801-20200930.pkl',
    './data/CXJ_20201001-20201130.pkl',
    './data/CXJ_20201201-20201231.pkl',
]

# Default locations of the mined rules (pickle and text report).
# Later cells reassign both names before using them.
rules_pkl_file = './data/rules_freq500_202001_12.pkl'

rules_txt_file = './data/rules_freq500_202001_12.txt'


def save_to_pickle(file, trans, mode='wb'):
    """Pickle ``trans`` into ``file`` and print a one-line confirmation.

    Args:
        file: path of the pickle file to write.
        trans: object to serialise; it must support ``len()`` because its
            length is included in the confirmation message.
        mode: mode handed to ``open`` (binary write by default).
    """
    with open(file, mode) as sink:
        pickle.dump(trans, sink)
    # Length is taken only after the file has been written and closed.
    saved_count = len(trans)
    print("file %s(%d) saved" % (file, saved_count))

def load_from_pickle(file, mode='rb'):
    """Read one pickled object from ``file`` and print a one-line confirmation.

    Args:
        file: path of the pickle file to read.
        mode: mode handed to ``open`` (binary read by default).

    Returns:
        The unpickled object. It must support ``len()`` because its length
        is included in the confirmation message.

    Note:
        Unpickling can execute arbitrary code; only load files that were
        produced by this notebook or another trusted source.
    """
    with open(file, mode) as source:
        loaded = pickle.load(source)
    print("file %s(%d) loaded" % (file, len(loaded)))
    return loaded

def batch_save(from_files, to_files):
    """Extract S-code transactions from raw Excel files and pickle them.

    For every (raw file, pickle file) pair, the rows are grouped by
    (REC_COL_DATE, REC_COL_PLATE) and the distinct REC_COL_S_CODE values of
    each group become one transaction. The list of transactions is written
    with ``save_to_pickle``.

    Args:
        from_files: iterable of Excel file paths to read.
        to_files: iterable of pickle file paths to write, paired positionally
            with ``from_files`` (extra entries on either side are ignored by
            ``zip``).
    """
    for from_file, to_file in zip(from_files, to_files):
        # `skiprows` is the keyword pandas.read_excel accepts; the previous
        # spelling `skip_rows` is not a read_excel parameter.
        # NOTE(review): this assumes each sheet has exactly one leading row
        # above the header row - confirm against the raw files.
        df = pd.read_excel(from_file, skiprows=1)

        # Walk the groups explicitly instead of appending to a list from
        # inside groupby().apply(): apply() is not guaranteed to call the
        # function exactly once per group on every pandas version, so
        # side effects inside it can duplicate a transaction.
        trans = [
            list(set(group[REC_COL_S_CODE]))
            for _, group in df.groupby([REC_COL_DATE, REC_COL_PLATE])
        ]
        save_to_pickle(to_file, trans)

def batch_load(from_files):
    """Load several pickled transaction lists and concatenate them.

    Args:
        from_files: iterable of pickle file paths, read in the given order
            via ``load_from_pickle``.

    Returns:
        A new list holding the items of every loaded file, in file order.
    """
    return [
        record
        for pkl_path in from_files
        for record in load_from_pickle(pkl_path)
    ]

In [None]:
# init service code dictionary
import pandas

# Column names of the service dictionary CSV: code column and name column.
DT_COL_S_CODE = "服务编码"
DT_COL_S_NAME = "服务名称"

def load_scode_dict(file):
    """Build a {service code: service name} lookup from a UTF-8 CSV file.

    Args:
        file: path (or file-like object) of a CSV that contains the
            DT_COL_S_CODE and DT_COL_S_NAME columns.

    Returns:
        dict mapping each value of the code column to the value of the name
        column on the same row. When a code occurs more than once, the last
        row wins.

    Raises:
        KeyError: if either required column is missing from the file.
    """
    df = pd.read_csv(file, encoding="utf-8")
    # Pair the two columns directly rather than mutating a dict from inside a
    # row-wise DataFrame.apply: same mapping, no per-row Series construction.
    return dict(zip(df[DT_COL_S_CODE], df[DT_COL_S_NAME]))

# Location of the service dictionary CSV.
scode_dict_file = "./data/service_info.csv"

# Loaded once when this cell runs; fullname() reads this module-level dict.
scode_dict = load_scode_dict(scode_dict_file)

def fullname(scode):
    """Return ``"<code> <name>"`` for a service code.

    The name is looked up in ``scode_dict``; a code that is not in the
    dictionary is rendered with the name ``None``.
    """
    label = scode_dict.get(scode)
    return "%s %s" % (scode, label)

In [None]:
# save rules from .pkl to .txt
def save_rules(from_file, to_file, mode='a', l_limit=3):
    """Export pickled association rules as a tab-separated text report.

    The rules are loaded from ``from_file`` with ``load_from_pickle``. A
    header row is written first, followed by one row per rule whose
    antecedent has at most ``l_limit`` items. Every item is rendered through
    ``fullname``.

    Args:
        from_file: pickle file holding a dict that maps an antecedent
            (a sized iterable of codes) to a value whose element 0 is the
            consequent (an iterable of codes) and whose element 1 is the
            confidence, written as a whole-number percentage.
        to_file: path of the UTF-8 text file to write.
        mode: mode handed to ``open``. The default appends, so the header
            row is written again on every call.
        l_limit: largest antecedent length that is exported.
    """
    header_cells = ('商品服务组合', '组合数', '衍生商品服务组合', '组合数', '可信度')
    rules = load_from_pickle(from_file)
    with open(to_file, mode, encoding='utf-8') as out:
        out.write("{}\t{}\t=>\t{}\t{}\t{}\n".format(*header_cells))

        for antecedent, outcome in rules.items():
            # Skip rules whose left-hand side is longer than the limit.
            if len(antecedent) > l_limit:
                continue
            lhs = [fullname(code) for code in antecedent]
            rhs = [fullname(code) for code in outcome[0]]
            row = "{}\t{}\t=>\t{}\t{}\t{:.0%}\n".format(
                lhs, len(lhs), rhs, len(rhs), outcome[1])
            out.write(row)
    print("file %s saved" % to_file)

In [None]:
# Extract the S-code list of every (date, plate) group from the raw Excel
# files and save one pickle per raw file. Running this cell re-reads every
# file in raw_files and overwrites the matching entry of pkl_files.
batch_save(raw_files, pkl_files)

## algo: fp-growth

In [None]:
# create fp-tree
import pyfpgrowth

# Output locations for this run (support threshold 300, full year).
rules_pkl_file = './data/rules_freq300_202001_12.pkl'
rules_txt_file = './data/rules_freq300_202001_12.txt'

# Load every per-period transaction list into one list.
trans = batch_load(pkl_files)
print("total size of transactions: %d" % len(trans))

# Support threshold 300, matching the "freq300" file names above.
patterns = pyfpgrowth.find_frequent_patterns(trans, 300)

# Generate association rules with a confidence threshold of 70%.
rules = pyfpgrowth.generate_association_rules(patterns, 0.7)

# Save the rules to the pickle file; rules_txt_file is not written here.
save_to_pickle(rules_pkl_file, rules)

## post process

In [None]:
# NOTE(review): these paths point at the "freq500" rule files, whereas the
# fp-growth cell above writes './data/rules_freq300_202001_12.pkl'. On a fresh
# kernel this cell therefore depends on a freq500 pickle produced by an
# earlier run - confirm which threshold is intended and align the two cells.
rules_pkl_file = './data/rules_freq500_202001_12.pkl'
rules_txt_file = './data/rules_freq500_202001_12.txt'

# mode='w' overwrites the text report instead of appending to it.
save_rules(rules_pkl_file, rules_txt_file, mode='w', l_limit=3)

## Batch Example

In [None]:
import pyfpgrowth

# Load every per-period transaction list into one list.
trans = batch_load(pkl_files)
print("total size of transactions: %d" % len(trans))

# Support threshold 500.
patterns = pyfpgrowth.find_frequent_patterns(trans, 500)

# Generate association rules with a confidence threshold of 50%.
rules = pyfpgrowth.generate_association_rules(patterns, 0.5)

# Save the rules to a pickle file, then export them as a text report.
# NOTE(review): the file names say "202001_09", but pkl_files as defined in
# the preprocess cell also lists the October-December files - confirm.
# save_rules is called with its default mode ('a'), so re-running this cell
# appends another header and another copy of the rows to the .txt file.
save_to_pickle('./data/rules_freq500_202001_09.pkl', rules)
save_rules('./data/rules_freq500_202001_09.pkl', './data/rules_freq500_202001_09.txt', l_limit=3)