In [None]:
import sys
import numpy as np
import sklearn_crfsuite
import time
from tqdm import tqdm
import pandas as pd
import prepare_train_with_set as prepare
import os
import func
import set_func

In [None]:
def process_training(set_total, current_path):
    """Build the CRF training inputs plus the metadata reused by later stages.

    Args:
        set_total: Number of sets; forwarded to ``prepare.train_file_generate``.
        current_path: Project directory handed to the ``prepare`` generators.

    Returns:
        A 7-tuple ``(test_data, feature_train, X_train, y_train,
        max_label_train, max_num, col_set_dict)``:

        * ``test_data`` -- the value returned by ``prepare.test_file_generate``.
        * ``feature_train`` -- training features after ``.tolist()``.
        * ``X_train`` / ``y_train`` -- one-element lists wrapping the results of
          ``func.sent2features`` / ``func.sent2labels``.
        * ``max_label_train`` -- second value of
          ``func.load_data_num(train_data, True)``.
        * ``max_num`` -- the larger of the train and test values reported by
          ``func.load_data_num``.
        * ``col_set_dict`` -- ``Set_dict`` with keys and values swapped.
    """
    train_data, Set_dict = prepare.train_file_generate(set_total, current_path)
    test_data = prepare.test_file_generate(current_path)

    max_num_train, max_label_train = func.load_data_num(train_data, True)
    max_num_test = func.load_data_num(test_data, False)
    max_num = max(max_num_train, max_num_test)

    # Invert the mapping (value -> key); on duplicate values the last key wins.
    col_set_dict = {value: key for key, value in Set_dict.items()}

    # The third value returned by CRFSuite_process_data is not needed here.
    features, labels, _ = func.CRFSuite_process_data(train_data)
    feature_train = features.tolist()
    label_train = labels.tolist()

    # The whole training set is handed to the CRF as one single sequence,
    # hence the one-element outer lists.
    X_train = [func.sent2features(feature_train)]
    y_train = [func.sent2labels(label_train)]

    # NOTE: the former ``page_num = int(len(feature_train)/max_num)`` was never
    # used and could raise ZeroDivisionError for max_num == 0; it was removed.
    return test_data, feature_train, X_train, y_train, max_label_train, max_num, col_set_dict

def model(c1=0.1, max_iterations=50, **crf_params):
    """Create a fresh, untrained ``sklearn_crfsuite.CRF`` estimator.

    Calling ``model()`` with no arguments yields the same configuration as
    before (``c1=0.1``, ``max_iterations=50``).

    Args:
        c1: Value passed to the CRF constructor as ``c1``.
        max_iterations: Value passed to the CRF constructor as
            ``max_iterations``.
        **crf_params: Any additional keyword arguments, forwarded unchanged to
            ``sklearn_crfsuite.CRF``.

    Returns:
        A new ``sklearn_crfsuite.CRF`` instance.
    """
    return sklearn_crfsuite.CRF(c1=c1, max_iterations=max_iterations, **crf_params)

def process_testing(test_data, current_path, max_num):
    """Turn the test data into CRF input and count the pages it contains.

    Args:
        test_data: Test data, passed to ``func.CRFSuite_process_data``.
        current_path: Unused; kept so existing callers keep working.
        max_num: Number of feature rows per page, used as the divisor for the
            page count.

    Returns:
        ``(feature_test, X_test, page_test)`` where ``feature_test`` is the
        feature array after ``.tolist()``, ``X_test`` is a one-element list
        wrapping ``func.sent2features(feature_test)`` and ``page_test`` is
        ``len(feature_test)`` divided by ``max_num``, rounded down.

    Raises:
        ZeroDivisionError: If ``max_num`` is 0.
    """
    # Only the features are needed for prediction; labels and the third
    # returned value are ignored (the labels used to be converted with
    # .tolist() and then thrown away).
    features, _labels, _ = func.CRFSuite_process_data(test_data)
    feature_test = features.tolist()

    # The whole test set is passed to the CRF as one single sequence.
    X_test = [func.sent2features(feature_test)]

    # Integer floor division avoids the float round-trip of int(a / b).
    page_test = int(len(feature_test) // max_num)
    return feature_test, X_test, page_test

def process_output(y_pred, page_test, max_num, current_path, set_total, model_name, max_label_train, col_set_dict, feature_train, feature_test):
    """Reshape the predictions into pages, write them out and build the set files.

    Args:
        y_pred: Predicted labels as returned by the CRF's ``predict``.
        page_test: Number of rows (pages) of the reshaped prediction grid.
        max_num: Number of columns (entries per page) of the grid.
        current_path: Project directory forwarded to the helper modules.
        set_total: Number of sets, forwarded to the helper modules.
        model_name: Model name, forwarded to the helper modules.
        max_label_train: Forwarded to ``func.predict_output``.
        col_set_dict: Forwarded to ``func.predict_output``.
        feature_train: Training features, forwarded to
            ``set_func.Set_train_file_generate``.
        feature_test: Test features, forwarded to
            ``set_func.Set_test_file_generate``.

    Returns:
        ``(result, Set_data, (set_train_data, set_train_count),
        (set_test_data, set_test_count), page_c)`` where ``result`` is the
        ``int32`` grid of shape ``(page_test, max_num)`` and ``page_c`` is its
        number of rows.
    """
    # Flat predictions -> (page_test, max_num) grid of 32-bit integer labels.
    result = np.array(y_pred).reshape((page_test, max_num)).astype(np.int32)

    column_types = func.get_col_type(current_path)
    Set_data = func.predict_output(
        set_total, current_path, model_name, column_types,
        result, max_label_train, col_set_dict,
    )

    # Each generator hands back a (data, count) pair; keep them as pairs.
    train_pair = tuple(
        set_func.Set_train_file_generate(
            set_total, current_path, model_name, feature_train, max_num
        )
    )
    test_pair = tuple(
        set_func.Set_test_file_generate(
            set_total, current_path, model_name, Set_data, feature_test, max_num
        )
    )

    return result, Set_data, train_pair, test_pair, len(result)

def process_set(set_total, set_train, set_test, current_path, result, t, ts, model_name=None):
    """Train and evaluate one CRF per set, writing each set's predictions.

    For every set index in ``range(set_total)`` a fresh model is created with
    ``model()``, fitted on that set's training data, used to predict that set's
    test data, and the reshaped predictions are written through
    ``set_func.predict_output``.

    Args:
        set_total: Number of sets to process.
        set_train: ``(set_train_data, set_train_count)``; ``set_train_data`` is
            indexed by set number and each entry must support ``['Label']``.
        set_test: ``(set_test_data, set_test_count)``, indexed the same way.
        current_path: Project directory forwarded to ``set_func``.
        result: Ignored; kept only for backward compatibility with callers.
        t: Training seconds accumulated so far; each ``fit`` duration is added.
        ts: Prediction seconds accumulated so far; each ``predict`` duration is
            added.
        model_name: Model name forwarded to ``set_func.predict_output``. When
            omitted, the module-level global ``model_name`` is used, which is
            what this function previously relied on implicitly.

    Returns:
        ``(t, ts)`` -- the updated training and prediction times.

    Raises:
        NameError: If ``model_name`` is not passed and no module-level
            ``model_name`` exists.
    """
    if model_name is None:
        # Backward compatibility: older callers do not pass the name and
        # depend on a notebook-level global instead.
        try:
            model_name = globals()["model_name"]
        except KeyError:
            raise NameError(
                "name 'model_name' is not defined; pass model_name=... to process_set()"
            ) from None

    set_train_data = set_train[0]
    set_train_count = set_train[1]
    set_test_data = set_test[0]
    set_test_count = set_test[1]

    # Per-set page width: the larger of the train and test values.
    max_num_train = set_func.max_num_set(set_total, set_train_count)
    max_num_test = set_func.max_num_set(set_total, set_test_count)
    max_set = [
        max(max_num_train[i], max_num_test[i])
        for i in range(len(max_num_train))
    ]

    for num in range(set_total):
        set_feature_train, set_label_train = set_func.CRFSuite_process_set(
            set_train_data[num], set_train_count, num, max_set
        )
        max_num = max_set[num]
        max_label = max(set_train_data[num]['Label'])
        crf = model()
        set_feature_train = set_feature_train.tolist()
        set_label_train = set_label_train.tolist()
        # Each set is passed to the CRF as one single sequence.
        X_train = [func.sent2features(set_feature_train)]
        y_train = [func.sent2labels(set_label_train)]

        # Train (only the fit call itself is timed)
        start = time.time()
        crf.fit(X_train, y_train)
        t += time.time() - start

        # Load test data; the test labels are not needed for prediction.
        set_feature_test, _ = set_func.CRFSuite_process_set(
            set_test_data[num], set_test_count, num, max_set
        )
        set_feature_test = set_feature_test.tolist()
        X_test = [func.sent2features(set_feature_test)]
        page_test = int(len(set_feature_test) // max_num)

        # Prediction (only the predict call itself is timed)
        ts_start = time.time()
        y_pred = crf.predict(X_test)
        ts += time.time() - ts_start

        # Flat predictions -> (page_test, max_num) grid of integer labels.
        set_result = np.reshape(np.array(y_pred), [page_test, max_num]).astype(np.int64)

        # Read column types for this set
        col_type = set_func.get_col_type(current_path, num)

        # Output
        set_func.predict_output(
            current_path, model_name, num, col_type,
            set_result, max_label, set_feature_test, max_num,
        )
    return t, ts

def process_time(current_path, model_name, t, ts, page_c):
    """Print the timing summary and save it to ``time_crf.txt``.

    The file is written to ``<current_path>/<model_name>/data/time_crf.txt``;
    the directory is created if it does not exist yet.

    Args:
        current_path: Base directory of the run.
        model_name: Name of the sub-directory that holds the ``data`` folder.
        t: Training time to report.
        ts: Testing time to report.
        page_c: Number of pages the testing time is averaged over. When it is
            0 the per-page time is reported as ``nan`` instead of raising
            ``ZeroDivisionError``.
    """
    # Compute everything before opening the file so a bad value cannot leave a
    # half-written report behind.
    per_page = float(ts) / page_c if page_c else float("nan")

    out_dir = os.path.join(current_path, model_name, "data")
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, "time_crf.txt"), "w") as timef:
        print("\ntrain time:" + str(t))
        timef.write("train:" + str(t) + "\n")
        print("test time:" + str(ts))
        print("per page:" + str(per_page) + "\n")
        timef.write("test:" + str(ts) + "\n")
        timef.write("per page:" + str(per_page) + "\n")

In [None]:
if __name__ == "__main__":
    # End-to-end run: train one CRF on all data, predict the test data, write
    # the output, then (optionally) train/predict one CRF per set and finally
    # report the accumulated timings.

    # Number of sets; forwarded to the training/output stages and used as the
    # loop bound inside process_set().
    set_total = 3
    # NOTE: process_set() is called below without a model name and resolves
    # `model_name` at module level, so this must stay a module-level name.
    model_name = "crfsuite"
    # Project directory: ~/jupyter/web_verification
    current_path = os.path.join(os.path.expanduser("~"), "jupyter", "web_verification")
    
    # Process training file
    test_data, feature_train, X_train, y_train, max_label_train, max_num, col_set_dict = process_training(set_total, current_path)
    
    # Define model
    crf = model()
    
    # Start training; t = wall-clock seconds spent inside fit()
    start = time.time()
    crf.fit(X_train, y_train)
    t = time.time() - start
    
    # Process testing file
    feature_test, X_test, page_test = process_testing(test_data, current_path, max_num)
    
    # Start testing; ts = wall-clock seconds spent inside predict()
    ts_start = time.time()
    y_pred = crf.predict(X_test)
    ts = time.time() - ts_start
    
    # Process output: reshape predictions, write them and build the per-set
    # train/test data consumed by process_set()
    result, Set_data, set_train, set_test, page_c = process_output(y_pred, page_test, max_num, current_path, 
                                                                   set_total, model_name, max_label_train, 
                                                                   col_set_dict, feature_train, feature_test)
    
    # Process set: per-set fit/predict durations are added on top of t and ts
    if set_total > 0:
        t, ts = process_set(set_total, set_train, set_test, current_path, result, t, ts)
            
    # Process time: print the totals and write them to time_crf.txt
    process_time(current_path, model_name, t, ts, page_c)