In [33]:
import pandas as pd
import numpy as np
import itertools
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [13]:
# path finding
def find_min(open_dict, sink):
    if sink in open_dict.keys():
        return [sink, open_dict[sink][0], open_dict[sink][1]]
    values = list(open_dict.values())
    values.sort(key=lambda x: x[1])
    min = values[0][0]
    for i in open_dict.keys():
        if open_dict[i][0] == min:
            return [i, open_dict[i][0], open_dict[i][1]]


def show_path(close_dict, path, sink):
    if (sink in close_dict.keys()):
        path.append(sink)
        path = show_path(close_dict, path, close_dict[sink][1])
    return path


def find_path(g1, srce, sink):
    path = []
    open_dict = {srce: [0, -1]}
    close_dict = {}
    while len(open_dict.keys()) != 0:
        min_key = find_min(open_dict, sink)
        close_dict[min_key[0]] = [min_key[1], min_key[2]]  # [score, parent node]
        if min_key[0] == sink:
            return show_path(close_dict, path, sink)[::-1]
        else:
            subnodes = []
            if min_key[0] not in g1.keys():
                open_dict.pop(min_key[0])
                continue
            for i in g1[min_key[0]]:
                subnodes.append([i[0], i[1], min_key[0]])  # [current, score, parent node]
            for node in subnodes:
                if node[0] in close_dict.keys():
                    continue
                if node[0] not in open_dict.keys():
                    open_dict[node[0]] = [node[1], node[2]]
                elif node[0] in open_dict.keys():
                    if open_dict[node[0]][0] > node[1]:
                        open_dict[node[0]][0] = node[1]
                        open_dict[node[0]][1] = node[2]
            open_dict.pop(min_key[0])
    return []


def cal_score(g1, path):
    s = 0
    for i in range(len(path) - 1):
        for j in g1[path[i]]:
            if j[0] == path[i + 1]:
                s += j[1]
    return s

In [35]:
def loadTestData(filename):
    return pd.read_csv(filename)

# network graph drawing
def createG1():
    txt = []
    with open("train.txt") as f:
        l = f.readline()
        while l:
            txt.append(list(map(int, l.split())))
            l = f.readline()

    g1 = {}
    converted_txt = []
    tmp = []
    for link in txt:
        for subset in itertools.permutations(link, 2):
            tmp.append(subset[0])
            tmp.append(subset[1])
            converted_txt.append(subset)
    train1 = pd.DataFrame(converted_txt, columns=["srce", "dest"])
    freq = train1.groupby(["srce", "dest"]).size().values
    #train1['freq'] = freq
    txt_1 = sorted(set(converted_txt))
    df1 = pd.DataFrame(txt_1, columns=["srce", "dest"])
    df1['freq'] = freq
    for points, f in zip(txt_1, freq):
        g1[points[0]] = g1.setdefault(points[0], [])
        g1[points[0]].append((points[1], f))

    return g1, list(set(tmp)), df1

def fu(g1, u):
    return 1

def getNodeScore(g1, Nu):
    AA = 0
    RA = 0
    CCN = len(Nu)
    CRA = 0
    if len(Nu) == 0:
        return 0, 0, 0, 0
    for u in Nu:
        AA += 1/np.log(len(g1[u]))
        RA += 1/len(g1[u])
        CCN += fu(g1, u)
        CRA += fu(g1, u)/len(g1[u])
    return AA, RA, CCN, CRA

def Pxy(g1, x, y):
    try:
        Gx = g1[x]
        Gy = g1[y]
    except:
        return 0, 0, 0, 0, 0, 0
    Nx = [i[0] for i in Gx]
    Ny = [i[0] for i in Gy]
    NxINy = list(set(Nx) & set(Ny))
    NxUNy = list(set(Nx + Ny))
    AA, RA, CCN, CRA = getNodeScore(g1, NxINy)
    cardNx = len(Nx)
    cardNy = len(Ny)
    PA = cardNx * cardNy
    JC = len(NxINy)/len(NxUNy)
    return JC, RA, AA, PA, CCN, CRA

def extract_feature(g1, x, y):
    path = find_path(g1, x, y)
    l = len(path)
    features = {}
    if l == 2:
        JC, RA, AA, PA, CCN, CRA = Pxy(g1, x, y)
        features = {
            'JC': JC,
            'RA': RA,
            'AA': AA,
            'PA': PA,
            'CCN': CCN,
        }
    elif l > 2:
        features = {}
        for i in range(l - 1):
            JC, RA, AA, PA, CCN, CRA = Pxy(g1, path[i], path[i+1])
            features.update({
                'JC-'+str(path[i])+'-'+str(path[i+1]): JC,
                'RA-'+str(path[i])+'-'+str(path[i+1]): RA,
                'AA-'+str(path[i])+'-'+str(path[i+1]): AA,
                'PA-'+str(path[i])+'-'+str(path[i+1]): PA,
                'CCN-'+str(path[i])+'-'+str(path[i+1]): CCN,
            })
    else:
        features['FAKE'] = True
    return features
    

In [36]:
# Load data
test = loadTestData('test-public.csv')

# Load and Preprocess data
G1, V, df1 = createG1()

X = df1[['srce', 'dest']]
y = df1['freq']


In [38]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
X_train1 = [extract_feature(G1, i[0], i[1]) for idx, i in X_train.iterrows()]
y_train1 = [i for i in y_train]

X_test1 = [extract_feature(G1, i[0], i[1]) for idx, i in X_test.iterrows()]
y_test1 = [i for i in y_test]

In [40]:
crf = sklearn_crfsuite.CRF(
    algorithm = 'l2sgd',
    c2 = 0.1,
    max_iterations = 100,
    all_possible_transitions = True
)
crf.fit(X_train1, y_train1)

y_pred = crf.predict(X_test1)
metrics.flat_f1_score(y_test1, y_pred,
                      average='weighted', labels=labels)

TypeError: 'int' object is not iterable

TypeError: type() takes 1 or 3 arguments