In [1]:
import numpy as np
import pandas as pd
import datetime
import pickle
import re

# グラフ描画
from matplotlib import pylab as plt
import seaborn as sns
%matplotlib inline
sns.set()

# 統計モデル
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA

# 線形回帰
from sklearn import linear_model

# 誤差の評価
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# ワーニングを表示させない
import warnings
warnings.filterwarnings('ignore')

# keras
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# Directory holding the pickled input graph, and directory receiving the dataset.
# NOTE(review): both are absolute, machine-specific paths; adjust before running elsewhere.
InputDir = "/home/onimem/work/Open_world/NBA/Graph/single_attribute"
OutputDir = "/home/onimem/work/Open_world/NBA/DATA"

# Load the attributed time-series graph.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open(InputDir + '/nba.graph', mode='rb') as f:
    graph = pickle.load(f)

# Node attribute vocabulary. The list order is significant: GetEncodeDicAttribute
# enumerates it to assign the one-hot index of each attribute.
ATTRIBUTES = [
    'ATL', 'BRK', 'BOS', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
    'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK',
    'OKC', 'ORL', 'PHO', 'PHI', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]
# Edge types present in graph[year]['edges'].
EDGE_TYPE_LIST = ['team']

# Position of the timestep inside an edge tuple (src, dst, edge_type, timestep).
timestepIdx = 3

# Predict H future steps from the past M steps.
M = 10
H = 1
# Start years of the sliding windows; each one is handed to GenerateYearsList
# as the first training year.
YEARS = list(range(1951 + M, 2018 - (M + H - 1)))

In [3]:
def RebuildAttributeGraph(graph, years):
    """Split the graph into one sub-graph per attribute and year.

    Args:
        graph: mapping year -> {'nodes': {name: attribute},
            'edges': {edge_type: iterable of edges}}; edge[0] and edge[1]
            are the endpoint names.
        years: iterable of years to process (each must be a key of graph).

    Returns:
        dict attribute -> year -> {'nodes': {...}, 'edges': {edge_type: [...]}}
        holding only the nodes whose attribute equals that attribute and the
        edges whose two endpoints are both among those nodes. Every attribute
        in ATTRIBUTES and every edge type in EDGE_TYPE_LIST gets an entry,
        even when it ends up empty.
    """
    attribute_graph = {}
    for attribute in ATTRIBUTES:
        per_year = {}
        for year in years:
            snapshot = graph[year]
            members = {
                name: value
                for name, value in snapshot['nodes'].items()
                if attribute == value
            }
            edges_by_type = {}
            for edge_type in EDGE_TYPE_LIST:
                # Membership is tested against the dict (hash lookup) rather
                # than a list of names, avoiding a linear scan per edge.
                edges_by_type[edge_type] = [
                    edge
                    for edge in snapshot['edges'][edge_type]
                    if edge[0] in members and edge[1] in members
                ]
            per_year[year] = {'nodes': members, 'edges': edges_by_type}
        attribute_graph[attribute] = per_year
    return attribute_graph

def GenerateYearsList(year, M, H):
    """Build the year windows for one sample.

    Returns a tuple (years_train, years_test, years_all):
    years_train has M consecutive years starting at `year`, years_test has the
    H years that follow, and years_all is their concatenation (M + H years).
    """
    years_train = list(range(year, year + M))
    years_test = [years_train[-1] + step for step in range(1, H + 1)]
    years_all = years_train + years_test
    return years_train, years_test, years_all

########################################################################################################################################

def LinearRegression(years_all, years_train, node_num_train):
    """Fit node count against year and predict it for every year in years_all.

    Args:
        years_all: years to predict for.
        years_train: years used as the single regression feature.
        node_num_train: observed node counts, aligned with years_train.

    Returns:
        A flat list of predicted values, one per entry of years_all.
    """
    # The estimator expects 2-D inputs, so both series become column vectors.
    train_x = np.asarray(years_train).reshape(-1, 1)
    train_y = np.asarray(node_num_train).reshape(-1, 1)
    regressor = linear_model.LinearRegression()
    regressor.fit(train_x, train_y)
    query_x = np.asarray(years_all).reshape(-1, 1)
    predictions = regressor.predict(query_x)
    return predictions.reshape(-1).tolist()

def PlotHistory2(history, x, y1, y2, title, ylabel):
    """Plot two series from `history` against a shared x series and show it.

    `x`, `y1` and `y2` are keys of `history`; the y keys double as legend
    labels and `x` doubles as the x-axis label.
    """
    series_colors = ((y1, '#1ea8b5'), (y2, '#ce405f'))
    for series_key, line_color in series_colors:
        plt.plot(history[x], history[series_key], marker=".", color=line_color, label=series_key)
    plt.title(title)
    plt.xlabel(x)
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()
    
########################################################################################################################################

def GetNodes(year, graph, mode='all', M=0):
    """Return a set of node names for `year`, selected by `mode`.

    Modes ("previous" means year - 1, "new" means present in `year` but not
    in the previous year):
        'all'     -- every node of `year`
        'stay'    -- nodes present in both the previous year and `year`
        'lost'    -- nodes of the previous year missing from `year`
        'unknown' -- new nodes not seen in the M years ending at the previous year
        'return'  -- new nodes that were seen in those M years

    Args:
        year: year to inspect; graph[year] and graph[year - 1] must exist.
        graph: mapping year -> {'nodes': collection of node names, ...}.
        mode: one of the modes above.
        M: look-back length, only used by 'unknown' and 'return'.

    Raises:
        AssertionError: if either year's node collection contains duplicates.
        ValueError: if `mode` is not one of the supported modes.
    """
    def IsUnique(nodes):
        return len(nodes) == len(set(nodes))

    prev_nodes = graph[year-1]['nodes']
    curr_nodes = graph[year]['nodes']
    # Check BOTH years (previously the previous year was checked twice and the
    # current year never).
    assert IsUnique(prev_nodes) and IsUnique(curr_nodes), 'some nodes are duplicate'

    prev_set = set(prev_nodes)
    curr_set = set(curr_nodes)
    if mode == 'all':
        return curr_set
    if mode == 'stay':
        return prev_set & curr_set
    if mode == 'lost':
        return prev_set - curr_set
    if mode in ('unknown', 'return'):
        new_nodes = curr_set - prev_set
        observed = GetObservedNodeSet(graph, year-1, M)
        if mode == 'unknown':
            return new_nodes - observed
        # new - (new - observed) is simply the intersection.
        return new_nodes & observed
    # Fail loudly instead of hitting an UnboundLocalError on the return.
    raise ValueError("unsupported mode: %r" % (mode,))

def GetObservedNodeSet(graph, year, M):
    """Return the union of node names over the M years ending at `year`.

    Covers year, year - 1, ..., year - (M - 1); the result is an empty set
    when M is 0.
    """
    yearly_nodes = [graph[year - offset]['nodes'] for offset in range(M)]
    return set().union(*yearly_nodes)

########################################################################################################################################

def GetNodeNum(year, graph, mode='all', M=0):
    """Return, as a float, how many nodes GetNodes selects for `year`."""
    selected = GetNodes(year, graph, mode, M)
    return float(len(selected))

def GetNodeNumList(years, graph, mode='all', M=0):
    """Return the GetNodeNum value of every year in `years`, in order."""
    return [GetNodeNum(current_year, graph, mode, M) for current_year in years]

def GetPredictedNodeNum(years_all, years_train, graph, plot_title='title', mode='all', M=0):
    """Predict the node count of the test years with a linear trend.

    The counts of `years_train` (selected by `mode`, see GetNodes) are fitted
    against the year, the fit is evaluated on `years_all`, and the entries
    after the first len(years_train) are returned.

    Args:
        years_all: training years followed by the test years.
        years_train: training years.
        graph: graph passed through to GetNodeNumList.
        plot_title: unused; kept so existing positional calls keep working.
        mode: node selection mode forwarded to GetNodeNumList.
        M: look-back length forwarded to GetNodeNumList.

    Returns:
        List of rounded predictions (ints), one per test year.
    """
    # Only the training counts are needed. The ground-truth counts of the test
    # years used to be computed here too, but only fed a disabled debug plot.
    node_num_train = GetNodeNumList(years_train, graph, mode, M)
    predicted_node_num_all = LinearRegression(years_all, years_train, node_num_train)
    # Keep only the test-year part of the prediction.
    return [round(value) for value in predicted_node_num_all[len(years_train):]]

def GetPredictedNodeNumGroupbyAttribute(years_all, years_train, attribute_graph, mode='all', M=0):
    """Run GetPredictedNodeNum on every per-attribute sub-graph.

    Returns a dict attribute -> predicted node counts for the test years; the
    attribute name is also passed along as the plot title.
    """
    return {
        attribute: GetPredictedNodeNum(years_all, years_train, sub_graph, attribute, mode, M)
        for attribute, sub_graph in attribute_graph.items()
    }

########################################################################################################################################

def GetInputGraph(graph, years_train, years_test, M, predicted_node_num_unknown_groupby_attribute):
    """Assemble the model input graph for one training window.

    Builds the node set first (observed nodes plus predicted placeholder
    nodes), then the edge set over those nodes, and finally combines both
    through GenerateInputGraph.
    """
    node_set = GenerateNodeSetOfInputGraph(
        graph, years_train, years_test, M, predicted_node_num_unknown_groupby_attribute
    )
    edge_set = GenerateEdgeSetOfInputGraph(graph, years_train, M, node_set)
    return GenerateInputGraph(node_set, edge_set, graph, years_train)

########################################################################################################################################

def GenerateNodeSetOfInputGraph(graph, years_train, years_test, M, predicted_node_num_unknown_groupby_attribute):
    """Return the node set of the input graph.

    It is the union of the nodes observed during the M years ending at the
    last training year and the placeholder nodes generated for the test years.
    """
    observed_nodes = GetObservedNodeSet(graph, years_train[-1], M)
    placeholder_nodes = GetUnknownNodeSet(predicted_node_num_unknown_groupby_attribute, years_test)
    return observed_nodes | placeholder_nodes

def GetUnknownNodeSet(predicted_node_num_unknown_groupby_attribute, years_test):
    """Create placeholder names for the nodes predicted to appear.

    For the i-th test year and each attribute, as many names as the i-th
    predicted count are generated, formatted
    'unknown_<attribute>_<year>_<running index>'.
    """
    return {
        'unknown_' + attribute + '_' + str(test_year) + '_' + str(serial)
        for position, test_year in enumerate(years_test)
        for attribute, predicted_counts in predicted_node_num_unknown_groupby_attribute.items()
        for serial in range(predicted_counts[position])
    }

########################################################################################################################################

def GenerateEdgeSetOfInputGraph(graph, years_train, M, V):
    """Return the edge set of the input graph.

    It is the union of the edges observed during the M years ending at the
    last training year and the edges attached to the placeholder nodes in V.
    """
    observed_edges = GetObservedEdgeSet(graph, years_train[-1], M)
    placeholder_edges = GetUnknownEdgeSet(years_train, graph, V, M)
    return observed_edges | placeholder_edges

def GetObservedEdgeSet(graph, year, M):
    """Collect the edges of the M years ending at `year` as timestamped tuples.

    Each result is (source, target, edge_type, timestep); `year` itself gets
    timestep M - 1 and the oldest year in the window gets timestep 0.
    """
    return {
        (edge[0], edge[1], edge_type, M - 1 - years_back)
        for years_back in range(M)
        for edge_type in EDGE_TYPE_LIST
        for edge in graph[year - years_back]['edges'][edge_type]
    }

def GetUnknownEdgeSet(years_train, graph, V, M):
    """Connect every placeholder node in V to the nodes sharing its attribute.

    For each placeholder node, each edge type and each similar node (see
    GetSimilarNodeSet), both directions of the edge are added with timestep M.
    Nodes of V that are not placeholders contribute nothing by themselves.
    """
    placeholder_edges = set()
    placeholders = (candidate for candidate in V if IsUnknownNode(candidate))
    for placeholder in placeholders:
        attribute = GetAttributeFromUnknownNode(placeholder)
        for edge_type in EDGE_TYPE_LIST:
            for peer in GetSimilarNodeSet(attribute, placeholder, V, years_train, graph):
                placeholder_edges.add((placeholder, peer, edge_type, M))
                placeholder_edges.add((peer, placeholder, edge_type, M))
    return placeholder_edges

########################################################################################################################################

def IsUnknownNode(node):
    """Return True when the text before the first '_' in `node` is 'unknown'."""
    leading_token, _, _ = node.partition('_')
    return leading_token == 'unknown'

def GetAttributeFromUnknownNode(unknown_node):
    """Return the second '_'-separated field of a placeholder node name.

    Placeholder names are built as 'unknown_<attribute>_<year>_<index>' by
    GetUnknownNodeSet, so this field is the attribute.
    """
    fields = unknown_node.split('_')
    return fields[1]

def GetSimilarNodeSet(src_attribute, src_node, V, years_train, graph):
    """Return the nodes of V, other than `src_node`, whose attribute equals `src_attribute`.

    A placeholder node's attribute is parsed from its name; an observed node's
    attribute is looked up in the training years of `graph`.
    """
    def attribute_of(candidate):
        if IsUnknownNode(candidate):
            return GetAttributeFromUnknownNode(candidate)
        return GetAttributeFromObservedNode(candidate, years_train, graph)

    return {
        candidate
        for candidate in V
        if not candidate == src_node and src_attribute == attribute_of(candidate)
    }

# return the latest attribute of node
def GetAttributeFromObservedNode(node, years, graph):
    """Return the attribute `node` has in the last of `years` in which it appears.

    Args:
        node: node name to look up.
        years: iterable of years, scanned in the given order.
        graph: mapping year -> {'nodes': {name: attribute}, ...}.

    Raises:
        KeyError: if `node` is not present in any of `years` (this case used to
            surface as an UnboundLocalError).
    """
    found = False
    latest_attribute = None
    for year in years:
        nodes = graph[year]['nodes']
        # Direct dict lookup instead of scanning every (name, attribute) pair.
        if node in nodes:
            latest_attribute = nodes[node]
            found = True
    if not found:
        raise KeyError("node %r does not appear in any of the given years" % (node,))
    return latest_attribute

########################################################################################################################################

def GenerateInputGraph(V, E, graph, years_train):
    """Combine a node set and an edge set into the input graph dict.

    Returns {'nodes': {node: [attribute per training year]}, 'edges': E}. A
    node that is absent from a training year gets the string 'NULL' for that
    year. E is stored as given, without copying.
    """
    def attribute_in(node, year):
        yearly_nodes = graph[year]['nodes']
        return yearly_nodes[node] if node in yearly_nodes else 'NULL'

    node_histories = {
        node: [attribute_in(node, year) for year in years_train]
        for node in V
    }
    return {'nodes': node_histories, 'edges': E}

def PrintNodeNum(graph, years_test):
    """Print node-count diagnostics for the first test year.

    NOTE(review): this reads the notebook globals `M` and `input_graph`
    rather than taking them as parameters, so it only works after both have
    been defined at module level.
    """
    test_year = years_test[0]
    captions_and_modes = (
        ('all(= stay + return + unknown = all(-1) - lost + return + unknown)', 'all'),
        ('stay', 'stay'),
        ('return', 'return'),
        ('unknown', 'unknown'),
        ('lost (= all(-1) - stay)', 'lost'),
    )
    for caption, mode in captions_and_modes:
        print(caption, len(GetNodes(test_year, graph, mode, M)))
    print('all(-1)', len(GetNodes(test_year - 1, graph, 'all', M)))
    shared_nodes = set(input_graph['nodes']) & GetNodes(test_year, graph, 'all', M)
    print('input_graph & all (= stay + return)', len(shared_nodes))

In [4]:
def GetEncodeDicNode(input_graph):
    """Map every node name of the input graph to a contiguous integer index.

    Indices are assigned in sorted name order. Enumerating a set of strings
    would give an ordering that can change between interpreter runs (string
    hashing is randomized), making the encoded dataset non-reproducible.

    Returns:
        dict node name -> index in range(number of nodes).
    """
    return {name: index for index, name in enumerate(sorted(input_graph['nodes'].keys()))}
    
def GetEncodeDicAttribute(attributes):
    """Map each attribute, followed by the extra label 'NULL', to its index.

    The caller's list is left untouched; 'NULL' receives the last index.
    """
    labels = attributes.copy()
    labels.append('NULL')
    return dict(zip(labels, range(len(labels))))

def EncodeInputGraph(input_graph, encode_dic_node, encode_dic_attribute):
    """Turn the input graph into numpy arrays.

    Args:
        input_graph: {'nodes': {name: [attribute per step]},
            'edges': iterable of (src, dst, edge_type, timestep)}.
        encode_dic_node: node name -> index.
        encode_dic_attribute: attribute -> class index.

    Returns:
        {'nodes': array, 'edges': array}. 'nodes' stacks the per-node lists of
        to_categorical outputs by node index and then swaps the first two axes
        (presumably giving step x node x class -- confirm against
        to_categorical's output). 'edges' holds one node x node adjacency
        matrix for each timestep 0..M.

    NOTE(review): the number of timesteps comes from the notebook global `M`,
    and the timestep position in an edge tuple from the global `timestepIdx`.

    Raises:
        AssertionError: if any per-timestep adjacency matrix is not symmetric.
    """
    class_count = len(encode_dic_attribute)

    # str2num (node)
    nodes_encoded = {}
    for node, attribute_history in input_graph['nodes'].items():
        nodes_encoded[encode_dic_node[node]] = [
            to_categorical(encode_dic_attribute[att], class_count)
            for att in attribute_history
        ]

    # str2num (edge)
    edges_encoded = set()
    for edge in input_graph['edges']:
        edges_encoded.add((encode_dic_node[edge[0]], encode_dic_node[edge[1]], edge[2], edge[3]))

    # dic2npy (node): rows ordered by node index, then axes 0 and 1 swapped
    node_count = len(nodes_encoded)
    nodes_npy = np.array([nodes_encoded[index] for index in range(node_count)]).transpose(1, 0, 2)

    # tuple2npy (edge): one adjacency matrix per timestep
    adjacency_per_step = []
    for timestep in range(M+1):
        adjacency = np.zeros((node_count, node_count))
        for encoded_edge in edges_encoded:
            if encoded_edge[timestepIdx] != timestep:
                continue
            adjacency[encoded_edge[0], encoded_edge[1]] = 1
        assert np.allclose(adjacency.T, adjacency), 'Not an undirected graph'
        adjacency_per_step.append(adjacency)
    edges_npy = np.array(adjacency_per_step)

    return {'nodes': nodes_npy, 'edges': edges_npy}

def GetLabel(input_graph, graph, years_test, M, encode_dic_node, encode_dic_attribute):
    """Build the target arrays for the first test year.

    For every node of the input graph three labels are produced:
        attribute -- the node's attribute in the test year, or 'NULL' when the
                     node is not present that year (encoded via to_categorical)
        lost      -- 1 when GetNodes(..., 'lost') contains the node, else 0
        return    -- 1 when GetNodes(..., 'return') contains the node, else 0

    Args:
        input_graph: dict with a 'nodes' mapping keyed by node name.
        graph: mapping year -> {'nodes': {name: attribute}, ...}.
        years_test: test years; only years_test[0] is used.
        M: look-back length forwarded to GetNodes.
        encode_dic_node: node name -> index.
        encode_dic_attribute: attribute -> class index.

    Returns:
        (label_attribute_npy, label_lost_npy, label_return_npy), each ordered
        by node index.

    Raises:
        AssertionError: if the label counts disagree with the node sets
            reported by GetNodes.
    """

    def GenerateLabel(input_graph, graph, years_test, M):
        test_year = years_test[0]
        # These sets do not depend on the node being labelled, so compute them
        # once instead of three GetNodes calls per node.
        nodes_all = GetNodes(test_year, graph, 'all', M)
        nodes_lost = GetNodes(test_year, graph, 'lost', M)
        nodes_return = GetNodes(test_year, graph, 'return', M)

        label_attribute = {}
        label_lost = {}
        label_return = {}
        for node in input_graph['nodes'].keys():
            if node in nodes_all:
                label_attribute[node] = graph[test_year]['nodes'][node]
            else:
                label_attribute[node] = 'NULL'
            label_lost[node] = 1 if node in nodes_lost else 0
            label_return[node] = 1 if node in nodes_return else 0

        labelled = sum(0 if attribute == 'NULL' else 1 for attribute in label_attribute.values())
        assert labelled == len(set(input_graph['nodes']) & nodes_all), 'input_graph & graph[year_test] is wrong'
        assert sum(label_lost.values()) == len(nodes_lost), 'lost is wrong'
        assert sum(label_return.values()) == len(nodes_return), 'return is wrong'
        return label_attribute, label_lost, label_return

    label_attribute, label_lost, label_return = GenerateLabel(input_graph, graph, years_test, M)

    # str2num
    class_count = len(encode_dic_attribute)
    label_attribute_encoded = {
        encode_dic_node[node]: to_categorical(encode_dic_attribute[attribute], class_count)
        for node, attribute in label_attribute.items()
    }
    label_lost_encoded = {encode_dic_node[node]: flag for node, flag in label_lost.items()}
    label_return_encoded = {encode_dic_node[node]: flag for node, flag in label_return.items()}

    # dic2npy, ordered by node index
    label_attribute_npy = np.array([label_attribute_encoded[i] for i in range(len(label_attribute_encoded))])
    label_lost_npy = np.array([label_lost_encoded[i] for i in range(len(label_lost_encoded))])
    label_return_npy = np.array([label_return_encoded[i] for i in range(len(label_return_encoded))])
    return label_attribute_npy, label_lost_npy, label_return_npy

In [5]:
# Rebuild the graph into one sub-graph per attribute
attribute_graph = RebuildAttributeGraph(graph, list(range(1950, 2018)))

data = []
for year in YEARS:
    # get time step list
    years_train, years_test, years_all = GenerateYearsList(year, M, H)

    # get # of nodes (unknown, return, lost)
    predicted_node_num_unknown_groupby_attribute = GetPredictedNodeNumGroupbyAttribute(years_all, years_train, attribute_graph, 'unknown', M)
    # NOTE(review): the next two predictions are not used further down in this
    # cell; kept in case later cells read them.
    predicted_node_num_return = GetPredictedNodeNum(years_all, years_train, graph, 'return', 'return', M)
    predicted_node_num_lost = GetPredictedNodeNum(years_all, years_train, graph, 'lost', 'lost', M)

    # get input graph
    input_graph = GetInputGraph(graph, years_train, years_test, M, predicted_node_num_unknown_groupby_attribute)

    # encode & one-hot
    encode_dic_node = GetEncodeDicNode(input_graph)
    encode_dic_attribute = GetEncodeDicAttribute(ATTRIBUTES)
    input_graph_encoded = EncodeInputGraph(input_graph, encode_dic_node, encode_dic_attribute)

    # get label
    label_attribute, label_lost, label_return = GetLabel(input_graph, graph, years_test, M, encode_dic_node, encode_dic_attribute)

    data.append((M, year, input_graph_encoded['nodes'], input_graph_encoded['edges'], label_attribute, label_lost, label_return))
    print(year)

# Each sample mixes scalars with arrays of differing shapes, so the container
# must be an explicit object array; recent NumPy versions raise ValueError when
# asked to infer a ragged array implicitly.
np.save(OutputDir + '/M_' + str(M) + '/data.npy', np.array(data, dtype=object))

1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
