In [None]:
import numpy as np
import pandas as pd
import datetime
import pickle
import re
from collections import defaultdict

from matplotlib import pylab as plt
import seaborn as sns
%matplotlib inline
sns.set()

from sklearn import linear_model
from keras.utils import to_categorical

In [None]:
# Locations of the pickled source graph and of the encoded output.
InputDir = "/home/onimem/work/Open_world/NBA/Graph/multiattribute"
OutputDir = "/home/onimem/work/Open_world/NBA/DATA/"

# Years used to index the loaded graph.
ALL_YEARS = list(range(1950, 2018))

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(InputDir + '/nba.graph', 'rb') as f:
    graph_all_attribute = pickle.load(f)

# Node attributes kept for every node, in this order:
#   attribute0: Tm, attribute1: Pos, attribute2: Age, attribute3: PTS
# attribute0 is the primary attribute.
n_attributes = 4
primary_attributeIdx = 0
ATTRIBUTE_KEY = ['Tm', 'Pos', 'Age', 'PTS']

# Allowed values per attribute; ['CONTINUOUS VALUE'] marks a numeric attribute.
ATTRIBUTE0 = [
    'ATL', 'BRK', 'BOS', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
    'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK',
    'OKC', 'ORL', 'PHO', 'PHI', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
    'NULL',
]
ATTRIBUTE1 = [
    'C', 'C-F', 'F', 'F-C', 'F-G', 'G', 'G-F',
    'PF', 'PG', 'SF', 'SG', 'NULL',
]
ATTRIBUTE2 = ['CONTINUOUS VALUE']
ATTRIBUTE3 = ['CONTINUOUS VALUE']
ATTRIBUTE_VALUE = [ATTRIBUTE0, ATTRIBUTE1, ATTRIBUTE2, ATTRIBUTE3]

# There is only one edge type; the time step is stored as the edge label
# at position `timestepIdx` of each edge.
EDGE_TYPE = 'team'
timestepIdx = 2

# Predict one future step from the past L steps.
L = 2
# First training year of every window handled by the main loop.
YEARS = list(range(1951 + L, 2018 - L))

In [None]:
def GetIndex(value, list):
    """Return the 1-based position of `value` in `list`, or 0 when absent.

    The 0 result is falsy, so the return value doubles as a "found" flag.
    The second parameter shadows the builtin `list`; the name is kept so
    existing callers are unaffected.
    """
    return list.index(value) + 1 if value in list else 0

# (select attributes) Build Time Series Attributed Graph
# For every year, keep only the attributes named in ATTRIBUTE_KEY. Each node
# gets a fixed-length list ordered like ATTRIBUTE_KEY; attributes missing from
# the source stay None. Edges are carried over unchanged.
graph = {}
for year in ALL_YEARS:
    source_year = graph_all_attribute[year]
    selected_nodes = {}
    for node, attribute in source_year['nodes'].items():
        values = [None] * n_attributes
        for att in attribute.keys():
            idx = GetIndex(att, ATTRIBUTE_KEY)
            if idx:
                # GetIndex is 1-based so that 0 can mean "not selected".
                values[idx-1] = attribute[att]
        selected_nodes[node] = values
    graph[year] = {'nodes': selected_nodes, 'edges': source_year['edges']}

In [None]:
def RebuildAttributeGraph(graph, years):
    """Split `graph` into one sub-graph per value of the primary attribute.

    Args:
        graph: mapping year -> {'nodes': {name: attribute_list},
            'edges': {EDGE_TYPE: iterable of edges}}.
        years: years to include in every sub-graph.

    Returns:
        dict attribute -> year -> {'nodes': {...}, 'edges': {EDGE_TYPE: [...]}}.
        A sub-graph holds the nodes whose primary attribute equals `attribute`
        (sharing the original attribute lists) and the edges whose two
        endpoints (edge[0], edge[1]) are both among those nodes.
    """
    attribute_graph = {}
    for attribute in ATTRIBUTE_VALUE[primary_attributeIdx]:
        attribute_graph[attribute] = {}
        for year in years:
            nodes = {
                name: attribute_list
                for name, attribute_list in graph[year]['nodes'].items()
                if attribute == attribute_list[primary_attributeIdx]
            }
            # Membership is tested against the dict itself (hash lookup)
            # instead of a list of names, which was O(#nodes) per edge.
            edges = [
                edge
                for edge in graph[year]['edges'][EDGE_TYPE]
                if edge[0] in nodes and edge[1] in nodes
            ]
            attribute_graph[attribute][year] = {
                'nodes': nodes,
                'edges': {EDGE_TYPE: edges},
            }
    return attribute_graph

def GenerateYearsList(year, L):
    """Build the year window that starts at `year`.

    Returns:
        (years_train, year_test, years_all): the L consecutive training years
        starting at `year`, the single year right after them, and a new list
        holding the training years followed by the test year.
    """
    years_train = [year + offset for offset in range(L)]
    year_test = years_train[-1] + 1
    years_all = years_train + [year_test]
    return years_train, year_test, years_all

########################################################################################################################################

def LinearRegression(years_all, years_train, node_num_train):
    """Fit a linear model on the training years and evaluate it on `years_all`.

    Args:
        years_all: years to predict for.
        years_train: years used as the single input feature.
        node_num_train: target value for each training year.

    Returns:
        Flat list with one predicted value per entry of `years_all`.
    """
    # Learn
    features = pd.DataFrame(years_train)
    targets = pd.DataFrame(node_num_train)
    regressor = linear_model.LinearRegression()
    regressor.fit(features, targets)
    # predict (sklearn requires a 2D array, so add a feature axis)
    query = np.array(years_all)[:, np.newaxis]
    predictions = regressor.predict(query)
    return predictions.reshape(-1).tolist()

def PlotHistory2(history, x, y1, y2, title, ylabel):
    """Plot the two series `history[y1]` and `history[y2]` against `history[x]`.

    `x`, `y1` and `y2` are keys of `history`; they are also used as the
    x-axis label and the legend labels. The figure is shown immediately.
    """
    for series, color in ((y1, '#1ea8b5'), (y2, '#ce405f')):
        plt.plot(history[x], history[series], marker=".", color=color, label=series)
    plt.title(title)
    plt.xlabel(x)
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

In [None]:
def GetNodes(year, graph, mode='all', L=0):
    """Return a set of node names of `year`, selected relative to `year - 1`.

    Args:
        year: year of interest; both graph[year] and graph[year-1] are read.
        graph: mapping year -> {'nodes': collection of node names, ...}.
        mode: which nodes to return:
            'all'     - every node of `year`;
            'stay'    - nodes present in both `year - 1` and `year`;
            'lost'    - nodes of `year - 1` that are absent from `year`;
            'unknown' - nodes new in `year` that were not observed in the
                        L years ending at `year - 1`;
            'return'  - nodes new in `year` that were observed in the
                        L years ending at `year - 1`.
        L: number of past years passed to GetObservedNodeSet
            (only used by 'unknown' and 'return').

    Returns:
        set of node names.

    Raises:
        AssertionError: if either year contains duplicate node names.
        ValueError: if `mode` is not one of the values listed above.
    """
    def IsUnique(nodes):
        return len(nodes) == len(set(nodes))

    nodes_previous = graph[year-1]['nodes']
    nodes_current = graph[year]['nodes']
    # Check both years (the current year used to be skipped by mistake).
    assert IsUnique(nodes_previous) and IsUnique(nodes_current), 'some nodes are duplicate'

    previous = set(nodes_previous)
    current = set(nodes_current)

    if mode == 'all':
        return current
    if mode == 'stay':
        return previous & current
    if mode == 'lost':
        return previous - current
    if mode in ('unknown', 'return'):
        new_nodes = current - previous
        observed = GetObservedNodeSet(graph, year-1, L)
        if mode == 'unknown':
            return new_nodes - observed
        # new - (new - observed) is simply the intersection.
        return new_nodes & observed
    raise ValueError("unknown mode: {!r}".format(mode))

def GetObservedNodeSet(graph, year, L):
    """Return the union of the node names of the L years ending at `year`.

    Reads graph[year], graph[year-1], ..., graph[year-L+1]; the result is an
    empty set when L is 0.
    """
    return set().union(*(graph[year - back]['nodes'] for back in range(L)))

########################################################################################################################################

def GetNodeNum(year, graph, mode='all', L=0):
    """Return, as a float, how many nodes GetNodes selects for `year`."""
    selected = GetNodes(year, graph, mode, L)
    return float(len(selected))

def GetNodeNumList(years, graph, mode='all', L=0):
    """Return the GetNodeNum count of every year in `years`, in order."""
    return [GetNodeNum(year, graph, mode, L) for year in years]

def GetPredictedNodeNum(years_all, years_train, graph, plot_title='title', mode='all', L=0):
    """Predict the node count of the year that follows the training years.

    A linear model is fitted on the per-year node counts of `years_train`
    and evaluated on `years_all`.

    Args:
        years_all: training years followed by the test year.
        years_train: years whose node counts are used for fitting.
        graph: mapping year -> {'nodes': ...} as consumed by GetNodes.
        plot_title: unused; kept for backward compatibility with callers
            that pass a title positionally.
        mode: node selection mode forwarded to GetNodes.
        L: history length forwarded to GetNodes.

    Returns:
        The prediction at position len(years_train) of `years_all`,
        rounded with the built-in round().
    """
    # Only the training counts are needed for fitting; the counts of
    # `years_all` were previously computed just for a disabled debug plot.
    node_num_train = GetNodeNumList(years_train, graph, mode, L)
    # learning & predict (LinearRegression)
    predicted_node_num_all = LinearRegression(years_all, years_train, node_num_train)
    # take out only the test year (first entry after the training years)
    return round(predicted_node_num_all[len(years_train)])

def GetPredictedNodeNumGroupbyAttribute(years_all, years_train, attribute_graph, mode='all', L=0):
    """Run GetPredictedNodeNum on every per-attribute sub-graph.

    Returns:
        dict attribute -> predicted node count; the attribute is also
        passed on as the plot title.
    """
    return {
        attribute: GetPredictedNodeNum(years_all, years_train, sub_graph, attribute, mode, L)
        for attribute, sub_graph in attribute_graph.items()
    }

In [None]:
def GetInputGraph(graph, years_train, year_test, L, predicted_node_num_unknown_groupby_attribute):
    """Assemble the input graph for one training window.

    Builds the node set (observed nodes plus placeholder unknown nodes),
    then the edge set over those nodes, and finally combines both with the
    stored node attributes of the training years.
    """
    node_set = GenerateNodeSetOfInputGraph(
        graph, years_train, year_test, L, predicted_node_num_unknown_groupby_attribute)
    edge_set = GenerateEdgeSetOfInputGraph(graph, years_train, L, node_set)
    return GenerateInputGraph(node_set, edge_set, graph, years_train)

########################################################################################################################################

def GenerateNodeSetOfInputGraph(graph, years_train, year_test, L, predicted_node_num_unknown_groupby_attribute):
    """Return the nodes of the input graph.

    The result is the union of the nodes observed in the L years ending at
    the last training year and the placeholder nodes generated for the
    test year.
    """
    observed_nodes = GetObservedNodeSet(graph, years_train[-1], L)
    unknown_nodes = GetUnknownNodeSet(predicted_node_num_unknown_groupby_attribute, year_test)
    return observed_nodes | unknown_nodes

def GetUnknownNodeSet(predicted_node_num_unknown_groupby_attribute, year_test):
    """Create placeholder names for the predicted unknown nodes.

    For every attribute with predicted count n, the names
    'unknown_<attribute>_<year_test>_<i>' for i in 0..n-1 are generated.

    Returns:
        set of placeholder node names.
    """
    return {
        '_'.join(('unknown', attribute, str(year_test), str(i)))
        for attribute, count in predicted_node_num_unknown_groupby_attribute.items()
        for i in range(count)
    }

########################################################################################################################################

def GenerateEdgeSetOfInputGraph(graph, years_train, L, V):
    """Return the edges of the input graph.

    The result is the union of the edges observed in the L years ending at
    the last training year and the edges that attach the unknown nodes of
    `V` to the nodes sharing their attribute.
    """
    observed_edges = GetObservedEdgeSet(graph, years_train[-1], L)
    unknown_edges = GetUnknownEdgeSet(years_train, graph, V, L)
    return observed_edges | unknown_edges

def GetObservedEdgeSet(graph, year, L):
    """Collect the edges of the L years ending at `year`, relabelled by time step.

    Each edge becomes (edge[0], edge[1], step); `year` gets step L-1 and
    every earlier year gets one step less, down to 0 for `year - L + 1`.
    """
    return {
        (edge[0], edge[1], L - (back + 1))
        for back in range(L)
        for edge in graph[year - back]['edges'][EDGE_TYPE]
    }

def GetUnknownEdgeSet(years_train, graph, V, L):
    """Connect every unknown node of `V` to the nodes sharing its attribute.

    For each unknown node u and each other node v of `V` with the same
    primary attribute, both directed edges (u, v, L) and (v, u, L) are added.

    Args:
        years_train: years searched for the latest attribute of observed nodes.
        graph: mapping year -> {'nodes': {name: attribute_list}, ...}.
        V: node names of the input graph (observed and unknown).
        L: time-step label given to the generated edges.

    Returns:
        set of (source, target, L) tuples; empty when `V` has no unknown node.
    """
    unknown_nodes = [node for node in V if IsUnknownNode(node)]
    if not unknown_nodes:
        # Nothing to connect; observed nodes are not looked up at all.
        return set()

    # Resolve the attribute of every node once and bucket nodes by it,
    # instead of re-deriving all attributes for each unknown node.
    nodes_by_attribute = defaultdict(set)
    for node in V:
        if IsUnknownNode(node):
            attribute = GetAttributeFromUnknownNode(node)
        else:
            attribute = GetAttributeFromObservedNode(node, years_train, graph)
        nodes_by_attribute[attribute].add(node)

    unknown_edge_set = set()
    for unknown_node in unknown_nodes:
        attribute = GetAttributeFromUnknownNode(unknown_node)
        for similar_node in nodes_by_attribute[attribute]:
            if similar_node == unknown_node:
                continue
            unknown_edge_set.add((unknown_node, similar_node, L))
            unknown_edge_set.add((similar_node, unknown_node, L))
    return unknown_edge_set

########################################################################################################################################

def IsUnknownNode(node):
    """Return True when the text before the first '_' of `node` is 'unknown'."""
    prefix, _, _ = node.partition('_')
    return prefix == 'unknown'

def GetAttributeFromUnknownNode(unknown_node):
    """Return the second '_'-separated field of `unknown_node`.

    For names shaped like 'unknown_<attribute>_<year>_<i>' this is the
    attribute. Raises IndexError when the name contains no '_'.
    """
    fields = unknown_node.split('_', 2)
    return fields[1]

def GetSimilarNodeSet(src_attribute, src_node, V, years_train, graph):
    """Return the nodes of `V`, other than `src_node`, whose attribute equals `src_attribute`.

    The attribute of an unknown node is parsed from its name; the attribute
    of an observed node is looked up in the training years.
    """
    def attribute_of(node):
        if IsUnknownNode(node):
            return GetAttributeFromUnknownNode(node)
        return GetAttributeFromObservedNode(node, years_train, graph)

    similar_node_set = set()
    for candidate in V:
        if candidate == src_node:
            continue
        if src_attribute == attribute_of(candidate):
            similar_node_set.add(candidate)
    return similar_node_set

def GetAttributeFromObservedNode(node, years, graph):
    """Return the latest primary attribute of `node`.

    Args:
        node: node name to look up.
        years: years to search, in order; the value from the last year in
            which the node appears is returned.
        graph: mapping year -> {'nodes': {name: attribute_list}, ...}.

    Returns:
        The entry at `primary_attributeIdx` of the node's attribute list.

    Raises:
        KeyError: if `node` does not appear in any of `years`.
    """
    found = False
    latest_attribute = None
    for year in years:
        nodes = graph[year]['nodes']
        # Direct key lookup instead of scanning every (name, attribute) pair.
        if node in nodes:
            latest_attribute = nodes[node][primary_attributeIdx]
            found = True
    if not found:
        raise KeyError('node {!r} not found in years {!r}'.format(node, list(years)))
    return latest_attribute

########################################################################################################################################

def GetStoredAttributes(node, node_set):
    """Return the attribute list stored for `node`, or a default one.

    When `node` is a key of `node_set`, the stored list itself is returned.
    Otherwise a new list of length `n_attributes` is built, holding 0 for
    attributes marked 'CONTINUOUS VALUE' and 'NULL' for all others.
    """
    if node in node_set.keys():
        return node_set[node]
    return [
        0 if ATTRIBUTE_VALUE[idx][0] == 'CONTINUOUS VALUE' else 'NULL'
        for idx in range(n_attributes)
    ]

def GenerateInputGraph(V, E, graph, years_train):
    """Combine node set `V` and edge set `E` into the input graph.

    Returns:
        {'nodes': {node: [attributes per training year]}, 'edges': E}, where
        each per-year entry comes from GetStoredAttributes (the stored list,
        or defaults when the node is absent in that year).
    """
    nodes = {
        node: [GetStoredAttributes(node, graph[year]['nodes']) for year in years_train]
        for node in V
    }
    return {'nodes': nodes, 'edges': E}

In [None]:
def GetEncodeDicNode(input_graph):
    """Map every node name of `input_graph` to an integer index 0..n-1.

    Indices follow the sorted order of the node names, so the mapping is
    identical on every run. (Enumerating a set of strings gives an order
    that can change between interpreter sessions.)
    """
    return {value: idx for idx, value in enumerate(sorted(input_graph['nodes'].keys()))}

def GetEncodeDicAttribute(attribute_value_list, input_graph):
    """Build one encoder description per attribute.

    For an attribute whose value list starts with 'CONTINUOUS VALUE', the
    entry is {'mean': ..., 'std': ...}, computed over that attribute of all
    nodes and time steps of `input_graph`. For any other attribute, the
    entry maps each allowed value to its position in the value list.

    Returns:
        list of dicts, ordered like `attribute_value_list`.
    """
    encode_dic_list = []
    for position, attribute_value in enumerate(attribute_value_list):
        if attribute_value[0] != 'CONTINUOUS VALUE':
            encode_dic_list.append({value: code for code, value in enumerate(attribute_value)})
            continue
        observed = np.array([
            attributes[position]
            for temporal_attributes in input_graph['nodes'].values()
            for attributes in temporal_attributes
        ])
        encode_dic_list.append({'mean': observed.mean(), 'std': observed.std()})
    return encode_dic_list

def EncodeNodeAttribute(input_nodes, encode_dic_node, encode_dic_attribute_list, onehot=True):
    """Encode the temporal attributes of every node into one numeric array.

    Args:
        input_nodes: dict node -> list (one entry per time step) of
            attribute lists.
        encode_dic_node: dict node -> row index; the indices of the nodes in
            `input_nodes` must be exactly 0..len(input_nodes)-1.
        encode_dic_attribute_list: one dict per attribute. A dict whose keys
            are exactly {'mean', 'std'} standardises the value; any other
            dict maps a value to its integer code.
        onehot: when True, coded attributes are expanded with to_categorical;
            when False, the integer code is stored.

    Returns:
        numpy array of shape (n_nodes, n_timestep, n_features), with rows
        ordered by node index.

    Raises:
        ValueError: if `input_nodes` is empty.
    """
    if not input_nodes:
        # The final reshape needs at least one node to know n_timestep.
        raise ValueError('input_nodes is empty; nothing to encode')

    nodes_encoded = {}
    for node, temporal_attributes in input_nodes.items():
        n_timestep = len(temporal_attributes)
        rows = []
        for attributes in temporal_attributes:
            # str2num & normalize
            encoded = [None for _ in range(n_attributes)]
            for idx, encode_dic_attribute in enumerate(encode_dic_attribute_list):
                if set(['mean', 'std']) == encode_dic_attribute.keys():
                    mean = encode_dic_attribute['mean']
                    std = encode_dic_attribute['std']
                    # A constant attribute has std == 0; dividing by it would
                    # produce NaN/inf, so leave the centred value unscaled.
                    scale = std if std != 0 else 1.0
                    encoded[idx] = (attributes[idx] - mean) / scale
                elif onehot:
                    encoded[idx] = to_categorical(encode_dic_attribute[attributes[idx]], len(encode_dic_attribute)).tolist()
                else:
                    encoded[idx] = encode_dic_attribute[attributes[idx]]
            # flatten: one-hot lists are spliced in, scalars appended
            row = []
            for value in encoded:
                row.extend(value if isinstance(value, list) else [value])
            rows.append(row)
        nodes_encoded[encode_dic_node[node]] = rows

    # dic2npy (rows ordered by node index)
    n_nodes = len(nodes_encoded)
    nodes_npy = np.array([nodes_encoded[i] for i in range(n_nodes)]).reshape(n_nodes, n_timestep, -1).copy()
    return nodes_npy

def EncodeInputGraph(input_graph, encode_dic_node, encode_dic_attribute_list, L):
    """Encode the nodes and edges of `input_graph` as numpy arrays.

    Returns:
        {'nodes': array from EncodeNodeAttribute,
         'edges': array of shape (n_nodes, n_nodes * (L + 1))}.
        An edge (src, tgt, t) sets entry [tgt, t * n_nodes + src] to 1, so
        each time step occupies its own block of n_nodes columns.
    """
    # node encode
    nodes_npy = EncodeNodeAttribute(input_graph['nodes'], encode_dic_node, encode_dic_attribute_list)

    # edge encode
    n_nodes = nodes_npy.shape[0]
    edges_encoded = np.zeros([n_nodes, n_nodes * (L + 1)])
    for edge in input_graph['edges']:
        source = encode_dic_node[edge[0]]
        target = encode_dic_node[edge[1]]
        column = edge[timestepIdx] * n_nodes + source
        edges_encoded[target, column] = 1

    return {'nodes': nodes_npy, 'edges': edges_encoded}

########################################################################################################################################

def GetLabel(input_graph, graph, year_test, L, encode_dic_node, encode_dic_attribute_list):
    """Build the training labels of the test year for every input-graph node.

    Args:
        input_graph: graph whose 'nodes' keys define which nodes get a label.
        graph: mapping year -> {'nodes': ..., ...} holding the true data.
        year_test: year the labels are taken from.
        L: history length forwarded to GetNodes.
        encode_dic_node: dict node -> row index.
        encode_dic_attribute_list: per-attribute encoders, as used by
            EncodeNodeAttribute.

    Returns:
        (label_attribute_npy, label_lost_npy, label_return_npy):
        the encoded (non one-hot) attributes of the test year and two 0/1
        arrays telling whether the node is in the 'lost' / 'return' set of
        GetNodes. All three are ordered by node index.
    """
    # These sets do not depend on the node, so compute them once instead of
    # once per node inside the loop.
    lost_nodes = GetNodes(year_test, graph, 'lost', L)
    return_nodes = GetNodes(year_test, graph, 'return', L)

    label_attribute = {}
    label_lost = {}
    label_return = {}
    for node in input_graph['nodes'].keys():
        # Wrapped in a list: EncodeNodeAttribute expects a time-step axis.
        label_attribute[node] = [GetStoredAttributes(node, graph[year_test]['nodes'])]
        label_lost[node] = 1 if node in lost_nodes else 0
        label_return[node] = 1 if node in return_nodes else 0

    # Drop the single time-step axis again after encoding.
    label_attribute_npy = EncodeNodeAttribute(label_attribute, encode_dic_node, encode_dic_attribute_list, onehot=False)[:,0]
    # str2num
    label_lost_encoded = {encode_dic_node[node]: bool_value for node, bool_value in label_lost.items()}
    label_return_encoded = {encode_dic_node[node]: bool_value for node, bool_value in label_return.items()}
    # dic2npy (ordered by node index)
    label_lost_npy = np.array([label_lost_encoded[i] for i in range(len(label_lost_encoded))])
    label_return_npy = np.array([label_return_encoded[i] for i in range(len(label_return_encoded))])
    return label_attribute_npy, label_lost_npy, label_return_npy

In [None]:
# Rebuild graph to primary attribute graph (one sub-graph per primary attribute value)
attribute_graph = RebuildAttributeGraph(graph, ALL_YEARS)

save_graph_data = []
for year in YEARS:
    # get time step list
    years_train, year_test, years_all = GenerateYearsList(year, L)

    # get # of nodes (unknown, return, lost)
    predicted_node_num_unknown_groupby_attribute = GetPredictedNodeNumGroupbyAttribute(years_all, years_train, attribute_graph, 'unknown', L)
    predicted_node_num_return = GetPredictedNodeNum(years_all, years_train, graph, 'return', 'return', L)
    predicted_node_num_lost = GetPredictedNodeNum(years_all, years_train, graph, 'lost', 'lost', L)

    # get input graph
    input_graph = GetInputGraph(graph, years_train, year_test, L, predicted_node_num_unknown_groupby_attribute)

    # encode & one-hot
    encode_dic_node = GetEncodeDicNode(input_graph)
    encode_dic_attribute_list = GetEncodeDicAttribute(ATTRIBUTE_VALUE, input_graph)
    input_graph_encoded = EncodeInputGraph(input_graph, encode_dic_node, encode_dic_attribute_list, L)

    # get label
    label_attribute, label_lost, label_return = GetLabel(input_graph, graph, year_test, L, encode_dic_node, encode_dic_attribute_list)

    print(input_graph_encoded['nodes'].shape)
    print(input_graph_encoded['edges'].shape)
    print(label_attribute.shape)
    print(label_lost.shape)
    print(label_return.shape)

    save_graph_data.append((input_graph_encoded['nodes'], input_graph_encoded['edges'], label_attribute, label_lost, label_return, encode_dic_node, encode_dic_attribute_list, L, year, predicted_node_num_return, predicted_node_num_lost))

# Every record mixes arrays of different shapes with dicts and scalars, so the
# container has to be an object array: without dtype=object recent NumPy
# versions refuse to build such a ragged array.
# The file is pickled; read it back with np.load(..., allow_pickle=True).
np.save(OutputDir + '/L_' + str(L) + '_save_graph_data.npy', np.array(save_graph_data, dtype=object))
