In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle


In [2]:
def create_graph(path):
    """Build an undirected graph from a whitespace-separated edge list file.

    Each data line must start with two integer node ids; any further
    columns on the line are ignored. Blank lines and lines whose first
    field starts with '#' are skipped, so a trailing newline or a comment
    header in the file does not abort parsing.

    Parameters
    ----------
    path : str or path-like
        Location of the edge list file.

    Returns
    -------
    networkx.Graph
        Undirected graph built from the edges listed in the file.

    Raises
    ------
    ValueError
        If one of the first two fields of a data line is not an integer.
    IndexError
        If a data line has only one field.
    """
    edges = []
    with open(path) as f:
        for line in f:
            fields = line.split()
            # An empty or whitespace-only line yields no fields; comment
            # lines carry no edge either.
            if not fields or fields[0].startswith('#'):
                continue
            edges.append((int(fields[0]), int(fields[1])))

    G = nx.Graph()
    G.add_edges_from(edges)
    return G

In [3]:
def dict_to_df(dictionary, name):
    """Turn a node -> value mapping into a two-column DataFrame.

    Parameters
    ----------
    dictionary : mapping or iterable of (node, value) pairs
        Anything accepted by ``dict()``.
    name : str
        Label for the value column.

    Returns
    -------
    pandas.DataFrame
        One row per entry, with columns 'node' and ``name``.
    """
    rows = [(node, value) for node, value in dict(dictionary).items()]
    return pd.DataFrame(rows, columns=['node', name])

In [4]:
def create_dataset(nodes_label, edges_path='./soc-edges.txt'):
    """Attach graph-structural features to the labelled node table.

    The graph is read from ``edges_path`` and one column per feature is
    merged onto ``nodes_label`` using the 'node' column as the key.

    Parameters
    ----------
    nodes_label : pandas.DataFrame
        Table with a 'node' column; its other columns are carried through.
    edges_path : str or path-like, optional
        Edge list file handed to ``create_graph``. Defaults to
        './soc-edges.txt'.

    Returns
    -------
    pandas.DataFrame
        ``nodes_label`` extended with the columns 'degree', 'betweenness',
        'closeness' and 'clustering'. The merges use pandas' default inner
        join, so rows whose 'node' does not occur in the graph are dropped.
    """
    G = create_graph(edges_path)

    # (column name, per-node values), in the order the columns are appended.
    features = [
        ('degree', nx.degree(G)),
        ('betweenness', nx.betweenness_centrality(G)),
        # ('eigenvector', nx.eigenvector_centrality(G)),  # disabled
        ('closeness', nx.closeness_centrality(G)),
        ('clustering', nx.clustering(G)),
    ]

    data = nodes_label
    for name, values in features:
        data = pd.merge(data, dict_to_df(values, name=name), on=['node'])
    return data

In [5]:
def main():
    """Train a random forest on graph features and print its test accuracy.

    Reads the labelled node table from './soc-nodes.txt', adds the feature
    columns produced by ``create_dataset``, fits the classifier on the rows
    whose 'partition' is 'train' and prints the fraction of correctly
    classified rows whose 'partition' is 'test'.
    """
    # Feature extraction lives in create_dataset: it takes the undirected
    # graph read from the edge list and returns a DataFrame with one column
    # named 'node' plus one column per node feature.
    nodes_label = pd.read_csv('./soc-nodes.txt')
    data = create_dataset(nodes_label)

    # splitting data into training and test sets
    train = data[data['partition'] == 'train']
    test = data[data['partition'] == 'test']

    # Author's note: the accuracy of the random forest is related to the
    # order of the training samples; if random_state=0 is set in the model,
    # shuffle the data. The shuffle is left disabled:
    # train = shuffle(train)
    # test = shuffle(test)

    # prepare data for model: everything except ids, labels and the split
    # marker is used as a feature
    non_feature_columns = ['node', 'class', 'partition']
    X_train = train.drop(non_feature_columns, axis=1)
    X_test = test.drop(non_feature_columns, axis=1)
    y_train = train['class']
    y_test = test['class']

    # training the model
    model = RandomForestClassifier(max_depth=2, random_state=0)
    model.fit(X_train, y_train)

    # make prediction
    test_prediction = model.predict(X_test)

    # calculate the accuracy: compare predictions and labels element-wise
    # in one pass instead of rebuilding a Python list for every test row
    true_predicted = int(np.count_nonzero(test_prediction == y_test.to_numpy()))
    numberOfTestNodes = len(y_test)

    print('Accuracy: ', true_predicted/numberOfTestNodes)

# Run the pipeline when this code is executed directly.
if __name__ == '__main__':
    main()

Accuracy:  0.875
