# Machine Learning On Graphs - Homework 2
## Mohammad Bahrami - 9724133

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.ensemble import RandomForestClassifier

## Question 3

### Part C

In [2]:
# 3x3 matrix for the PageRank example; every column sums to 1.
A = np.array(
    [
        (0.3, 0.4, 0.5),
        (0.3, 0.4, 0.3),
        (0.4, 0.2, 0.2),
    ],
    dtype=np.float64,
)

In [3]:
# Eigen-decomposition of A. np.linalg.eig stores the eigenvector that belongs
# to eivals[k] in column k of eivecs, so each vector is read as a column.
eivals, eivecs = np.linalg.eig(A)
print('Eigen Values and their respective Eigen Vector:')
for k in range(len(eivals)):
    eival, evec = eivals[k], eivecs[:, k]
    print(f'\tEigen Value: {eival: .3f} - Eigen Vector: {evec}')

Eigen Values and their respective Eigen Vector:
	Eigen Value:  1.000 - Eigen Vector: [-0.66742381 -0.57207755 -0.47673129]
	Eigen Value: -0.200 - Eigen Vector: [-7.07106781e-01 -2.75020947e-17  7.07106781e-01]
	Eigen Value:  0.100 - Eigen Vector: [ 0.26726124 -0.80178373  0.53452248]


In [4]:
# Scale every eigenvector to unit Euclidean length before printing.
# The eigenvectors are the COLUMNS of eivecs, so the per-column norms must be
# divided out while the vectors are still columns. Dividing eivecs.T by the
# same norm vector would broadcast along the wrong axis and scale component k
# of every vector by the norm of vector k.
unit_eivecs = eivecs / np.linalg.norm(eivecs, axis=0, keepdims=True)

print('Eigen Values and their respective Normalized Eigen Vector:')
for eival, evec in zip(eivals, unit_eivecs.T):
    print(f'\tEigen Value: {eival: .3f} - Eigen Vector: {evec}')

Eigen Values and their respective Normalized Eigen Vector:
	Eigen Value:  1.000 - Eigen Vector: [-0.66742381 -0.57207755 -0.47673129]
	Eigen Value: -0.200 - Eigen Vector: [-7.07106781e-01 -2.75020947e-17  7.07106781e-01]
	Eigen Value:  0.100 - Eigen Vector: [ 0.26726124 -0.80178373  0.53452248]


### Part D

In [5]:
# Power iteration: start from the uniform vector and apply A twenty times,
# then rescale the result to unit Euclidean length so it can be compared with
# the eigenvectors printed above.
n_pages = A.shape[0]
r = np.full((n_pages, 1), 1 / n_pages)
for step in range(20):
    r = A @ r
r = r / np.linalg.norm(r)

print('Page Rank Result: ')
print(r)

Page Rank Result: 
[[0.66742381]
 [0.57207755]
 [0.47673129]]


### Part E

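We can see that the PageRank result matches the eigenvector associated with the eigenvalue 1, up to sign: `np.linalg.eig` returned that eigenvector with all-negative components, and an eigenvector is only defined up to a non-zero scalar factor.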

## Question 4

In [6]:
def _scores_to_column(scores, node_ids):
    """Return the values of the mapping ``scores`` as an (n, 1) array ordered like ``node_ids``."""
    return np.array([scores[node] for node in node_ids]).reshape(-1, 1)


edge_index = np.loadtxt('soc-edges.txt')
nodes_df = pd.read_csv('soc-nodes.txt')
node_ids = nodes_df['node'].to_numpy()

g = nx.Graph()
# Register the ids from the `node` column rather than the positional index of
# the DataFrame: if the ids are not exactly 0..n-1, the positional index would
# add phantom isolated nodes and distort the centrality values.
g.add_nodes_from(node_ids)
g.add_edges_from(edge_index)

# One (n, 1) column per structural feature, each aligned with nodes_df['node'].
degree_vec = _scores_to_column(dict(g.degree), node_ids)
eigen_vec = _scores_to_column(nx.eigenvector_centrality(g), node_ids)
closeness_vec = _scores_to_column(nx.closeness_centrality(g), node_ids)
betweenness_vec = _scores_to_column(nx.betweenness_centrality(g), node_ids)
pagerank_vec = _scores_to_column(nx.pagerank(g), node_ids)
cluster_vec = _scores_to_column(nx.clustering(g), node_ids)

In [7]:
# Assemble one row per node with its structural features. The feature vectors
# are (n, 1) columns, so they are flattened before becoming DataFrame columns.
# NOTE(review): eigen_vec is computed earlier but is not used as a feature
# here - confirm that leaving it out is intended.
data = pd.DataFrame({
    'node': nodes_df['node'],
    'degree': np.ravel(degree_vec),
    'closeness': np.ravel(closeness_vec),
    'betweenness': np.ravel(betweenness_vec),
    'clustering': np.ravel(cluster_vec),
    'pagerank': np.ravel(pagerank_vec),
})

# The node table was already loaded from soc-nodes.txt into nodes_df, so it is
# reused instead of reading the file a second time.
nodes_label = nodes_df
# adding data labels to data
data = pd.merge(data, nodes_label, on=['node'])

# splitting data to training and test sets
train = data[data['partition'] == 'train']
test = data[data['partition'] == 'test']

# prepare data for model: everything except the id, the label and the split marker
non_feature_columns = ['node', 'class', 'partition']
X_train = train.drop(non_feature_columns, axis=1)
X_test = test.drop(non_feature_columns, axis=1)
y_train = train['class']
y_test = test['class']

# training the model
model = RandomForestClassifier(max_depth=2, random_state=0)
model.fit(X_train, y_train)

# make prediction
test_prediction = model.predict(X_test)

# calculate the accuracy with one vectorized comparison instead of rebuilding
# list(y_test) on every loop iteration (which was quadratic in the test size)
true_predicted = int(np.sum(test_prediction == y_test.to_numpy()))
numberOfTestNodes = len(y_test)

print('Accuracy: ', true_predicted/numberOfTestNodes)

Accuracy:  0.87


## Question 5

In [8]:
# Load the edge list and build a directed graph from it; each row of the
# array is treated as one (source, target) pair.
edge_index = np.loadtxt('soc-wiki-vote.txt')
g = nx.from_edgelist(edge_index, create_using=nx.DiGraph)

In [9]:
def get_pagerank(g: nx.DiGraph, beta: float, threshold: float, max_iterations: int=100) -> tuple:
    """Compute PageRank scores of ``g`` by power iteration on a dense matrix.

    Parameters
    ----------
    g : nx.DiGraph
        Graph to rank. Scores are reported in the order of ``g.nodes``.
    beta : float
        Weight given to following links; the remaining ``1 - beta`` is spread
        uniformly over all nodes.
    threshold : float
        Iteration stops once the mean absolute change per node between two
        consecutive score vectors drops below this value.
    max_iterations : int, default 100
        Upper bound on the number of matrix-vector products.

    Returns
    -------
    tuple
        ``(r, i)`` where ``r`` is an ``(n, 1)`` array of scores and ``i`` is
        the zero-based index of the iteration at which convergence was
        detected. If the threshold is never reached, a ``RuntimeWarning`` is
        issued and ``(r, max_iterations)`` is returned with the last estimate,
        instead of implicitly returning ``None`` (which made callers that
        unpack the result crash).
    """
    import warnings

    # Transposed adjacency: entry [i, j] is non-zero when there is an edge j -> i,
    # so column j describes where node j's score flows.
    adj = nx.to_numpy_array(g, nodelist=g.nodes).T
    n_nodes = adj.shape[0]
    if n_nodes == 0:
        # Nothing to rank; avoids the division by zero below.
        return np.zeros((0, 1)), 0

    # Nodes without outgoing edges get a uniform column so that every column
    # can be normalised to sum to 1.
    dangling = np.sum(adj, axis=0) == 0
    adj[:, dangling] = 1 / n_nodes
    adj = adj / np.sum(adj, axis=0, keepdims=True)

    r = np.full((n_nodes, 1), 1 / n_nodes)
    # Mix link-following with the uniform jump; the scalar is broadcast to all entries.
    M = beta * adj + (1 - beta) * (1 / n_nodes)

    for i in range(max_iterations):
        prev_pg = r
        r = M @ r
        if (np.sum(np.abs(r - prev_pg)) / n_nodes) < threshold:
            return r, i

    warnings.warn(
        f'get_pagerank did not reach threshold {threshold} within '
        f'{max_iterations} iterations; returning the last estimate',
        RuntimeWarning,
    )
    return r, max_iterations

In [10]:
# Sweep beta from 0.1 to 0.9 and, for each value, report how many iterations
# the power method needed, how spread out the scores are, and whether the
# result agrees with networkx once both are rounded to iden_thresh decimals.
threshold = 1e-7
iden_thresh = 7
for beta in np.arange(0.1, 1, 0.1):
    pg, i = get_pagerank(g, beta, threshold)
    dev = np.std(pg)

    # nx.pagerank returns a mapping node -> score; its values are reshaped into
    # a column to match pg (assumes the mapping follows g.nodes order).
    nx_scores = nx.pagerank(g, alpha=beta, tol=threshold)
    nx_pg = np.array(list(nx_scores.values())).reshape((-1, 1))
    iden = (np.round(pg, iden_thresh) == np.round(nx_pg, iden_thresh)).all()

    print(f'Beta: {beta: .1f} - Iterations to Convergence: {i} - Standard Deviation: {dev: .6f} - Is Identical with nx.pagerank: {iden}')

Beta:  0.1 - Iterations to Convergence: 3 - Standard Deviation:  0.000188 - Is Identical with nx.pagerank: True
Beta:  0.2 - Iterations to Convergence: 4 - Standard Deviation:  0.000377 - Is Identical with nx.pagerank: True
Beta:  0.3 - Iterations to Convergence: 5 - Standard Deviation:  0.000568 - Is Identical with nx.pagerank: True
Beta:  0.4 - Iterations to Convergence: 6 - Standard Deviation:  0.000763 - Is Identical with nx.pagerank: True
Beta:  0.5 - Iterations to Convergence: 7 - Standard Deviation:  0.000962 - Is Identical with nx.pagerank: True
Beta:  0.6 - Iterations to Convergence: 8 - Standard Deviation:  0.001169 - Is Identical with nx.pagerank: True
Beta:  0.7 - Iterations to Convergence: 10 - Standard Deviation:  0.001384 - Is Identical with nx.pagerank: True
Beta:  0.8 - Iterations to Convergence: 12 - Standard Deviation:  0.001614 - Is Identical with nx.pagerank: True
Beta:  0.9 - Iterations to Convergence: 15 - Standard Deviation:  0.001864 - Is Identical with nx.page