# Data cleaning and link prediction between actors

## Necessary libraries

In [None]:
import pandas as pd
import networkx as nx
from tqdm import tqdm_notebook as tqdm
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pickle

## Link prediction

### Read in pickles from GitHub

In [None]:
# Fetch a fresh clone of the data repository and copy the pickled graphs and
# tables into the working directory, so later cells can open them by bare filename.
!rm -rf actors-collaboration # removing the folder so it wouldn't duplicate it
!git clone https://github.com/kkristjank/actors-collaboration.git
!cp -R actors-collaboration/graph_train.gpickle . # copying the graph_train.gpickle from the subfolder to main folder
!cp -R actors-collaboration/graph_test.gpickle . # copying the graph_test.gpickle from the subfolder to main folder
!cp -R actors-collaboration/df_train . # copying the df_train from the subfolder to main folder
!cp -R actors-collaboration/df_test . # copying the df_test from the subfolder to main folder

Cloning into 'actors-collaboration'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (79/79), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 79 (delta 26), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (79/79), done.


### DataFrame - removing actors who are connected only due to one movie

In [None]:
# Load the pickled "df_train" table; the cells below read its actor1/actor2 columns.
# NOTE(review): unpickling can execute arbitrary code, and this file comes from the
# repository cloned above — only run this against a source you trust.
df = pd.read_pickle("df_train")

In [None]:
# Count the rows of every (actor1, actor2) pair; the count is exposed as "NbrOfMovies".
df2 = (
    df.groupby(["actor1", "actor2"])
      .size()
      .reset_index(name="NbrOfMovies")
)

In [None]:
# Keep only the pairs that occur more than once.
df3 = df2.loc[df2["NbrOfMovies"] > 1]

In [None]:
# From here on `df` refers to the filtered pair table; the table loaded from
# "df_train" is no longer reachable through this name.
df = df3

In [None]:
# Drop the helper count column and label every remaining pair as an existing link.
df = df.drop(columns=["NbrOfMovies"]).assign(link=1)

In [None]:
# Undirected graph with one edge per remaining actor pair.
G = nx.from_pandas_edgelist(
    df,
    source="actor1",
    target="actor2",
    create_using=nx.Graph(),
)

### Node list and adjacency matrix

In [None]:
# sorted array of the distinct node labels (later passed as `nodelist` for the adjacency matrix)
node_list = np.unique(list(G.nodes))
node_list.size

3289

In [None]:
# build adjacency matrix; rows and columns follow the order of node_list
# (to_numpy_array replaces to_numpy_matrix, which is deprecated and removed in networkx 3.0;
#  the cells below only use .shape and [i, j] indexing, which behave the same on an ndarray)
adj_G = nx.to_numpy_array(G, nodelist = node_list)

### Unconnected node-pairs

In [None]:
# get unconnected node-pairs: pairs that are not adjacent but lie at most two hops apart
all_unconnected_pairs = []

# traverse the upper triangle of the adjacency matrix (j > i), so every pair is visited once
for i in tqdm(range(adj_G.shape[0])):
  # One breadth-first search per source node, bounded to two hops. This replaces a
  # shortest-path query per pair and makes the former bare `except: pass` unnecessary:
  # nodes that are unreachable within two hops are simply absent from the result.
  within_two_hops = nx.single_source_shortest_path_length(G, node_list[i], cutoff=2)
  for j in range(i + 1, adj_G.shape[1]):
    if node_list[j] in within_two_hops and adj_G[i,j] == 0:
      all_unconnected_pairs.append([node_list[i],node_list[j]])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=3289.0), HTML(value='')))




In [None]:
# split the collected pairs into their two endpoints
node_1_unlinked = [first for first, _ in all_unconnected_pairs]
node_2_unlinked = [second for _, second in all_unconnected_pairs]

# negative samples: every unconnected pair gets the target variable 'link' = 0
data = pd.DataFrame(
    {'actor1': node_1_unlinked, 'actor2': node_2_unlinked}
).assign(link=0)

### Omissible links

In [None]:
# Pick positive samples (target = 1): existing links that can be removed from the graph
# without splitting it into more components and without losing a node.
initial_node_count = len(G.nodes)
ncc = nx.number_connected_components(G)

df_temp = df.copy()

# row labels of the removable links
omissible_links_index = []

for i in tqdm(df.index.values):

  # drop the candidate row once and reuse the result for both the check and the update
  # (the original dropped it twice per iteration)
  df_candidate = df_temp.drop(index = i)
  G_temp = nx.from_pandas_edgelist(df_candidate, "actor1", "actor2", create_using=nx.Graph())

  # accept the removal only if the component count and the node count are unchanged
  if (nx.number_connected_components(G_temp) == ncc) and (len(G_temp.nodes) == initial_node_count):
    omissible_links_index.append(i)
    df_temp = df_candidate

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=3086.0), HTML(value='')))




In [None]:
# create dataframe of removable edges; copy so the column assignment below does not
# write into a selection of `df`
df_ghost = df.loc[omissible_links_index].copy()

# add the target variable 'link'
df_ghost['link'] = 1

# add the positive samples to the negative ones
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
data = pd.concat([data, df_ghost[['actor1', 'actor2', 'link']]], ignore_index=True)

In [None]:
# number of negative (0) and positive (1) samples
data['link'].value_counts()

0    2553
1     804
Name: link, dtype: int64

In [None]:
# edge list without the held-out (removable) links
df_partial = df.drop(index=df_ghost.index)

# graph built from the remaining edges only
G_data = nx.from_pandas_edgelist(
    df_partial,
    source="actor1",
    target="actor2",
    create_using=nx.Graph(),
)

### Node2Vec

In [None]:
# NOTE(review): the install is unpinned, so the node2vec version used is not recorded.
!pip install node2vec -q
from node2vec import Node2Vec

# Generate walks
# (on G_data, i.e. the graph without the held-out links)
node2vec = Node2Vec(G_data, dimensions=100, walk_length=16, num_walks=50)

# train node2vec model
# NOTE(review): no seed is passed in this cell, so the embeddings presumably differ
# between runs — confirm.
n2w_model = node2vec.fit(window=7, min_count=1)

  Building wheel for node2vec (setup.py) ... [?25l[?25hdone


HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=3289.0, style=Pr…

Generating walks (CPU: 1):   0%|          | 0/50 [00:00<?, ?it/s]




Generating walks (CPU: 1): 100%|██████████| 50/50 [01:35<00:00,  1.90s/it]


In [None]:
# Feature vector of a pair = element-wise sum of the embeddings of its two actors.
# Vectors are read through `.wv`: indexing the trained model directly is deprecated
# and no longer supported in gensim 4.
node_vectors = n2w_model.wv
x = [(node_vectors[str(i)]+node_vectors[str(j)]) for i,j in zip(data['actor1'], data['actor2'])]

  """Entry point for launching an IPython kernel.


### Train and test data splitting

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

# 70/30 split of the pair features and their 'link' labels.
# random_state is fixed (same seed as the MLP classifier further down) so the split,
# and therefore every score reported below, is reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(np.array(x), data['link'],
                                                test_size = 0.3,
                                                random_state = 1)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# class_weight="balanced" reweights the two classes
lr = LogisticRegression(class_weight="balanced")

lr.fit(X=xtrain, y=ytrain)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# ROC AUC of logistic regression on the held-out split (column 1 = probability of a link)
predictions = lr.predict_proba(X=xtest)
roc_auc_score(y_true=ytest, y_score=predictions[:, 1])

0.6967180409327169

In [None]:
# accuracy of logistic regression on the held-out split
y_pred = lr.predict(X=xtest)
accuracy_score(y_true=ytest, y_pred=y_pred)

0.6984126984126984

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed (same seed as the MLP classifier below) so the fitted forest
# and its scores are reproducible across runs.
rfc = RandomForestClassifier(random_state=1)

rfc.fit(xtrain, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# ROC AUC of the random forest on the held-out split (column 1 = probability of a link)
predictions = rfc.predict_proba(X=xtest)
roc_auc_score(y_true=ytest, y_score=predictions[:, 1])

0.795738081684551

In [None]:
# accuracy of the random forest on the held-out split
y_pred = rfc.predict(X=xtest)
accuracy_score(y_true=ytest, y_pred=y_pred)

0.816468253968254

### MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# seeded multi-layer perceptron, capped at 300 training iterations
mlpc = MLPClassifier(max_iter=300, random_state=1)

mlpc.fit(X=xtrain, y=ytrain)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [None]:
# ROC AUC of the MLP on the held-out split (column 1 = probability of a link)
predictions = mlpc.predict_proba(X=xtest)
roc_auc_score(y_true=ytest, y_score=predictions[:, 1])

0.7290056233274991

In [None]:
# accuracy of the MLP on the held-out split
y_pred = mlpc.predict(X=xtest)
accuracy_score(y_true=ytest, y_pred=y_pred)

0.8095238095238095

## Predicting 2019-2020

In [None]:
# Load the pickled "df_test" table; the cells below read its actor1/actor2 columns.
# NOTE(review): unpickling can execute arbitrary code, and this file comes from a
# cloned repository — only run this against a source you trust.
df_test = pd.read_pickle("df_test")

In [None]:
# Keep only the rows whose two actors both occur (in either column) in `df`.
# NOTE: `df` is reassigned a few cells further down, so this cell gives a different
# result if it is re-run after that point.
known_actors = set(df.actor1) | set(df.actor2)
df_test = df_test[df_test.actor1.isin(known_actors)]
df_test = df_test[df_test.actor2.isin(known_actors)]

In [None]:
# Count the rows of every (actor1, actor2) pair in the test table; the count is exposed as "NbrOfMovies".
df2 = (
    df_test.groupby(["actor1", "actor2"])
           .size()
           .reset_index(name="NbrOfMovies")
)

In [None]:
# Keep only the pairs that occur more than once.
df3 = df2.loc[df2["NbrOfMovies"] > 1]

In [None]:
# NOTE(review): this rebinds `df`, which until now held the filtered training pairs
# (and is what the df_test filter above reads), to the test-derived pair table.
df = df3

In [None]:
# drop the helper count column from the filtered pair table
# NOTE(review): `df` is not read again in the cells below — the graph and the positive
# samples are built from df_test instead — confirm whether the filter was meant to apply.
df = df.drop(columns=["NbrOfMovies"])
# Label every row of df_test as an existing link. assign() returns a new frame, so the
# column is not written into a boolean-filtered slice (avoids SettingWithCopyWarning).
df_test = df_test.assign(link=1)

In [None]:
# Undirected graph with one edge per actor pair in df_test (replaces the training graph in `G`).
G = nx.from_pandas_edgelist(
    df_test,
    source="actor1",
    target="actor2",
    create_using=nx.Graph(),
)

In [None]:
# sorted array of the distinct node labels (later passed as `nodelist` for the adjacency matrix)
node_list = np.unique(list(G.nodes))
node_list.size

756

In [None]:
# build adjacency matrix; rows and columns follow the order of node_list
# (to_numpy_array replaces to_numpy_matrix, which is deprecated and removed in networkx 3.0;
#  the cells below only use .shape and [i, j] indexing, which behave the same on an ndarray)
adj_G = nx.to_numpy_array(G, nodelist = node_list)

In [None]:
# get unconnected node-pairs: pairs that are not adjacent but lie at most two hops apart
all_unconnected_pairs = []

# traverse the upper triangle of the adjacency matrix (j > i), so every pair is visited once
for i in tqdm(range(adj_G.shape[0])):
  # One breadth-first search per source node, bounded to two hops. This replaces a
  # shortest-path query per pair and makes the former bare `except: pass` unnecessary:
  # nodes that are unreachable within two hops are simply absent from the result.
  within_two_hops = nx.single_source_shortest_path_length(G, node_list[i], cutoff=2)
  for j in range(i + 1, adj_G.shape[1]):
    if node_list[j] in within_two_hops and adj_G[i,j] == 0:
      all_unconnected_pairs.append([node_list[i],node_list[j]])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=756.0), HTML(value='')))




In [None]:
# number of unconnected pairs found (these become the negative samples)
len(all_unconnected_pairs)

1108

In [None]:
# split the collected pairs into their two endpoints
node_1_unlinked = [first for first, _ in all_unconnected_pairs]
node_2_unlinked = [second for _, second in all_unconnected_pairs]

# negative samples: every unconnected pair gets the target variable 'link' = 0
data = pd.DataFrame(
    {'actor1': node_1_unlinked, 'actor2': node_2_unlinked}
).assign(link=0)

In [None]:
# add the rows of df_test (link = 1) below the negative samples; the row labels of both
# frames are kept as they are
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
data = pd.concat([data, df_test])

In [None]:
# number of negative (0) and positive (1) samples
data['link'].value_counts()

0    1108
1     856
Name: link, dtype: int64

In [None]:
# Feature vector of a pair = element-wise sum of the embeddings of its two actors.
# Vectors are read through `.wv`: indexing the trained model directly is deprecated
# and no longer supported in gensim 4.
node_vectors = n2w_model.wv
x = [(node_vectors[str(i)]+node_vectors[str(j)]) for i,j in zip(data['actor1'], data['actor2'])]

  """Entry point for launching an IPython kernel.


### Predicting

In [None]:
# ROC AUC of logistic regression on the new pairs (column 1 = probability of a link)
predictions = lr.predict_proba(X=np.array(x))
roc_auc_score(y_true=data['link'], y_score=predictions[:, 1])

0.5666299048550896

In [None]:
# accuracy of logistic regression on the new pairs
y_pred = lr.predict(X=np.array(x))
accuracy_score(y_true=data['link'], y_pred=y_pred)

0.5509164969450102

In [None]:
# ROC AUC of the random forest on the new pairs (column 1 = probability of a link)
predictions = rfc.predict_proba(X=np.array(x))
roc_auc_score(y_true=data['link'], y_score=predictions[:, 1])

0.5955508367353824

In [None]:
# accuracy of the random forest on the new pairs
y_pred = rfc.predict(X=np.array(x))
accuracy_score(y_true=data['link'], y_pred=y_pred)

0.595213849287169

In [None]:
# ROC AUC of the MLP on the new pairs (column 1 = probability of a link)
predictions = mlpc.predict_proba(X=np.array(x))
roc_auc_score(y_true=data['link'], y_score=predictions[:, 1])

0.5532881085731638

In [None]:
# accuracy of the MLP on the new pairs
y_pred = mlpc.predict(X=np.array(x))
accuracy_score(y_true=data['link'], y_pred=y_pred)

0.5585539714867617

## Outcome

In [None]:
# Store the predicted labels next to the true ones.
# NOTE(review): `y_pred` is whatever the most recently executed prediction cell assigned;
# when the notebook is run top to bottom that is the MLP classifier's output.
data["y_pred"] = y_pred

In [None]:
# rows where the predicted label disagrees with the true label
mask = data["link"].ne(data["y_pred"])
data.loc[mask]

Unnamed: 0,actor1,actor2,link,y_pred
44,Ajay Devgn,Ileana D'Cruz,0,1
110,Ammy Virk,Karamjit Anmol,0,1
261,Bob Glazier,Eight The Chosen One,0,1
554,Eric Roberts,Michael Madsen,0,1
616,Gurpreet Ghuggi,Karamjit Anmol,0,1
...,...,...,...,...
51526,Emma Roberts,Luke Bracey,1,0
51659,Anil Kapoor,Aditya Roy Kapoor,1,0
51713,James Carolus,Tonjia Atomic,1,0
51733,Peter Dinklage,Rosamund Pike,1,0
