# Linear Regression Link Prediction

> Kaggle Score = 0.76014

## 1. Imports

In [1]:
import math
import os

import networkx as nx
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

## 2. Feature Generation

In [2]:
def loadTrainDataAsUndirectedGraph(filename="train.txt"):
    """Build an undirected co-authorship graph from a whitespace-separated file.

    Every line of the file is split on whitespace into author ids, and each
    unordered pair of ids taken from the same line becomes an edge. The edge
    attribute ``frequency`` counts how many times that pair has been seen.

    Args:
        filename: Path of the file to read. Defaults to ``"train.txt"``.

    Returns:
        networkx.Graph whose nodes are the author ids (kept as strings) and
        whose edges carry an integer ``frequency`` attribute.
    """
    g = nx.Graph()
    # The context manager closes the file handle even if parsing fails;
    # iterating the handle streams the lines instead of holding them all.
    with open(filename) as trainFile:
        for row in trainFile:
            # split() with no argument also discards the trailing newline.
            authorIds = row.split()
            for i, author in enumerate(authorIds):
                for coauthor in authorIds[i+1:]:
                    if g.has_edge(author, coauthor):
                        g[author][coauthor]['frequency'] += 1
                    else:
                        g.add_edge(author, coauthor, frequency=1)
    return g

In [3]:
def createBalancedData(graph, random_state=0):
    """Create a class-balanced table of author pairs for training.

    Positive rows (``Coauthor`` = 1) are pairs joined by an edge whose
    ``frequency`` is greater than 1. Negative rows (``Coauthor`` = 0) are
    pairs with no edge where both authors have exactly one neighbour and
    those two neighbours share no common neighbour. The negatives are then
    resampled to the number of positives and everything is shuffled.

    Args:
        graph: Graph whose edges carry a ``frequency`` attribute.
        random_state: Seed passed to both the resampling and the shuffling
            step. Defaults to 0.

    Returns:
        pandas.DataFrame with the columns ``Source``, ``Sink``, ``Freq`` and
        ``Coauthor``.
    """
    allAuthors = list(graph.nodes)
    trueInstances = []
    falseInstances = []
    print('Authors to iterate:',len(allAuthors) )

    # Get true instances and false instances
    for i, author1 in enumerate(allAuthors):
        if(i%100==0): print('.', end='') # Status indicator
        # The graph is not modified in this loop, so author1's neighbour
        # list is computed once here rather than once per candidate pair.
        author1Neighbors = list(nx.all_neighbors(graph, author1))
        for author2 in allAuthors[i+1:]:
            if graph.has_edge(author1, author2):
                freq = graph[author1][author2]['frequency']
                if freq > 1:
                    # True instances
                    trueInstances.append({
                        "Source": author1,
                        "Sink": author2,
                        "Freq": freq,
                        "Coauthor": 1 })
            elif len(author1Neighbors) == 1:
                # False instances: both authors have a single neighbour and
                # those two neighbours have no neighbour in common.
                author2Neighbors = list(nx.all_neighbors(graph, author2))
                if len(author2Neighbors) == 1:
                    common = len(list(nx.common_neighbors(graph, author1Neighbors[0], author2Neighbors[0])))
                    if common == 0:
                        falseInstances.append({
                            "Source": author1,
                            "Sink": author2,
                            "Freq": 0,
                            "Coauthor": 0 })
    print('')

    # Resample false instances to the number of true instances.
    # NOTE(review): resample is called without `replace`, so the library
    # default applies; confirm that is the intended sampling mode.
    numTrue = len(trueInstances)
    falseInstances = sklearn.utils.resample(
                        falseInstances,
                        n_samples=numTrue,
                        random_state=random_state)

    print('Generated',numTrue,'instances where coauthor=True')
    print('Generated',len(falseInstances),'instances where coauthor=False')

    # Shuffle training data
    data = sklearn.utils.shuffle(trueInstances+falseInstances, random_state=random_state)
    return pd.DataFrame(data=data)

In [4]:
def shortestDistance(graph, n1, n2, unreachable=100):
    """Return the number of hops on the shortest path between two nodes.

    Edge attributes are ignored: every edge counts as one hop.

    Args:
        graph: networkx graph to search.
        n1: Source node.
        n2: Target node.
        unreachable: Value returned when no path exists or when either node
            is not in the graph. Defaults to 100.

    Returns:
        The hop count as an int, or ``unreachable``.
    """
    try:
        return nx.shortest_path_length(graph, source=n1, target=n2)
    # Only the "no answer" cases are mapped to the sentinel; anything else
    # (including KeyboardInterrupt during a long run) propagates.
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return unreachable

In [5]:
def commonNeighbours(graph, n1, n2):
    """Count the neighbours that two nodes share.

    Args:
        graph: networkx graph to inspect.
        n1: First node.
        n2: Second node.

    Returns:
        Number of common neighbours as an int, or 0 when the lookup fails
        (for example because a node is not in the graph).
    """
    try:
        return len(list(nx.common_neighbors(graph, n1, n2)))
    # Narrowed from a bare except so that interrupts and genuine bugs are
    # no longer silently converted into a score of 0.
    except (KeyError, nx.NetworkXException):
        return 0

In [6]:
def jaccard(graph, n1, n2):
    """Return the Jaccard coefficient of the node pair (n1, n2).

    Args:
        graph: networkx graph to score against.
        n1: First node.
        n2: Second node.

    Returns:
        The coefficient computed by ``nx.jaccard_coefficient``, or 0 when the
        lookup fails (for example because a node is not in the graph).
    """
    try:
        # Score against the graph that was passed in, not the notebook-level
        # global; the generator yields (u, v, score) tuples.
        return next(iter(nx.jaccard_coefficient(graph, [(n1, n2)])))[2]
    # Narrowed from a bare except so that interrupts and genuine bugs are
    # no longer silently converted into a score of 0.
    except (KeyError, nx.NetworkXException):
        return 0

In [7]:
def adamicAdar(graph, n1, n2):
    """Return the Adamic-Adar index of the node pair (n1, n2).

    Args:
        graph: networkx graph to score against.
        n1: First node.
        n2: Second node.

    Returns:
        The index computed by ``nx.adamic_adar_index``, or 0 when it cannot
        be computed (for example because a node is not in the graph).
    """
    try:
        # Score against the graph that was passed in, not the notebook-level
        # global; the generator yields (u, v, score) tuples.
        return next(iter(nx.adamic_adar_index(graph, [(n1, n2)])))[2]
    # ZeroDivisionError is kept in the list because the index divides by the
    # log of a neighbour's degree; other errors now propagate.
    except (KeyError, ZeroDivisionError, nx.NetworkXException):
        return 0

In [8]:
def preferentialAttachment(graph, n1, n2):
    """Return the preferential-attachment score of the node pair (n1, n2).

    Args:
        graph: networkx graph to score against.
        n1: First node.
        n2: Second node.

    Returns:
        The score computed by ``nx.preferential_attachment``, or 0 when the
        lookup fails (for example because a node is not in the graph).
    """
    try:
        # Score against the graph that was passed in, not the notebook-level
        # global; the generator yields (u, v, score) tuples.
        return next(iter(nx.preferential_attachment(graph, [(n1, n2)])))[2]
    # Narrowed from a bare except so that interrupts and genuine bugs are
    # no longer silently converted into a score of 0.
    except (KeyError, nx.NetworkXException):
        return 0

In [9]:
def resourceAllocation(graph, n1, n2):
    """Return the resource-allocation index of the node pair (n1, n2).

    Args:
        graph: networkx graph to score against.
        n1: First node.
        n2: Second node.

    Returns:
        The index computed by ``nx.resource_allocation_index``, or 0 when it
        cannot be computed (for example because a node is not in the graph).
    """
    try:
        # Score against the graph that was passed in, not the notebook-level
        # global; the generator yields (u, v, score) tuples.
        return next(iter(nx.resource_allocation_index(graph, [(n1, n2)])))[2]
    # Narrowed from a bare except so that interrupts and genuine bugs are
    # no longer silently converted into a score of 0.
    except (KeyError, ZeroDivisionError, nx.NetworkXException):
        return 0

In [10]:
# Build the co-authorship graph and the balanced table of author pairs.
g = loadTrainDataAsUndirectedGraph()
dt = createBalancedData(g)

# One column per pairwise graph feature, computed row by row from
# (Source, Sink). The order below is the order the columns are added in.
for _column, _feature in [
        ('Dist', shortestDistance),
        ('CommonNeighbours', commonNeighbours),
        ('Jaccard', jaccard),
        ('AdamicAdar', adamicAdar),
        ('PreferentialAttachment', preferentialAttachment),
        ('ResourceAllocation', resourceAllocation)]:
    dt[_column] = dt.apply(lambda l: _feature(g, l.Source, l.Sink), axis=1)
    print('Added "{}" column'.format(_column))

Authors to iterate: 3767
......................................
Generated 4813 instances where coauthor=True
Generated 4813 instances where coauthor=False
Added "Dist" column
Added "CommonNeighbours" column
Added "Jaccard" column
Added "AdamicAdar" column
Added "PreferentialAttachment" column
Added "ResourceAllocation" column


In [11]:
# Preview the first rows of the generated training table.
dt.head()

Unnamed: 0,Source,Sink,Freq,Coauthor,Dist,CommonNeighbours,Jaccard,AdamicAdar,PreferentialAttachment,ResourceAllocation
0,1904,2253,0,0,6,0,0.0,0.0,1,0.0
1,1497,2589,0,0,100,0,0.0,0.0,1,0.0
2,3785,114,6,1,1,2,0.028169,1.111219,210,0.326923
3,1674,1718,0,0,100,0,0.0,0.0,1,0.0
4,2825,2141,12,1,1,17,0.68,5.550508,440,0.846625


## 3. Training + Cross Validation

In [12]:
# Model inputs are the graph-derived columns only. 'Freq' is left out: it is
# 0 for every negative row and greater than 1 for every positive row, so it
# would give the label away.
featureColumns = [
    'Dist',
    'CommonNeighbours',
    'Jaccard',
    'AdamicAdar',
    'PreferentialAttachment',
    'ResourceAllocation',
]
trainX = dt[featureColumns]
trainY = dt['Coauthor']

In [13]:
# 5-fold cross validation of an ordinary least-squares model on the 0/1 label.
# No `scoring` is given, so each value is the estimator's own default score
# (R^2 for LinearRegression per the scikit-learn docs), not an accuracy/AUC.
clf = LinearRegression()
scores = cross_val_score(estimator=clf, X=trainX, y=trainY, cv=5)
scores

array([0.63413772, 0.64163563, 0.65436054, 0.63722995, 0.64275272])

In [14]:
# Fit on the full training table and show the learned parameters,
# one coefficient per feature column.
clf.fit(trainX,trainY)
print('intercept = ', clf.intercept_)
trainedModel = pd.DataFrame({'Coefficients': clf.coef_}, index=trainX.columns)
trainedModel

intercept =  0.46915255874776407


Unnamed: 0,Coefficients
Dist,-0.004852
CommonNeighbours,-0.094472
Jaccard,0.839887
AdamicAdar,0.373969
PreferentialAttachment,9.6e-05
ResourceAllocation,-0.323644


## 4. Generate Output

In [15]:
# Source/Sink are read as strings so they match the node ids in the graph,
# which are kept as the raw strings from the training file.
test = pd.read_csv('test-public.csv', converters = {'Source': str, 'Sink': str})

# Same features, in the same order, as were computed for the training table.
for _column, _feature in [
        ('Dist', shortestDistance),
        ('CommonNeighbours', commonNeighbours),
        ('Jaccard', jaccard),
        ('AdamicAdar', adamicAdar),
        ('PreferentialAttachment', preferentialAttachment),
        ('ResourceAllocation', resourceAllocation)]:
    test[_column] = test.apply(lambda l: _feature(g, l.Source, l.Sink), axis=1)
    print('Added "{}" column'.format(_column))
test.head()

Added "Dist" column
Added "CommonNeighbours" column
Added "Jaccard" column
Added "AdamicAdar" column
Added "PreferentialAttachment" column
Added "ResourceAllocation" column


Unnamed: 0,Id,Source,Sink,Dist,CommonNeighbours,Jaccard,AdamicAdar,PreferentialAttachment,ResourceAllocation
0,1,0,2917,3,0,0.0,0.0,56,0.0
1,2,0,2956,5,0,0.0,0.0,24,0.0
2,3,1,4038,3,0,0.0,0.0,496,0.0
3,4,2,1848,1,2,0.08,1.24267,72,0.4
4,5,3,513,4,0,0.0,0.0,391,0.0


In [16]:
# Same feature columns, in the same order, as the model was trained on.
testFeatureColumns = [
    'Dist',
    'CommonNeighbours',
    'Jaccard',
    'AdamicAdar',
    'PreferentialAttachment',
    'ResourceAllocation',
]
testX = test[testFeatureColumns]

In [17]:
def truncateZeroAndOne(n):
    """Clamp a number to the closed interval [0, 1].

    Values below 0 become 0, values above 1 become 1, and anything else is
    returned unchanged.
    """
    if n < 0:
        return 0
    if n > 1:
        return 1
    return n

# Linear regression output is unbounded, so clamp it to [0, 1].
# Series.clip does this in one vectorized step instead of calling a Python
# function once per row through DataFrame.apply.
test['Predicted'] = clf.predict(testX)
test['Predicted'] = test['Predicted'].clip(lower=0, upper=1)

In [18]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs('results', exist_ok=True)
test[['Id', 'Predicted']].to_csv('results/LinearRegression.csv', index=False)