In [4]:
# Sources: 
# https://towardsdatascience.com/github-recommender-system-python-c8ff64dc83f4
# https://towardsdatascience.com/applied-network-analysis-using-python-25021633a702
# https://github.com/kurasaiteja/Github-Recommender-System/blob/master/Git.ipynb

!pip install networkx

[0m

In [5]:
import networkx as nx # https://pypi.org/project/networkx/ https://networkx.org/documentation/stable/tutorial.html
import project_example as pe
from collections import defaultdict


In [6]:
# Load the "active1000" dataset through the project helper module, then let
# the same module print its summary statistics for the loaded frame.
df = pe.load_data("active1000")
print("\nBasic statistics of the dataset...")
pe.statistics(df)


Basic statistics of the dataset...
Total number of events (front page incl.): 2207608
Total number of events (without front page): 788931
Total number of documents: 20344
Sparsity: 3.878%
Total number of events (drop duplicates): 679355
Sparsity (drop duplicates): 3.339%

Describe by user:
            counts
count  1000.000000
mean    679.355000
std     333.619737
min      59.000000
25%     506.750000
50%     639.500000
75%     797.500000
max    7958.000000


In [7]:
# Display the loaded event table (one row per event, including the userId and
# documentId columns used below).
df

Unnamed: 0,eventId,category,activeTime,title,url,userId,publishtime,time,documentId
532786,1788714434,,3.0,,http://adressa.no,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,,1483293370,
532788,1500434790,100sport|vintersport,17.0,Norges landslagssjef ville ha russisk leder ut...,http://adressa.no/100sport/langrenn_old/norges...,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,2017-01-01T17:01:11.000Z,1483293374,70a19fd7c9f6827feb3eb4f3df95121664491fa7
532791,208227622,,3.0,,http://adressa.no,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,,1483293392,
532794,1424389459,,3.0,Arsenal-spissens spektakulære scoring hylles: ...,http://adressa.no/100sport/fotball/arsenal-spi...,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,,1483293397,49b538a570b59c6fb564da7dfeace13ddd4f26f5
532796,1893920872,,5.0,,http://adressa.no,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,,1483293401,
...,...,...,...,...,...,...,...,...,...
868714,105848760,pluss|okonomi,157.0,Rundt 2000 potensielle krav hindret gjeldsløsn...,http://adressa.no/pluss/okonomi/2017/03/31/run...,cx:ztquyfd3pug92dd4jpqzsvxfw:dixlo02oz5eu,2017-03-31T20:59:31.000Z,1490992822,c83af059757a837dc472d44a0eb3be8a0ad33e59
868807,1240144606,,,,http://adressa.no,cx:ztquyfd3pug92dd4jpqzsvxfw:dixlo02oz5eu,,1490993114,
869922,922427576,,11.0,,http://adressa.no,cx:ztquyfd3pug92dd4jpqzsvxfw:dixlo02oz5eu,,1490995847,
869925,1862687892,nyheter|moreromsdal,,16-årig sørtrønder døde etter kollisjon i Rindal,http://adressa.no/nyheter/moreromsdal/2017/03/...,cx:ztquyfd3pug92dd4jpqzsvxfw:dixlo02oz5eu,2017-03-31T22:01:46.000Z,1490995859,4f864db2e88b48315e1b0507bc1ee2b158b956ab


In [8]:
# Distinct user ids and distinct article ids present in the event table
# (missing values are dropped before de-duplicating).
users = df["userId"].dropna().unique()
articles = df["documentId"].dropna().unique()

# (userId, documentId) pairs for every event where both ids are present.
reads = df[["userId", "documentId"]].dropna()

# Report the three sizes, one per line.
print(len(users), len(articles), len(reads), sep="\n")

1000
20344
788931


In [9]:
# Build an undirected graph with two kinds of nodes: users and articles.
G = nx.Graph()

# The 'bipartite' node attribute records which partition a node belongs to.
G.add_nodes_from(users, bipartite='Users')
G.add_nodes_from(articles, bipartite='Articles')

# One (userId, documentId) edge per read event. Pairing the two columns
# directly avoids DataFrame.iterrows(), which materialises a Series for every
# row and is very slow on large frames. Repeated reads of the same article by
# the same user collapse into one edge because nx.Graph keeps no parallel edges.
edges = list(zip(reads["userId"], reads["documentId"]))
G.add_edges_from(edges)

In [10]:
# Size of the graph: total node count, then total edge count.
print(G.number_of_nodes())
print(G.number_of_edges())

21344
679355


In [11]:
def shared_partition_nodes(G, node1, node2):
    """Return the set of nodes adjacent to both ``node1`` and ``node2``.

    Parameters:
        G: graph whose nodes carry a 'bipartite' attribute.
        node1, node2: two nodes of ``G`` that must lie in the same partition.

    Returns:
        set: the neighbors the two nodes have in common.

    Raises:
        AssertionError: if the 'bipartite' attributes of the two nodes differ.
    """
    node_attrs = G.nodes
    # Both nodes must come from the same side of the graph
    assert node_attrs[node1]['bipartite'] == node_attrs[node2]['bipartite']

    # Common neighbors = intersection of the two neighborhoods
    return set(G.neighbors(node1)) & set(G.neighbors(node2))

# Print the number of articles that both of the two given users are connected to
print(len(shared_partition_nodes(G, 'cx:i9i5zdr4ns9bm4ky:2by1rj0hmjgy', 'cx:iimz2wwcwxu7d721:2r8odp9zhg5yp')))

280


In [12]:
# Define get_nodes_from_partition()
def get_nodes_from_partition(G, partition):
    """List the nodes of ``G`` whose 'bipartite' attribute equals ``partition``.

    Parameters:
        G: graph whose nodes all carry a 'bipartite' attribute.
        partition: partition label to select (e.g. 'Users' or 'Articles').

    Returns:
        list: matching nodes, in the graph's node iteration order.
    """
    return [
        node
        for node in G.nodes()
        if G.nodes[node]['bipartite'] == partition
    ]

# Print the number of nodes in the 'Users' partition, then in the 'Articles' partition
print(len(get_nodes_from_partition(G, 'Users')))
print(len(get_nodes_from_partition(G, 'Articles')))

1000
20344


In [13]:
def user_similarity(G, user1, user2, proj_nodes):
    """Score how similar two users are by the share of nodes they have in common.

    Parameters:
        G: graph whose nodes carry a 'bipartite' attribute.
        user1, user2: nodes that must both belong to the 'Users' partition.
        proj_nodes: collection whose length is used as the denominator
            (the callers in this notebook pass the 'Articles' partition).

    Returns:
        float: number of neighbors shared by the two users divided by
        ``len(proj_nodes)``.

    Raises:
        AssertionError: if either node is not in the 'Users' partition.
    """
    # Both arguments have to be user nodes
    for candidate in (user1, user2):
        assert G.nodes[candidate]['bipartite'] == 'Users'

    # Neighbors the two users have in common
    common = shared_partition_nodes(G, user1, user2)

    # Normalise by the size of the supplied node collection
    return len(common) / len(proj_nodes)

# Compute the similarity score between two specific users, normalised by the
# number of nodes in the 'Articles' partition
project_nodes = get_nodes_from_partition(G, 'Articles')
similarity_score = user_similarity(G, 'cx:i9i5zdr4ns9bm4ky:2by1rj0hmjgy', 'cx:ib1vo01vq38f2mqc:20lut6o1pv35i', project_nodes)

print(similarity_score)

0.032294534014942984


In [14]:
def most_similar_users(G, user, user_nodes, proj_nodes):
    """Return the users whose similarity to ``user`` is highest.

    Parameters:
        G: graph whose nodes carry a 'bipartite' attribute.
        user: node in the 'Users' partition to compare everyone else against.
        user_nodes: iterable of candidate user nodes; ``user`` itself and any
            duplicate entries are ignored.
        proj_nodes: collection passed through to ``user_similarity`` as the
            normalisation base.

    Returns:
        list: every candidate that attains the maximum similarity score, in
        the order the candidates appear in ``user_nodes``. The list is empty
        when there is no candidate other than ``user``.

    Raises:
        AssertionError: if ``user`` is not in the 'Users' partition.
    """
    # Data checks
    assert G.nodes[user]['bipartite'] == 'Users'

    # All other users, de-duplicated while keeping the caller's order so that
    # ties come back in a reproducible order. Filtering (instead of
    # set.remove) also tolerates ``user`` being absent from ``user_nodes``.
    candidates = [n for n in dict.fromkeys(user_nodes) if n != user]

    # Track the best score seen so far together with every user attaining it.
    best_score = None
    best_users = []
    for n in candidates:
        similarity = user_similarity(G, user, n, proj_nodes)
        if best_score is None or similarity > best_score:
            best_score = similarity
            best_users = [n]
        elif similarity == best_score:
            best_users.append(n)

    # With no candidates this is an empty list rather than a ValueError
    # from calling max() on an empty collection.
    return best_users

# Collect both partitions of the graph once
user_nodes = get_nodes_from_partition(G, 'Users')
project_nodes = get_nodes_from_partition(G, 'Articles')

# Print the user(s) with the highest similarity score to the given user
print(most_similar_users(G, 'cx:i9i5zdr4ns9bm4ky:2by1rj0hmjgy', user_nodes, project_nodes))

['cx:ib1vo01vq38f2mqc:20lut6o1pv35i']


In [15]:
def recommend_repositories(G, from_user, to_user):
    """Return the neighbors of ``from_user`` that ``to_user`` is not connected to.

    In this notebook the neighbors of a user node are article nodes, so the
    result is the set of articles linked to ``from_user`` but not to
    ``to_user``.

    Parameters:
        G: graph containing both nodes.
        from_user: node whose neighbors are the recommendation candidates.
        to_user: node whose existing neighbors are excluded from the result.

    Returns:
        set: neighbors of ``from_user`` minus neighbors of ``to_user``.
    """
    candidates = set(G.neighbors(from_user))
    already_connected = set(G.neighbors(to_user))
    return candidates - already_connected

# Print the articles connected to the first user that the second user is not connected to
print(recommend_repositories(G, 'cx:i9i5zdr4ns9bm4ky:2by1rj0hmjgy', 'cx:ib1vo01vq38f2mqc:20lut6o1pv35i'))

{'b085b71659a0b691fe8cf7d25a267629a1d3e327', 'caa00adac6fb490860c069e14a044af3bb3f8b76', '053e888fe61b192854e848f8b285114f53e08f9d', '0cbfb4d0936054bddd5b1b2177e93a9dce5bd6de', '134aca17d8e7954f5bc95779e6c865c731700ada', '073ceafc7acf0a3b7479ba257af44b62b6fbf5a5', 'f09ecebee85eee7fd5a3f14a9c4dcb796341f8ff', '0a63ea3fa0e92d8a1a294acd8887a3c99fe48826', '182bdde37d54dbdb6deb4e1472e96aeac604bc66', '91efa7b44782b7fe140e8ffc71676bfa29f1acc2', '3f2757f24f429c7b70325c5749339558dd644a1c', 'bbb92ab6e050e5b71fa213f2271e51f8dc54dd6b', '6eccd6eec73be9007498733bc83716f67cb7198f', 'c47f63e7e6d046709c8222e5baebec3ba19e1ae8', '7ded306a0177c2508799e7a11b89757670be41f6', '223eb071b72e3585093bfd6f4935f901a4d8e052', '675955e11136d4f1bc80bb10b0d5d7fd8481c2b7', '4f17aa740c5795c6cb59059e37bb21b15f914e26', '942346502d9cfa85caf4be3fd94af79c20ff4765', '2c62bb35352f115ef9bca1aed891e476f5e7731f', '7b6f08de1e036fe825fa4f9b11bfa40bc0d54245', '5f01fff720fa19c67de886350dbb5322d1c18cc1', '4d9116666daff22522ea536aeb0971