In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations
import community
import seaborn as sns
import random 
import numpy as np
import cython
from fa2 import ForceAtlas2

%matplotlib inline

In [2]:
# Load the per-person table: tab-separated, the literal text \N is read as a
# missing value, and the `nconst` column becomes the index.
name_basics = pd.read_csv(
    "name.basics.tsv.gz",
    sep="\t",
    na_values=["\\N"],
    index_col="nconst",
)

In [3]:
# Load the per-title table: tab-separated, the literal text \N is read as a
# missing value, and the `tconst` column becomes the index.
# NOTE(review): the truncated output recorded under this cell looks like the
# tail of a warning raised during the read (possibly a pandas DtypeWarning) --
# confirm, and consider passing an explicit `dtype=` if so.
title_basics = pd.read_csv(
    "title.basics.tsv.gz",
    sep="\t",
    na_values=["\\N"],
    index_col="tconst",
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first five rows of the people table.
name_basics.head(n=5)

Unnamed: 0_level_0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0072308,tt0050419,tt0043044,tt0053137"
nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0117057,tt0071877,tt0038355"
nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,producer","tt0059956,tt0057345,tt0054452,tt0049189"
nm0000004,John Belushi,1949.0,1982.0,"actor,writer,soundtrack","tt0080455,tt0078723,tt0077975,tt0072562"
nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050976,tt0083922,tt0060827,tt0050986"


In [5]:
# Preview the first five rows of the titles table.
title_basics.head(n=5)

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0000001,short,Carmencita,Carmencita,0,1894.0,,1.0,"Documentary,Short"
tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5.0,"Animation,Short"
tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892.0,,4.0,"Animation,Comedy,Romance"
tt0000004,short,Un bon bock,Un bon bock,0,1892.0,,,"Animation,Short"
tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1.0,Short


In [6]:
# Build an undirected graph of titles:
#   * one node per row of `title_basics`, carrying the title and a list of genres;
#   * one weighted edge per pair of titles listed together in a person's
#     `knownForTitles`; the weight counts how many people share that pair.
G_titles = nx.Graph()

# Iterate column-wise instead of with iterrows(), which builds a Series per row.
for tconst, title, genres in zip(
    title_basics.index, title_basics["primaryTitle"], title_basics["genres"]
):
    # `genres` is a comma-separated string, or a non-string (NaN) when the value
    # is missing. Test for that explicitly rather than catching every exception,
    # so real errors and Ctrl-C in this long loop are not silently swallowed.
    genre_list = genres.split(",") if isinstance(genres, str) else []
    G_titles.add_node(tconst, title=title, genres=genre_list)

# NOTE(review): add_edge creates any endpoint that is not already a node, so a
# title id that is absent from `title_basics` ends up as a node without
# `title` / `genres` attributes -- confirm whether such ids should be skipped.
for known_for in name_basics["knownForTitles"]:
    if not isinstance(known_for, str):
        continue  # missing value: this person contributes no edges
    for u, v in combinations(known_for.split(","), 2):
        if G_titles.has_edge(u, v):
            G_titles[u][v]["weight"] += 1
        else:
            G_titles.add_edge(u, v, weight=1)

In [7]:
# Split the title graph into communities. The plotting cell reads the result
# as a mapping from node to community id.
partition = community.best_partition(graph=G_titles)

In [8]:
#positioning
# Positioning: compute a ForceAtlas2 layout for the title graph.
#
# The starting coordinates are random. Draw them from a dedicated, seeded
# generator so the layout can be reproduced on a re-run without touching the
# global `random` state. The recorded run of 10 iterations took about 4 hours,
# so reproducibility matters here.
LAYOUT_SEED = 42
layout_rng = random.Random(LAYOUT_SEED)
init_pos = {
    node: (layout_rng.random(), layout_rng.random())
    for node in G_titles.nodes()
}
forceatlas2 = ForceAtlas2(
                          # Behavior alternatives
                          outboundAttractionDistribution=False,  # Dissuade hubs
                          linLogMode=False,  # NOT IMPLEMENTED
                          adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                          edgeWeightInfluence=1.0,

                          # Performance
                          jitterTolerance=1.0,  # Tolerance
                          barnesHutOptimize=True,
                          barnesHutTheta=1.2,
                          multiThreaded=False,  # NOT IMPLEMENTED

                          # Tuning
                          scalingRatio=2.0,
                          strongGravityMode=False,
                          gravity=1.0,

                          # Log
                          verbose=True)
# `pos` is consumed by the save cell and by the drawing cell.
pos = forceatlas2.forceatlas2_networkx_layout(G_titles, init_pos, iterations=10)

100%|██████████| 10/10 [4:11:37<00:00, 1509.71s/it] 


BarnesHut Approximation  took  394.45  seconds
Repulsion forces  took  14658.56  seconds
Gravitational forces  took  4.12  seconds
Attraction forces  took  10.54  seconds
AdjustSpeedAndApplyForces step  took  18.07  seconds


In [9]:
# Persist the computed layout as the text form of the `pos` dict, so it can be
# reloaded later without re-running the expensive layout cell.
with open('layout', mode='w') as layout_file:
    layout_text = str(pos)
    layout_file.write(layout_text)

In [None]:
from ast import literal_eval

# Reload the layout written by the save cell.
# The file must be opened for reading: mode 'w' would truncate the saved
# layout and make f.read() raise io.UnsupportedOperation.
# literal_eval only parses Python literals, so the file contents are never
# executed as code.
# NOTE(review): this assumes the saved text contains only plain literals
# (string keys, tuples of finite floats) -- confirm for the layout values.
with open('layout', 'r') as f:
    pos = literal_eval(f.read())

In [None]:
# Drawing: colour the nodes by community, draw the edges, save the figure as a
# PDF and show it.
plt.figure(figsize=(20,20))

communities = set(partition.values())
size = len(communities)
cmap = sns.color_palette(n_colors=size)

# Group the nodes by community in a single pass over the partition, instead of
# rescanning the whole partition once per community. Node order inside each
# group is the partition's own order.
members = {}
for node, com in partition.items():
    members.setdefault(com, []).append(node)

for i, com in enumerate(communities):
    list_nodes = members[com]
    node_size = 5
    #node_size = [15 if 'Romance' in G_titles.nodes[n]['genres'] else 5 for n in list_nodes]
    # Pass the colour as a one-row list: a bare RGB tuple is ambiguous to
    # matplotlib's scatter when the community has exactly three nodes (it would
    # be read as three scalar values to colormap rather than one colour).
    nx.draw_networkx_nodes(
        G_titles,
        pos,
        nodelist=list_nodes,
        node_size=node_size,
        node_color=[cmap[i]],
    )

nx.draw_networkx_edges(G_titles, pos, alpha=0.5)
plt.savefig("G_titles.pdf", format = "pdf")
plt.show()