Loading the cast file and building the actor co-appearance graph.

In [3]:
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout
import operator

# Path of the cast list that is opened and read when the graph is built.
inputFile = "./casts.csv"
# Field separator used to split every line of the input file.
delimitor = ";"

Building the graph

In [4]:
# Build an undirected graph whose nodes are actor names; two actors are
# connected when they appear in consecutive rows that share a movie title.
lines = None
with open(inputFile, 'r') as f:
    lines = f.readlines()

actors = []   # actors already seen for the movie currently being read
movie = ""    # title of the movie currently being read
G = nx.Graph()

for line in lines:
    fields = line.split(delimitor)
    if len(fields) < 3:
        # Blank or truncated row: there is no title/actor column to read.
        continue

    # [1:-1] drops the first and last character of the field
    # (presumably the surrounding quotes -- confirm against the file).
    title = fields[1][1:-1]
    name = fields[2][1:-1]

    if name == "s a":  # some dummy names
        continue

    G.add_node(name)

    if movie != title:
        # A new movie starts: forget the cast of the previous one.
        actors = []
        movie = title

    for actor in actors:
        # An actor listed twice for one movie must not be linked to
        # themselves; a self-loop would distort degree and density.
        if actor != name:
            G.add_edge(name, actor)

    actors.append(name)

46233


Statistics

In [5]:
def _ranked(scores):
    """Return the (node, score) pairs of ``scores`` ordered from highest to lowest score."""
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)


# Global size figures of the graph.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
graph_density = nx.density(G)
print("Number of nodes: " + str(node_count))
print("Number of edges: " + str(edge_count))
print("density: " + str(graph_density))

print("Centralities:")

# Each centrality is computed once and kept as a descending ranking;
# a progress line is printed after every step.
deg = _ranked(nx.degree_centrality(G))
print("Deg done")
clo = _ranked(nx.closeness_centrality(G))
print("Clo done")
bet = _ranked(nx.betweenness_centrality(G))
print("Bet done")
eig = _ranked(nx.eigenvector_centrality(G, max_iter=1000))
print("Eig done")

# Show the ten best-ranked nodes for every measure.
rankings = (
    ("Degree: ", deg),
    ("\nCloseness: ", clo),
    ("\nBetweenness: ", bet),
    ("\nEigenvector: ", eig),
)
for heading, ranking in rankings:
    print(heading)
    for elem in ranking[:10]:
        print(elem)
print()

Number of nodes: 16615
Number of edges: 132233
density: 0.0009580657061085827
Centralities:
Deg done
Clo done
Bet done
Eig done
Degree: 
('Humphrey Bogart', 0.022511135187191524)
('John Carradine', 0.01932105453232214)
('James Stewart', 0.019080293728181052)
('Peter Lorre', 0.018358011315757795)
('Henry Fonda', 0.017996870109546165)
('Gary Cooper', 0.017816299506440352)
('John Gielgud', 0.017214397496087636)
('Cary Grant', 0.01691344649091128)
('Burt Lancaster', 0.01649211508366438)
('David Niven', 0.016251354279523293)

Closeness: 
('Burt Lancaster', 0.33066727073544955)
('John Gielgud', 0.3287796869323312)
('Henry Fonda', 0.32708078442643923)
('Charlton Heston', 0.32621293052348105)
('John Carradine', 0.32594682432951877)
('David Niven', 0.32448273124726207)
('Roddy McDowall', 0.3221283859823738)
('Paul Newman', 0.32100426854786007)
('Vincent Price', 0.3209720359674255)
('Robert Mitchum', 0.3207385432032701)

Betweenness: 
('Vincent Price', 0.013116264986132737)
('Burt Lancaster', 0.

In [44]:
# Enumerate the maximal cliques once and hand them to the k-clique
# community search (k = 3) so it can reuse them.
precomputed_cliques = list(nx.find_cliques(G))
communities = list(
    nx.k_clique_communities(G, k=3, cliques=precomputed_cliques)
)
print("done")

done


Communities

In [113]:
print("Communities:")

# i     -- number of non-empty communities seen so far
# best  -- size of the largest community seen so far
# bestC -- 1-based position (among non-empty communities) of that largest one
i = 0
best = 0

for community in communities:
    size = len(community)
    if size == 0:
        continue
    i += 1
    if size > best:
        best, bestC = size, i

print("Number of communities ", i)
print("Largest community size: ", best)

Communities:
Number of communities  1529
Largest community size:  10968


Components

In [112]:
print("Components:")
# Materialise the generator so the components can be counted and searched.
components = list(nx.connected_components(G))

i = len(components)

# Select the component with the most nodes. The previous version kept the
# last component with more than 100 nodes, which is not necessarily the
# largest and left `Component` undefined when no component was that big.
largest_nodes = max(components, key=len, default=set())
Component = G.subgraph(largest_nodes)

print("\nNumber of components", i)

print("Number of edges in max component: " + str(Component.number_of_edges()))
print("Density of max component: " +str(nx.density(Component)))



Components:

Number of components 844
Number of edges in max component: 129145
Density of max component: 0.0012157958852447596


Bacon number

In [110]:
# Wildcard import retained: cells that call networkx functions without the
# `nx.` prefix depend on it.
from networkx import *

BACON_SOURCE = 'Vincent Price'   # actor every distance is measured from
UNREACHABLE = 1000000            # sentinel for actors with no path to the source

# A single breadth-first search from the source yields the distance to every
# reachable actor, replacing one shortest-path query per node.
# dict() accepts both a mapping and an iterable of (node, length) pairs.
if BACON_SOURCE in G:
    distances = dict(nx.single_source_shortest_path_length(G, BACON_SOURCE))
else:
    distances = {}

# Distance of every actor in the whole graph; unreachable actors get the sentinel.
bacon_number = {a: distances.get(a, UNREACHABLE) for a in G}

# Same measure restricted to `Component`; iterating over Component keeps the
# node order, so ties in the sorted listings below appear in the same order.
distancesC = dict(nx.single_source_shortest_path_length(Component, BACON_SOURCE))
bacon_numberC = {a: distancesC[a] for a in Component}

# Ten farthest actors, ten closest actors, and the mean distance.
print(sorted(bacon_numberC.items(), key=operator.itemgetter(1), reverse=True)[:10])
print(sorted(bacon_numberC.items(), key=operator.itemgetter(1), reverse=False)[:10])
avgNum = sum(bacon_numberC.values())/len(bacon_numberC)
print(avgNum)


[('Elisa Touati', 6), ('Alex VanWarmerdam', 6), ('Elsa Zylberstein', 6), ('Lam ChiCheung', 6), ('Carlos Lopez', 6), ('Maria deMederios', 6), ('Paredes', 6), ('Cecilia Roth', 6), ('Jose Sacristan', 6), ('Rudolf Lucieer', 6)]
[('Vincent Price', 0), ('Errol Flynn', 1), ('Glenn Langan', 1), ('Victor mature', 1), ('Dennis Hopper', 1), ('Brian Wilson', 1), ('Sandra Knight', 1), ('Patrick Mower', 1), ('Oh! Ogunde', 1), ('Richard Johnson', 1)]
2.7329857299670692


Saving to file

In [106]:
def toDict(listek):
    """Build a dictionary from a sequence of indexable items.

    For every item, element 0 becomes the key and element 1 the value.
    When a key occurs more than once, the last occurrence wins.
    """
    return {entry[0]: entry[1] for entry in listek}
    

# Attach every computed measure to the nodes of G as a named attribute.
# Keyword arguments are used because the positional order of `name` and
# `values` differs between networkx 1.x and 2.x; keywords work on both.
nx.set_node_attributes(G, name="degree", values=toDict(deg))
nx.set_node_attributes(G, name="closeness", values=toDict(clo))
nx.set_node_attributes(G, name="betweenness", values=toDict(bet))
nx.set_node_attributes(G, name="eigenvector", values=toDict(eig))
nx.set_node_attributes(G, name="bacon", values=bacon_number)

In [107]:
# Export the graph, including its node attributes, to a GEXF file.
nx.write_gexf(G, "export.gexf")