In [None]:
import os
import urllib.request
import gzip
import re
import pandas as pd
import multiprocessing as mp
import networkx as nx
import matplotlib.pyplot as plt
import collections
import seaborn as sns
from matplotlib.patches import Rectangle
import math
import numpy as np
%matplotlib inline

In [None]:
# Colab-only: mount Google Drive at /content/drive so data_directory (set below) is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Drive folder used both as the source of the raw *.list.gz files and as the
# destination of every CSV this notebook writes.
data_directory = "/content/drive/MyDrive/Colab_Notebooks/network_project/data"

### Data Preprocessing

In [None]:
# get links from gz file
def get_movie_links(filename):
    """Parse a gzipped movie-links list into a DataFrame of citation edges.

    The file is read record by record: a title line of the form
    ``Title (YEAR)`` opens a record, the following lines are scanned for link
    expressions such as ``(references Other Title (YEAR))``, and a blank line
    closes the record. Titles starting with a double quote never match the
    patterns and are therefore skipped.

    A link is kept only when the citing movie's year is greater than or equal
    to the cited movie's year.

    Parameters
    ----------
    filename : str
        Path to the gzip-compressed list file; lines are decoded as ISO-8859-1.

    Returns
    -------
    pandas.DataFrame
        One row per kept link with columns ``movie`` (citing title), ``cites``
        (cited title) and ``types`` (the link kind: features, references,
        follows, spoofs, remake of, spin off from).
    """
    regex_movie = re.compile(r'^([^\"][^\(]+ \(\d+\))$') # (remove", remove(, take(1234))
    regex_links = re.compile(r"\((features|references|follows|spoofs|remake of|spin off from) ([^\"][^\(]+ \(\d+\))\)")
    movies, cited, kinds = [], [], []
    movie = None  # title of the record being read; None while waiting for a title line
    with gzip.open(filename) as f:
        for raw in f:
            line = raw.decode("ISO-8859-1")
            if movie is None:
                found = regex_movie.search(line)
                if found:
                    movie = found.group(1)
            elif line == "\n":
                # blank line: the current record is finished
                movie = None
            else:
                link = regex_links.search(line)
                if link:
                    kind, target = link.groups()
                    # drop links that point forward in time
                    if get_year_single_movie(movie) >= get_year_single_movie(target):
                        movies.append(movie)
                        cited.append(target)
                        kinds.append(kind)
    return pd.DataFrame(
        {"movie": movies, "cites": cited, "types": kinds},
        columns=["movie", "cites", "types"],
    )

# get countries from gz file
def get_movie_countries(filename):
    """Read ``Title (YEAR)<TAB>country`` lines from a gzipped list file.

    Lines are decoded as ISO-8859-1. Lines that do not match the pattern
    (including titles starting with a double quote) are ignored.

    Returns a DataFrame with columns ``movie`` and ``country``, one row per
    matching line.
    """
    pattern = re.compile(r'^([^\"][^\(]+ \(\d+\))\t+(.+)$')
    titles = []
    countries = []
    with gzip.open(filename) as handle:
        for raw_line in handle:
            hit = pattern.search(raw_line.decode("ISO-8859-1"))
            if hit is None:
                continue
            titles.append(hit.group(1))
            countries.append(hit.group(2))
    return pd.DataFrame({"movie": titles, "country": countries}, columns = ["movie", "country"])

# get genres from gz file
def get_movie_genres(filename):
    """Read ``Title (YEAR)<TAB>genre`` lines from a gzipped list file.

    Lines are decoded as ISO-8859-1. Lines that do not match the pattern
    (including titles starting with a double quote) are ignored.

    Returns a DataFrame with columns ``movie`` and ``genre``, one row per
    matching line.
    """
    pattern = re.compile(r'^([^\"][^\(]+ \(\d+\))\t+(.+)$')
    titles = []
    genres = []
    with gzip.open(filename) as handle:
        for raw_line in handle:
            hit = pattern.search(raw_line.decode("ISO-8859-1"))
            if hit is None:
                continue
            titles.append(hit.group(1))
            genres.append(hit.group(2))
    return pd.DataFrame({"movie": titles, "genre": genres}, columns = ["movie", "genre"])

# get year from title
def get_year_single_movie(title):
    """Return, as an int, the four characters that precede the last character of ``title``.

    For a title shaped like ``Name (YYYY)`` this is the year inside the
    trailing parentheses.
    """
    year_digits = title[-5:-1]
    return int(year_digits)

# groupby function
def get_grouped(df, y_column, sort_by, ascending=False, limit=None):
    """Count rows per ``y_column`` value and return the counts sorted by ``sort_by``.

    Parameters
    ----------
    df : pandas.DataFrame
    y_column : str
        Column to group on.
    sort_by : str
        Column of the counted frame to sort by.
    ascending : bool, default False
        Sort direction of the returned frame.
    limit : int or None, default None
        When given, only the ``limit`` groups with the largest ``sort_by``
        count are kept (selected before ``ascending`` is applied).

    Returns
    -------
    pandas.DataFrame
        ``y_column`` plus the per-group non-null counts of the other columns.
    """
    counts = df.groupby(y_column).count().reset_index()
    if limit is not None:
        # pick the largest groups first, then apply the requested order below
        counts = counts.sort_values(by=sort_by, ascending=False)[:limit]
    return counts.sort_values(by=sort_by, ascending=ascending)


# get dataframe without all the items with the tag in tag_value
def get_df_dropped(df, id_column, tag_column, tag_value):
    """Remove every item that carries ``tag_value`` at least once.

    All rows of an id are dropped as soon as one of its rows has
    ``tag_column == tag_value`` (not just the tagged row itself).

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with one row per (id, tag) pair.
    id_column, tag_column : str
        Names of the id and tag columns.
    tag_value : object
        Tag whose owners are removed.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, restricted to ``[id_column, tag_column]``, in their
        original order.
    """
    flagged_ids = df.loc[df[tag_column] == tag_value, id_column]
    # boolean anti-filter: no temporary column on a slice, no row-multiplying merge
    keep = ~df[id_column].isin(flagged_ids)
    return df.loc[keep, [id_column, tag_column]]

In [None]:
def save_all_dataframes(input_dir="", output_dir=""):
    """Build the cleaned movie tables from the raw list files and write them as CSV.

    Reads ``movie-links.list.gz``, ``countries.list.gz`` and ``genres.list.gz``
    from ``input_dir``, removes every movie tagged ``Short`` or ``Adult``, keeps
    only titles that appear in the links, countries and genres tables, assigns
    each remaining title an integer id (position in alphabetical title order),
    and writes ``titles.csv``, ``years.csv``, ``genres.csv``, ``country.csv``
    and ``links.csv`` to ``output_dir``.

    ``links.csv`` has the columns ``id`` (citing movie) and ``cites`` (cited
    movie); a ``types`` column is written as well when the parsed links table
    provides one.

    Parameters
    ----------
    input_dir : str
        Directory containing the three ``.list.gz`` files.
    output_dir : str
        Directory that receives the CSV files.
    """
    # read gzip files
    movie_links = get_movie_links(os.path.join(input_dir, "movie-links.list.gz"))
    movie_countries = get_movie_countries(os.path.join(input_dir, "countries.list.gz"))
    movie_genres = get_movie_genres(os.path.join(input_dir, "genres.list.gz"))

    # remove from genres the Short and Adult movies
    for unwanted in ("Short", "Adult"):
        movie_genres = get_df_dropped(movie_genres, "movie", "genre", unwanted)

    # select only items that intersect all dataframes
    # (pd.concat replaces DataFrame.append, which no longer exists in pandas 2)
    movie_title = pd.concat(
        [
            movie_links[["cites"]].rename(columns={"cites": "movie"}),
            movie_links[["movie"]],
        ],
        ignore_index=True,
    ).drop_duplicates()
    movie_title = movie_title.merge(movie_countries, how="inner")[["movie"]].drop_duplicates()
    movie_title = movie_title.merge(movie_genres, how="inner")[["movie"]].drop_duplicates()

    # create id for each movie: position in alphabetical title order
    movie_title = movie_title.sort_values(by="movie", ascending=True).reset_index(drop=True)
    movie_title["id"] = movie_title.index
    movie_title = movie_title.rename(columns={"movie": "title"})[["id", "title"]]

    # create links df (titles on both ends replaced by ids)
    link_columns = ["id_from", "id_to"]
    if "types" in movie_links.columns:
        link_columns.append("types")
    movie_links = movie_links.merge(movie_title, left_on="movie", right_on="title", how="inner")
    movie_links = movie_links.merge(
        movie_title, left_on="cites", right_on="title", how="inner", suffixes=("_from", "_to")
    )[link_columns]
    movie_links = movie_links.rename(columns={"id_from": "id", "id_to": "cites"})

    # create countries df
    movie_countries = movie_countries.merge(movie_title, left_on="movie", right_on="title", how="inner")[["id", "country"]]

    # create genres df
    movie_genres = movie_genres.merge(movie_title, left_on="movie", right_on="title", how="inner")[["id", "genre"]]

    # create years df
    movie_year = movie_title[["id"]].assign(year=movie_title["title"].apply(get_year_single_movie))

    # export dataframes on drive
    movie_title.to_csv(os.path.join(output_dir, "titles.csv"), index=False)
    movie_year.to_csv(os.path.join(output_dir, "years.csv"), index=False)
    movie_genres.to_csv(os.path.join(output_dir, "genres.csv"), index=False)
    movie_countries.to_csv(os.path.join(output_dir, "country.csv"), index=False)
    movie_links.to_csv(os.path.join(output_dir, "links.csv"), index=False)
    print("All done!")

In [None]:
# create dataframes from raw
# (parses the three .list.gz files in data_directory and writes the five CSVs back to the same folder)
save_all_dataframes(input_dir=data_directory, output_dir=data_directory)

  movie_title = movie_title.append(movie_links[["movie"]].drop_duplicates()).drop_duplicates()


All done!


In [None]:
# read a list of csv
def load_data(filename_list, encoding="utf-8"):
    """Read several CSV files and return them as a list of DataFrames.

    Parameters
    ----------
    filename_list : iterable of str
        Paths of the CSV files, read in the given order.
    encoding : str, default "utf-8"
        Text encoding passed to ``pandas.read_csv``. The CSVs produced by this
        notebook are written with ``DataFrame.to_csv`` without an explicit
        encoding, i.e. as UTF-8; reading them as ISO-8859-1 garbles every
        non-ASCII title.

    Returns
    -------
    list of pandas.DataFrame
        One frame per path, in the same order as ``filename_list``.
    """
    return [pd.read_csv(filename, encoding=encoding) for filename in filename_list]

In [None]:
# Load the five tables written by save_all_dataframes, in the order they are unpacked.
movie_title, movie_year, movie_genres, movie_countries, movie_links = load_data(
    [
        data_directory+"/titles.csv",
        data_directory+"/years.csv",
        data_directory+"/genres.csv",
        data_directory+"/country.csv",
        data_directory+"/links.csv",
    ]
)

In [None]:
# Preview the citation edge list: `id` is the citing movie, `cites` the movie it points to.
movie_links

Unnamed: 0,id,cites
0,0,37546
1,7,37546
2,45,37546
3,49,37546
4,99,37546
...,...,...
134395,50466,50465
134396,50468,50467
134397,50470,50469
134398,50474,24166


In [None]:
# Preview the release year of each movie id (parsed from the title during preprocessing).
movie_year

Unnamed: 0,id,year
0,0,2016
1,1,2013
2,2,1935
3,3,2008
4,4,2012
...,...,...
50471,50471,1976
50472,50472,1984
50473,50473,2015
50474,50474,2012


In [None]:
# Preview the genre table: one row per (movie id, genre) pair.
movie_genres

Unnamed: 0,id,genre
0,0,Animation
1,0,Comedy
2,0,Drama
3,0,Fantasy
4,1,Action
...,...,...
110083,50473,Drama
110084,50474,Action
110085,50474,Adventure
110086,50474,Comedy


In [None]:
# Preview the id <-> title table; ids follow the alphabetical order of the titles.
movie_title

Unnamed: 0,id,title
0,0,#TubeClash02 (2016)
1,1,#chicagoGirl: The Social Network Takes on a Di...
2,2,$10 Raise (1935)
3,3,$5 a Day (2008)
4,4,$ellebrity (2012)
...,...,...
50471,50471,ÃÃ§ kagitÃ§ilar (1976)
50472,50472,ÃÃ§ sÃ¼permen olimpiyatlarda (1984)
50473,50473,Ãrestir (2015)
50474,50474,Ã¨ solo questione di punti di vista (2012)


### Centrality Analysis - Part I

In [None]:
# Lookup table movie id -> title (built directly from the two columns so the
# builtin `id` is not shadowed by a loop variable).
title_dict = dict(zip(movie_title["id"], movie_title["title"]))

In [None]:
# Build the directed citation graph: one node per movie id, one edge per row of movie_links.
nodes = movie_title["id"]
edges = list(zip(movie_links["id"], movie_links["cites"]))

G = nx.DiGraph()  # edge (a, b): movie a cites movie b
G.add_nodes_from(nodes)
G.add_edges_from(edges)

print(G)

DiGraph with 50476 nodes and 131566 edges


In [None]:
# Drop isolated movies (total degree below 1). Degrees are read before anything is removed.
# The comprehension keeps the notebook-level `nodes` Series from being overwritten.
nodes_todel = [node for node, deg in G.degree() if deg < 1]

G.remove_nodes_from(nodes_todel)
print(G)

DiGraph with 48005 nodes and 131566 edges


In [None]:
# Drop movies whose total degree is at most 1. This is a single pass: degrees are read
# before anything is removed, so nodes that fall to the threshold afterwards are kept here.
# The comprehension keeps the notebook-level `nodes` Series from being overwritten.
nodes_todel = [node for node, deg in G.degree() if deg <= 1]

G.remove_nodes_from(nodes_todel)
print(G)

DiGraph with 26717 nodes and 113650 edges


In [None]:
# Drop movies whose total degree is at most 2. This is a single pass: degrees are read
# before anything is removed, so nodes that fall to the threshold afterwards are kept here.
# The comprehension keeps the notebook-level `nodes` Series from being overwritten.
nodes_todel = [node for node, deg in G.degree() if deg <= 2]

G.remove_nodes_from(nodes_todel)
print(G)

DiGraph with 18795 nodes and 101450 edges


In [None]:
# Drop movies whose total degree is at most 3. This is a single pass: degrees are read
# before anything is removed, so nodes that fall to the threshold afterwards are kept here.
# The comprehension keeps the notebook-level `nodes` Series from being overwritten.
nodes_todel = [node for node, deg in G.degree() if deg <= 3]

G.remove_nodes_from(nodes_todel)
print(G)

DiGraph with 14463 nodes and 91650 edges


In [None]:
# Degree centrality on the pruned graph, highest score first.
degree = nx.degree_centrality(G)
sorted_degree = sorted(degree.items(), key=lambda item: item[1], reverse=True)

# Print the ten best-scoring titles with their position.
for position, (movie_id, _score) in enumerate(sorted_degree[:10], start=1):
  print(position, title_dict[movie_id])

1 Star Wars (1977)
2 Rewind This! (2013)
3 The Wizard of Oz (1939)
4 Adjust Your Tracking (2013)
5 Film Geek (2005)
6 Psycho (1960)
7 Video Nasties: Moral Panic, Censorship & Videotape (2010)
8 Be Kind Rewind (2008)
9 The Godfather (1972)
10 Pulp Fiction (1994)


In [None]:
# Eigenvector centrality on the pruned graph, highest score first.
eigen_vector = nx.eigenvector_centrality(G)
sorted_eigen = sorted(eigen_vector.items(), key=lambda item: item[1], reverse=True)

# Print the ten best-scoring titles with their position.
for position, (movie_id, _score) in enumerate(sorted_eigen[:10], start=1):
  print(position, title_dict[movie_id])

1 The House Without a Key (1926)
2 The Chinese Parrot (1927)
3 Behind That Curtain (1929)
4 Charlie Chan Carries On (1931)
5 The Black Camel (1931)
6 Charlie Chan's Chance (1932)
7 Charlie Chan's Greatest Case (1933)
8 Charlie Chan's Courage (1934)
9 Charlie Chan in London (1934)
10 Charlie Chan in Paris (1935)


In [None]:
# Katz centrality on the pruned graph, highest score first.
katz = nx.katz_centrality(G)
sorted_katz = sorted(katz.items(), key=lambda item: item[1], reverse=True)

# Print the ten best-scoring titles with their position.
for position, (movie_id, _score) in enumerate(sorted_katz[:10], start=1):
  print(position, title_dict[movie_id])

1 The Wizard of Oz (1939)
2 Star Wars (1977)
3 Psycho (1960)
4 2001: A Space Odyssey (1968)
5 Metropolis (1927)
6 The Three Mesquiteers (1936)
7 Citizen Kane (1941)
8 Ghost-Town Gold (1936)
9 King Kong (1933)
10 The Godfather (1972)


### Centrality Analysis - Part II

In [None]:
def normalize_pairs(list_of_pairs):
    """Scale the values of ``(key, value)`` pairs so that the largest value becomes 1.0.

    Parameters
    ----------
    list_of_pairs : iterable of (key, number)
        Any iterable, including one-shot iterators and ``dict.items()`` views.

    Returns
    -------
    list of (key, float)
        Pairs in input order with ``value / max(value)``. An empty input gives
        an empty list; when the maximum is 0 the values are returned unscaled
        (as floats) instead of dividing by zero.
    """
    pairs = list(list_of_pairs)  # materialise once: the input is traversed twice
    if not pairs:
        return []
    max_value = float(max(value for _, value in pairs))
    if max_value == 0:
        return [(key, float(value)) for key, value in pairs]
    return [(key, value / max_value) for key, value in pairs]

def calculate_and_save_centralities(input_file, output_file, by):
    """Compute five centrality scores for the edge list in ``input_file`` and save them.

    The first two columns of the CSV are used as source and target of a directed
    graph. In-degree, PageRank, Katz and eigenvector centrality (on the graph and
    on its reverse) are each rescaled with ``normalize_pairs``, joined on the
    node id column, sorted by ``by`` in descending order and written to
    ``output_file``.
    """
    edge_table = pd.read_csv(input_file)
    id_column = edge_table.columns[0]
    graph = nx.from_pandas_edgelist(edge_table, id_column, edge_table.columns[1], create_using=nx.DiGraph())

    # (progress message, output column, deferred computation) in the order they are run
    steps = [
        ("Calculating in-degree centrality...", "in_degree", lambda: nx.in_degree_centrality(graph)),
        ("Calculating pagerank centrality...", "pagerank", lambda: nx.pagerank(graph)),
        ("Calculating katz centrality...", "katz", lambda: nx.katz_centrality(graph)),
        ("Calculating eigenvector centrality...", "eigenvector_left", lambda: nx.eigenvector_centrality(graph)),
        ("Calculating eigenvector centrality...", "eigenvector_right", lambda: nx.eigenvector_centrality(graph.reverse(), max_iter=200)),
    ]

    scores = None
    for message, column, compute in steps:
        print(message)
        layer = pd.DataFrame(normalize_pairs(compute().items()), columns=[id_column, column])
        scores = layer if scores is None else scores.merge(layer)

    scores = scores.sort_values(by=by, ascending=False).reset_index(drop=True)
    # save on file
    print("Saving centralities...")
    scores.to_csv(output_file, index=False)
    print("...all done!")

In [None]:
# calculate all centrality scores, and save on file
# (rows are sorted by "pagerank", so the row order of centralities.csv is the PageRank ranking)
calculate_and_save_centralities(data_directory+"/links.csv", data_directory+"/centralities.csv", "pagerank")

Calculating in-degree centrality...
Calculating pagerank centrality...
Calculating katz centrality...
Calculating eigenvector centrality...
Calculating eigenvector centrality...
Saving centralities...
...all done!


In [None]:
# read list of csv about movies
def load_data_movies(input_dir):
    """Load titles, centralities, years, genres and countries (in that order) from ``input_dir``."""
    csv_names = ["titles.csv", "centralities.csv", "years.csv", "genres.csv", "country.csv"]
    return load_data([input_dir + "/" + name for name in csv_names])

def get_grouped(df, y_column, sort_by, ascending=False, limit=None):
    """Count rows per ``y_column`` value and return the counts sorted by ``sort_by``.

    This definition replaces the one from the preprocessing section when the
    cell runs, so it keeps that signature (``ascending`` default and optional
    ``limit``) instead of silently narrowing it.

    Parameters
    ----------
    df : pandas.DataFrame
    y_column : str
        Column to group on.
    sort_by : str
        Column of the counted frame to sort by.
    ascending : bool, default False
        Sort direction of the returned frame.
    limit : int or None, default None
        When given, only the ``limit`` groups with the largest ``sort_by``
        count are kept (selected before ``ascending`` is applied).
    """
    counts = df.groupby(y_column).count().reset_index()
    if limit is not None:
        counts = counts.sort_values(by=sort_by, ascending=False)[:limit]
    return counts.sort_values(by=sort_by, ascending=ascending)

def get_rank_sorted(titles_csv, centralities_csv, year_csv, genres_csv, countries_csv, centrality_col, by, countries_list=None, genres_list=None, years_list=None):
    """Return movies with one centrality score, optionally filtered, sorted by ``by``.

    Parameters
    ----------
    titles_csv, centralities_csv, year_csv, genres_csv, countries_csv : pandas.DataFrame
        Tables sharing an ``id`` column. ``centralities_csv`` must also hold a
        ``rank`` column.
    centrality_col : int
        Position of the wanted score column in ``centralities_csv`` merged with
        ``titles_csv``.
    by : str
        Column to sort by (descending).
    countries_list, genres_list, years_list : list or None
        When non-empty, keep only movies matching one of the listed values; the
        matching ``country`` / ``genre`` / ``year`` column is added to the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, the selected score, ``id``, ``rank`` and one column
        per active filter, duplicates removed.
    """
    res = centralities_csv.merge(titles_csv)
    score_columns = list(res.columns[centrality_col:centrality_col + 1])
    res = res[["title"] + score_columns + ["id", "rank"]]

    # (filter values, lookup table, column filtered on) for each optional filter
    filters = (
        (countries_list, countries_csv, "country"),
        (genres_list, genres_csv, "genre"),
        (years_list, year_csv, "year"),
    )
    for wanted, table, column in filters:
        if wanted is not None and len(wanted) > 0:
            res = res.merge(table[table[column].isin(wanted)])

    return res.drop_duplicates().sort_values(by=by, ascending=False)

def print_movie_rank(output_dir, centrality, centrality_col, by):
    """Write the full movie ranking for one centrality to a tab-separated file.

    Uses the notebook-level tables (movie_title, movie_centralities, movie_year,
    movie_genres, movie_countries). Each row holds rank, title, the score named
    ``centrality``, the year, and the lists of countries and genres of the movie.
    The file is ``<output_dir>/films_ranking_<centrality>.csv``.
    """
    ranking = get_rank_sorted(movie_title, movie_centralities, movie_year, movie_genres, movie_countries, centrality_col, by)
    ranking = ranking[["id","rank","title", centrality]].merge(movie_year)
    # collapse the long country / genre tables into one list per movie id
    for table, column in ((movie_countries, "country"), (movie_genres, "genre")):
        per_movie = table.groupby("id")[column].apply(list).reset_index()
        ranking = ranking.merge(per_movie)
    ranking = ranking.rename(index=str, columns={"country": "countries", "genre": "genres"})
    destination = output_dir+"/films_ranking_" + centrality +".csv"
    ranking.drop("id", axis=1).to_csv(destination, index=False, sep='\t')

In [None]:
# get data on movies saved in the analysis section
[movie_title, movie_centralities, movie_year, movie_genres, movie_countries] = load_data_movies(data_directory)
# rank = 1 + row position; centralities.csv was written sorted by "pagerank" (descending),
# so this rank is the PageRank rank regardless of which centrality is displayed later.
movie_centralities["rank"] = 1+movie_centralities.index
print(len(movie_centralities), "movies on dataset")

48005 movies on dataset


#### PageRank Centrality

In [None]:
# centrality_col is the position of the score column in the merged centralities table (2 -> pagerank).
print_movie_rank(data_directory, "pagerank", centrality_col=2, by="pagerank")
top_rank = 20
print("Most influential "+str(top_rank)+" movies of all times")
(
    get_rank_sorted(
        movie_title, movie_centralities, movie_year, movie_genres, movie_countries,
        centrality_col=2, by="pagerank",
    )[:top_rank]
    .merge(movie_countries.groupby("id")["country"].apply(list).reset_index())
)

Most influential 20 movies of all times


Unnamed: 0,title,pagerank,id,rank,country
0,The Wizard of Oz (1939),1.0,45045,1,[USA]
1,Star Wars (1977),0.644888,37546,2,[USA]
2,The Birth of a Nation (1915),0.570123,39843,3,[USA]
3,Psycho (1960),0.564833,32446,4,[USA]
4,Cabiria (1914),0.555953,6223,5,[Italy]
5,King Kong (1933),0.552475,21132,6,[USA]
6,Metropolis (1927),0.454093,26688,7,[Germany]
7,Snow White and the Seven Dwarfs (1937),0.424154,36808,8,[USA]
8,2001: A Space Odyssey (1968),0.423516,274,9,"[UK, USA]"
9,Frankenstein (1931),0.403745,14047,10,[USA]


#### In-degree Centrality

In [None]:
# centrality_col is the position of the score column in the merged centralities table (1 -> in_degree).
print_movie_rank(data_directory, "in_degree", centrality_col=1, by="in_degree")
print("Most influential "+str(top_rank)+" movies of all time by in_degree centrality")
(
    get_rank_sorted(
        movie_title, movie_centralities, movie_year, movie_genres, movie_countries,
        centrality_col=1, by="in_degree",
    )[:top_rank]
    .merge(movie_countries.groupby("id")["country"].apply(list).reset_index())
)

Most influential 20 movies of all time by in_degree centrality


Unnamed: 0,title,in_degree,id,country
0,Star Wars (1977),1.0,37546,[USA]
1,The Wizard of Oz (1939),0.837423,45045,[USA]
2,Psycho (1960),0.574642,32446,[USA]
3,The Godfather (1972),0.472393,41346,[USA]
4,Jaws (1975),0.447853,19552,[USA]
5,Casablanca (1942),0.411043,6659,[USA]
6,2001: A Space Odyssey (1968),0.383436,274,"[UK, USA]"
7,Gone with the Wind (1939),0.377301,15452,[USA]
8,The Shining (1980),0.377301,44078,"[UK, USA]"
9,Taxi Driver (1976),0.354806,38934,[USA]


#### Katz Centrality

In [None]:
# centrality_col is the position of the score column in the merged centralities table (3 -> katz).
print_movie_rank(data_directory, "katz", centrality_col=3, by="katz")
print("Most influential "+str(top_rank)+" movies of all time by katz centrality")
(
    get_rank_sorted(
        movie_title, movie_centralities, movie_year, movie_genres, movie_countries,
        centrality_col=3, by="katz",
    )[:top_rank]
    .merge(movie_countries.groupby("id")["country"].apply(list).reset_index())
)

Most influential 20 movies of all time by katz centrality


Unnamed: 0,title,katz,id,country
0,The Wizard of Oz (1939),1.0,45045,[USA]
1,Star Wars (1977),0.920272,37546,[USA]
2,Psycho (1960),0.818661,32446,[USA]
3,2001: A Space Odyssey (1968),0.69964,274,"[UK, USA]"
4,Metropolis (1927),0.570644,26688,[Germany]
5,The Three Mesquiteers (1936),0.484814,44561,[USA]
6,Citizen Kane (1941),0.474699,7548,[USA]
7,Ghost-Town Gold (1936),0.440366,15005,[USA]
8,King Kong (1933),0.430023,21132,[USA]
9,The Godfather (1972),0.428304,41346,[USA]


#### Eigenvector Centrality, left eigenvector (movies that are referenced by other high-scoring movies; driven by incoming links)



In [None]:
# centrality_col is the position of the score column in the merged centralities table (4 -> eigenvector_left).
print_movie_rank(data_directory, "eigenvector_left", centrality_col=4, by="eigenvector_left")
print("Most influential "+str(top_rank)+" movies of all time by eigenvector_left centrality")
(
    get_rank_sorted(
        movie_title, movie_centralities, movie_year, movie_genres, movie_countries,
        centrality_col=4, by="eigenvector_left",
    )[:top_rank]
    .merge(movie_countries.groupby("id")["country"].apply(list).reset_index())
)

Most influential 20 movies of all time by eigenvector_left centrality


Unnamed: 0,title,eigenvector_left,id,country
0,The House Without a Key (1926),1.0,41752,[USA]
1,The Chinese Parrot (1927),0.710326,40262,[USA]
2,Behind That Curtain (1929),0.502792,4231,[USA]
3,Charlie Chan Carries On (1931),0.354615,6961,[USA]
4,The Black Camel (1931),0.249189,39859,[USA]
5,Charlie Chan's Chance (1932),0.174449,6981,[USA]
6,Charlie Chan's Greatest Case (1933),0.121657,6983,[USA]
7,Charlie Chan's Courage (1934),0.084507,6982,[USA]
8,Charlie Chan in London (1934),0.058466,6972,[USA]
9,Charlie Chan in Paris (1935),0.040284,6974,[USA]


#### Eigenvector Centrality, right eigenvector (movies that reference other high-scoring movies; driven by outgoing links)

In [None]:
# centrality_col is the position of the score column in the merged centralities table (5 -> eigenvector_right).
print_movie_rank(data_directory, "eigenvector_right", centrality_col=5, by="eigenvector_right")
print("Most influential "+str(top_rank)+" movies of all time by eigenvector_right centrality")
(
    get_rank_sorted(
        movie_title, movie_centralities, movie_year, movie_genres, movie_countries,
        centrality_col=5, by="eigenvector_right",
    )[:top_rank]
    .merge(movie_countries.groupby("id")["country"].apply(list).reset_index())
)

Most influential 20 movies of all time by eigenvector_right centrality


Unnamed: 0,title,eigenvector_right,id,country
0,Guardians of the Galaxy Vol. 2 (2017),1.0,15773,"[USA, New Zealand, Canada]"
1,Thor: Ragnarok (2017),0.551305,45386,[USA]
2,Untitled Spider-Man: Homecoming Sequel (2019),0.512517,47226,[USA]
3,Spider-Man: Homecoming (2017),0.511082,37300,[USA]
4,Doctor Strange (2016),0.367207,10701,[USA]
5,"The Adventures of the Fatbat Episode III, Quee...",0.341729,39435,[USA]
6,Black Panther (2018),0.341159,4884,[USA]
7,Captain America: Civil War (2016),0.340349,6453,"[USA, Germany]"
8,Untitled Avengers Movie (2019),0.314082,47218,[USA]
9,Ralph Breaks the Internet: Wreck-It Ralph 2 (2...,0.158645,32998,[USA]


### Most Influential Movies by Genre

In [None]:
#calculate and print top ranking of most influential movies by genre
def top_movies_by_genre(top_size):
    """Print, for each of the 20 genres with the most entries, the ``top_size`` best movies by PageRank.

    Reads the notebook-level movie tables; prints title and rank for each genre.
    """
    genre_counts = get_grouped(movie_genres, "genre", "id", False)
    for genre in genre_counts[:20]["genre"]:
        print("\n## Genre: ", genre, "##")
        ranked = get_rank_sorted(
            movie_title, movie_centralities, movie_year, movie_genres, movie_countries,
            centrality_col = 2, by = "pagerank", genres_list=[genre],
        )
        print(ranked[:top_size][["title","rank"]])

In [None]:
#print top ranking of most influential movies by genre
top_size = 10

print("Most influential", top_size, "movies by genre")
print(
    "\nThe dataset contains {} 'genre' entries, divided into {} unique genres".format(
        len(movie_genres),
        len(movie_genres[["genre"]].drop_duplicates()),
    )
)

top_movies_by_genre(top_size)

Most influential 10 movies by genre

The dataset contains 110088 'genre' entries, divided into 25 unique genres

## Genre:  Drama ##
                          title  rank
0  The Birth of a Nation (1915)     3
1                Cabiria (1914)     5
2             Metropolis (1927)     7
3           Frankenstein (1931)    10
4           Citizen Kane (1941)    12
5             Casablanca (1942)    13
6                   Jaws (1975)    15
7          The Godfather (1972)    16
8     Gone with the Wind (1939)    17
9          The Searchers (1956)    18

## Genre:  Comedy ##
                                               title  rank
0                               La dolce vita (1960)    20
1                            The Wizard of Oz (1925)    33
2  Dr. Strangelove or: How I Learned to Stop Worr...    35
3                             Sh! The Octopus (1937)    36
4                            La rÃ¨gle du jeu (1939)    37
5                   The Poor Little Rich Girl (1917)    38
6             

### Most Influential Movies by Country

In [None]:
#calculate and print top ranking of most influential movies by country
def top_movies_by_country(top_size):
    """Print, for each of the 20 countries with the most entries, the ``top_size`` best movies by PageRank.

    Reads the notebook-level movie tables; prints title and rank for each country.
    """
    country_counts = get_grouped(movie_countries, "country", "id", False)
    for country in country_counts[:20]["country"]:
        print("\n## Country: ", country, "##")
        ranked = get_rank_sorted(
            movie_title, movie_centralities, movie_year, movie_genres, movie_countries,
            centrality_col = 2, by = "pagerank", countries_list=[country],
        )
        print(ranked[:top_size][["title","rank"]])

In [None]:
#print top ranking of most influential movies by country
top_size = 10

print("Most influential", top_size, "movies by country")
print(
    "\nThe dataset contains {} 'country' entries, divided into {} unique countries".format(
        len(movie_countries),
        len(movie_countries[["country"]].drop_duplicates()),
    )
)

top_movies_by_country(top_size)

Most influential 10 movies by country

The dataset contains 60858 'country' entries, divided into 159 unique countries

## Country:  USA ##
                                    title  rank
0                 The Wizard of Oz (1939)     1
1                        Star Wars (1977)     2
2            The Birth of a Nation (1915)     3
3                           Psycho (1960)     4
4                        King Kong (1933)     6
5  Snow White and the Seven Dwarfs (1937)     8
6            2001: A Space Odyssey (1968)     9
7                     Frankenstein (1931)    10
8                          Dracula (1931)    11
9                     Citizen Kane (1941)    12

## Country:  UK ##
                                               title  rank
0                       2001: A Space Odyssey (1968)     9
1                              The Terminator (1984)    23
2                          A Clockwork Orange (1971)    27
3                                 The Shining (1980)    31
4                

### Basic Network Statistics

In [None]:
# print basic analysis on network of links between movies
def analyse_network(input_file):
    """Print basic statistics of the directed graph stored as an edge list and return the graph.

    The first two CSV columns are taken as source and target. Printed: node and
    edge counts, average total degree, number of weakly connected components,
    size and share of the largest one, and the average clustering coefficient
    of the directed graph and of its undirected version.
    """
    edge_table = pd.read_csv(input_file)
    source_col, target_col = edge_table.columns[0], edge_table.columns[1]
    G = nx.from_pandas_edgelist(edge_table, source_col, target_col, create_using=nx.DiGraph())
    node_count = G.number_of_nodes()

    # number of nodes and edges
    print("#### NETWORK STATISTICS ####")
    print(node_count, "nodes")
    print(G.number_of_edges(), "edges")

    # average degree (in-degree plus out-degree, averaged over nodes)
    total_degree = sum(deg for _, deg in G.degree())
    print(round(total_degree/node_count,2), "average degree")
    print()

    # weakly connected components, largest first
    component_sizes = sorted((len(c) for c in nx.weakly_connected_components(G)), reverse=True)
    largest = component_sizes[0]
    print(len(component_sizes), "weakly connected components")
    print(largest, "nodes (","{:.2f}".format((largest/float(node_count)*100)),"%) on largest weakly connected component")
    print()

    # average clustering coefficient
    print(round(nx.average_clustering(G),2), "average clustering coefficient (directed version)")
    undirected = G.to_undirected()
    print(round(nx.average_clustering(undirected),2), "average clustering coefficient (undirected version)")
    return G

In [None]:
# Rebuilds the graph from links.csv and rebinds G, replacing the pruned graph from Centrality Analysis - Part I.
G = analyse_network(data_directory+"/links.csv")

#### NETWORK STATISTICS ####
48005 nodes
131566 edges
5.48 average degree

4455 weakly connected components
36988 nodes ( 77.05 %) on largest weakly connected component

0.07 average clustering coefficient (directed version)
0.15 average clustering coefficient (undirected version)


# Degrees of Separation Analysis

In [None]:
# Re-export the PageRank ranking file, then keep the top movies by in-degree as path targets.
print_movie_rank(data_directory, "pagerank", centrality_col=2, by="pagerank")
top_rank = 20
print("Most influential "+str(top_rank)+" movies of all time by in_degree centrality")
top_20 = (
    get_rank_sorted(
        movie_title, movie_centralities, movie_year, movie_genres, movie_countries,
        centrality_col=1, by="in_degree",
    )[:top_rank]
    .merge(movie_countries.groupby("id")["country"].apply(list).reset_index())
)
top_20

Most influential 20 movies of all time by in_degree centrality


Unnamed: 0,title,in_degree,id,rank,country
0,Star Wars (1977),1.0,37546,2,[USA]
1,The Wizard of Oz (1939),0.837423,45045,1,[USA]
2,Psycho (1960),0.574642,32446,4,[USA]
3,The Godfather (1972),0.472393,41346,16,[USA]
4,Jaws (1975),0.447853,19552,15,[USA]
5,Casablanca (1942),0.411043,6659,13,[USA]
6,2001: A Space Odyssey (1968),0.383436,274,9,"[UK, USA]"
7,Gone with the Wind (1939),0.377301,15452,17,[USA]
8,The Shining (1980),0.377301,44078,31,"[UK, USA]"
9,Taxi Driver (1976),0.354806,38934,22,[USA]


In [None]:
# Lookup tables in both directions: id -> title and title -> id.
# Built directly from the columns so the builtin `id` is not shadowed by a loop variable.
title_dict = dict(zip(movie_title["id"], movie_title["title"]))
id_dict = dict(zip(movie_title["title"], movie_title["id"]))

In [None]:
# Ids of the top movies, looked up from their titles.
top_id = [id_dict[top_title] for top_title in top_20["title"]]

In [None]:
def degrees_separation(G, top_id):
  """Measure how far the nodes of ``G`` are from their nearest node in ``top_id``.

  For every node the shortest directed path (in hops) to each target is
  considered and the smallest one is kept. One search is run per target
  rather than one per (node, target) pair.

  Parameters
  ----------
  G : networkx.DiGraph
  top_id : iterable
      Target nodes; each must be present in ``G``.

  Returns
  -------
  tuple
      ``(longest of the per-node shortest distances, number of nodes that
      cannot reach any target)``.

  Raises
  ------
  ValueError
      If no node can reach any target (e.g. ``top_id`` is empty).
  """
  nearest = {}  # node -> hops to its closest target
  for target in top_id:
    # with only `target` given, networkx returns {source: hops} for every node that reaches it
    for source, hops in nx.shortest_path_length(G, target=target).items():
      if hops < nearest.get(source, float("inf")):
        nearest[source] = hops

  no_path = G.number_of_nodes() - len(nearest)
  return (max(nearest.values()), no_path)

In [None]:
# Result: (largest shortest-path length from any movie to its nearest top movie,
#          number of movies with no path to any top movie).
degrees_separation(G, top_id)

(9, 31532)

## Long-Term Citation Count

In [None]:
def long_term_citation(link_df, year_df, title_df, year_diff, k):
  """Return the ``k`` movies most often cited by movies released at least ``year_diff`` years later.

  Parameters
  ----------
  link_df : pandas.DataFrame
      Edge list with columns ``id`` (citing movie) and ``cites`` (cited movie).
  year_df : pandas.DataFrame
      Columns ``id`` and ``year``.
  title_df : pandas.DataFrame
      Columns ``id`` and ``title``.
  year_diff : int
      Minimum gap (citing year minus cited year) for a citation to count.
  k : int
      Number of movies to return.

  Returns
  -------
  pandas.DataFrame
      Columns ``movies`` (title) and ``long-term citation count``, most cited first.
  """
  # attach the year of the citing movie, then re-key the table on the cited movie
  citing = link_df.merge(year_df, on="id").rename(
      columns={"year": "year_referencing", "id": "id_referencing", "cites": "id"}
  )
  pairs = citing.merge(year_df, on="id").rename(columns={"year": "year_referenced"})
  pairs["year_difference"] = pairs["year_referencing"] - pairs["year_referenced"]
  long_term = pairs[pairs["year_difference"] >= year_diff].merge(title_df, on="id")
  top_k_movies = long_term["title"].value_counts().head(k)
  return pd.DataFrame({'movies': top_k_movies.index, 'long-term citation count': top_k_movies.values})

In [None]:
# Top 20 movies by number of citations coming from movies released at least 25 years later.
df = long_term_citation(movie_links, movie_year, movie_title, 25, 20)
df

Unnamed: 0,movies,long-term citation count
0,The Wizard of Oz (1939),809
1,Star Wars (1977),601
2,Psycho (1960),439
3,Casablanca (1942),406
4,Gone with the Wind (1939),350
5,King Kong (1933),321
6,The Godfather (1972),320
7,Citizen Kane (1941),314
8,Frankenstein (1931),304
9,2001: A Space Odyssey (1968),275


# Multiplex Analysis

In [None]:
# One edge list per link kind. Requires movie_links to carry a 'types' column holding
# one of the six strings below; a KeyError is raised otherwise.
features, references, follows, spoofs, remake_of, spin_of_from = (
    movie_links.loc[movie_links['types'] == link_kind, ["id", "cites"]]
    for link_kind in ("features", "references", "follows", "spoofs", "remake of", "spin off from")
)

links = [features, references, follows, spoofs, remake_of, spin_of_from]

In [None]:
# Number of edges in each layer ("follows" links are reported as sequels).
print(f'Number of features: {len(features)}')
print(f'Number of references: {len(references)}')
print(f'Number of sequels: {len(follows)}')
print(f'Number of spoofs: {len(spoofs)}')
print(f'Number of remakes: {len(remake_of)}')
print(f'Number of spin-offs: {len(spin_of_from)}')

In [None]:
# Lookup tables in both directions: id -> title and title -> id.
# Built directly from the columns so the builtin `id` is not shadowed by a loop variable.
title_dict = dict(zip(movie_title["id"], movie_title["title"]))
id_dict = dict(zip(movie_title["title"], movie_title["id"]))

In [None]:
# Build one directed graph per link type; every layer contains all movie ids as nodes.
nodes = movie_title["id"]

graphs = []
for type_link in links:
  edges = list(zip(type_link["id"], type_link["cites"]))

  G = nx.DiGraph()  # note: rebinds the notebook-level G on every iteration
  G.add_nodes_from(nodes)
  G.add_edges_from(edges)

  print(G)

  graphs.append(G)

print(graphs)

In [None]:
def long_term_citation(link_df, year_df, title_df, year_diff, k):
  """Return the standardised long-term citation count of every cited movie in ``link_df``.

  A citation counts when the citing movie was released at least ``year_diff``
  years after the cited one. Counts are converted to z-scores (population
  standard deviation) over all movies with at least one such citation. When
  all counts are equal the z-scores are 0 rather than NaN.

  This redefines the earlier function of the same name: it returns all movies
  with standardised values instead of the top ``k`` raw counts.

  Parameters
  ----------
  link_df : pandas.DataFrame
      Edge list with columns ``id`` (citing movie) and ``cites`` (cited movie).
  year_df : pandas.DataFrame
      Columns ``id`` and ``year``.
  title_df : pandas.DataFrame
      Columns ``id`` and ``title``.
  year_diff : int
      Minimum gap (citing year minus cited year) for a citation to count.
  k : int
      Unused; kept so the signature matches the earlier definition.

  Returns
  -------
  pandas.DataFrame
      Columns ``movies`` (title) and ``long-term citation count`` (z-score),
      most cited first.
  """
  # attach the year of the citing movie, then re-key the table on the cited movie
  citing = link_df.merge(year_df, on="id").rename(
      columns={"year": "year_referencing", "id": "id_referencing", "cites": "id"}
  )
  pairs = citing.merge(year_df, on="id").rename(columns={"year": "year_referenced"})
  pairs["year_difference"] = pairs["year_referencing"] - pairs["year_referenced"]
  long_term = pairs[pairs["year_difference"] >= year_diff].merge(title_df, on="id")

  counts = long_term["title"].value_counts()
  spread = counts.values.std() if len(counts) else 0.0
  if spread:
    values = (counts - counts.values.mean()) / spread
  else:
    # empty input or identical counts: no spread to standardise against
    values = counts * 0.0
  return pd.DataFrame({'movies': counts.index.to_numpy(), 'long-term citation count': values.to_numpy()})

In [None]:
# Long-term citation table of every link-type layer, in the order of `links`.
long_term_layers = [
    long_term_citation(layer_links, movie_year, movie_title, 25, 20)
    for layer_links in links
]

In [None]:
# Stack the per-layer tables and average each movie's score over the layers in which it appears.
concatenated_df = pd.concat(long_term_layers)
# despite its name, sum_df holds the per-movie mean, not the sum
sum_df = concatenated_df.groupby('movies').mean()

# 100 movies with the highest average score
top100 = sum_df.sort_values('long-term citation count', ascending=False)[0:100]
top100

In [None]:
# PageRank of every layer graph, in the order of `graphs`.
ranks_layers = [nx.pagerank(layer_graph) for layer_graph in graphs]

In [None]:
# Uses the long_term_citation redefined in this section: the result covers every cited movie
# with a standardised score, not the top-20 raw counts of the earlier definition.
df = long_term_citation(movie_links, movie_year, movie_title, 25, 20)
df

In [None]:
# For every node, the tuple of its PageRank in each layer (same order as `graphs`).
page_rank_vector = {
    node: tuple(layer_rank[node] for layer_rank in ranks_layers)
    for node in ranks_layers[0].keys()
}

print(page_rank_vector[2])

In [None]:
# For every movie title, the tuple of its long-term citation score in each layer
# (same order as `links`); NaN marks layers in which the movie has no entry.
# Iterating DataFrame.keys() would walk over the column names, not the movies,
# so each layer is first turned into a {title: score} mapping.
layer_scores = [
    dict(zip(layer["movies"], layer["long-term citation count"]))
    for layer in long_term_layers
]
all_movies = sorted(set().union(*layer_scores))

long_term_vector = {
    movie: tuple(scores.get(movie, float("nan")) for scores in layer_scores)
    for movie in all_movies
}

# show the size and a small sample instead of dumping the whole mapping
print(len(long_term_vector), "movies")
print(dict(list(long_term_vector.items())[:5]))