In [None]:
import itertools
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from tqdm import tqdm_notebook as tqdm

#### Create connection with datasets

##### Datasets via shareable link

In [None]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Download the ratings table from its shareable Drive link into the working directory.
link = "https://drive.google.com/file/d/1T_cW32Wtp_D-5XAFo7SScy-gO0phlHhp/view?usp=sharing"
# The second-to-last path segment of the link is passed to Drive as the file id.
# Named `file_id` so the builtin `id()` is not shadowed for the rest of the notebook.
file_id = link.split("/")[-2]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('title.ratings.tsv')

In [None]:
# Download the principals (crew) table from its shareable Drive link into the working directory.
link = "https://drive.google.com/file/d/1tyqWloBbdzxFROWRtrAZAsr82HwJQT7T/view?usp=sharing"
# The second-to-last path segment of the link is passed to Drive as the file id.
# Named `file_id` so the builtin `id()` is not shadowed for the rest of the notebook.
file_id = link.split("/")[-2]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('title.principals.tsv')

In [None]:
# Download the title basics table from its shareable Drive link into the working directory.
link = "https://drive.google.com/file/d/1vCYRPpD6iWAX-LpSFBa0orH_0VCfqL7-/view?usp=sharing"
# The second-to-last path segment of the link is passed to Drive as the file id.
# Named `file_id` so the builtin `id()` is not shadowed for the rest of the notebook.
file_id = link.split("/")[-2]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('title.basics.tsv')

In [None]:
# Download the name basics table from its shareable Drive link into the working directory.
link = "https://drive.google.com/file/d/1yhwoApklCg0y_RM-gqwxy80l-rMQvCWH/view?usp=sharing"
# The second-to-last path segment of the link is passed to Drive as the file id.
# Named `file_id` so the builtin `id()` is not shadowed for the rest of the notebook.
file_id = link.split("/")[-2]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('name.basics.tsv')

#### Read in datasets

In [None]:
# Load the tab-separated ratings file downloaded above.
ratings = pd.read_csv("title.ratings.tsv", delimiter='\t')

In [None]:
# Load the tab-separated principals file downloaded above; it is referred to as `crew` from here on.
crew = pd.read_csv("title.principals.tsv", delimiter='\t')

In [None]:
# Load the tab-separated title basics file downloaded above.
# NOTE(review): the recorded output under this cell looks like the tail of a warning
# (possibly a pandas mixed-dtype warning) — confirm, and consider passing explicit dtypes.
movies = pd.read_csv("title.basics.tsv", delimiter='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Load the tab-separated name basics file downloaded above.
names = pd.read_csv("name.basics.tsv", delimiter='\t')

#### Clean data

In [None]:
# Row counts before cleaning, printed in the order: ratings, crew, movies, names.
for _frame in (ratings, crew, movies, names):
    print(len(_frame))

1149651
44619193
7884849
10916797


In [None]:
# Remove deceased people: keep only rows whose deathYear is the literal '\N' string
# (presumably the dataset's marker for "no value" — confirm against the data source).
names = names.loc[names["deathYear"].eq('\\N')]

In [None]:
# Focus on feature films: keep only titles whose titleType is exactly "movie".
movies = movies.loc[movies["titleType"].eq("movie")]

In [None]:
# Inclusive release-year window used by the year filter that follows.
fromYear, toYear = 2011, 2020

In [None]:
# Keep only titles whose startYear lies in [fromYear, toYear], both ends inclusive.
# startYear is coerced to numeric first so the comparison also works when the column
# was read as text or mixed types (e.g. it contains a non-numeric missing-value marker);
# values that cannot be parsed become NaN and are therefore filtered out.
movies = movies[
    pd.to_numeric(movies.startYear, errors="coerce").between(fromYear, toYear)
]

In [None]:
# Drop credits of people no longer present in `names` (i.e. the deceased removed earlier).
crew = crew.loc[crew["nconst"].isin(names["nconst"])]
# Keep only movies that still have at least one credit AND have a ratings row.
movies = movies.loc[
    movies["tconst"].isin(crew["tconst"]) & movies["tconst"].isin(ratings["tconst"])
]

In [None]:
# Shrink the other tables to the movies that survived the filters above.
ratings = ratings.loc[ratings["tconst"].isin(movies["tconst"])]  # ratings of kept movies only
crew = crew.loc[crew["tconst"].isin(movies["tconst"])]  # credits of kept movies only
# Must run after `crew` is reduced: keeps only people who still have a credit.
names = names.loc[names["nconst"].isin(crew["nconst"])]

In [None]:
# Row counts after cleaning, printed in the order: ratings, crew, movies, names.
for _frame in (ratings, crew, movies, names):
    print(len(_frame))

85621
718655
85621
407015


In [None]:
# Persist each cleaned table as CSV in the working directory (the index is written too,
# as with a plain `to_csv(path)` call).
for _path, _frame in (
    ('movie.basics.cleaned.csv', movies),
    ('movie.ratings.cleaned.csv', ratings),
    ('movie.principals.cleaned.csv', crew),
    ('names.basics.cleaned.csv', names),
):
    _frame.to_csv(_path)

In [None]:
# Acting credits only, reduced to the (movie id, person id) key pair.
actors = crew[crew.category.isin(["actor", "actress"])]
actors2 = actors.loc[:, ["tconst", "nconst"]]

# Movies without any acting credit are dropped, as are people without one.
movies2 = movies[movies.tconst.isin(actors2.tconst)]
names2 = names[names.nconst.isin(actors2.nconst)]

# One row per (person, movie) credit, carrying the person's name columns.
actors_names = names2.merge(actors2, how="left", on="nconst")

print("Not all movies have data about actors:")
print("Number of movies:", len(movies))
print("Number of movies with actor data:", len(actors2["tconst"].unique()))

Not all movies have data about actors:
Number of movies: 85621
Number of movies with actor data: 68921


68921

#### Make graph and graph pickle

In [None]:
# Build the co-acting graph: nodes are actor names, and two actors are connected
# when they are credited in the same movie.
#
# Cast lists come from a single groupby over `actors_names` rather than filtering
# the whole frame once per movie, which made the original loop scale with
# (number of movies) x (number of credit rows).
#
# NOTE(review): nodes are keyed by primaryName, so different people who share a
# name would collapse into one node — consider keying by nconst if that matters.
G = nx.Graph()

# Restrict to credits of movies in `movies2`, matching the set of movies the
# per-movie loop iterated over.
credits = actors_names[actors_names.tconst.isin(movies2.tconst)]

for _, cast in tqdm(credits.groupby("tconst", sort=False)["primaryName"]):
  # Every unordered pair of cast members of one movie becomes an edge;
  # a movie with a single credited actor contributes nothing.
  G.add_edges_from(itertools.combinations(cast, 2))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=68921.0), HTML(value='')))




In [None]:
# Serialize the graph to disk. `nx.write_gpickle` was removed in networkx 3.0;
# pickling the graph directly with the highest protocol is the documented replacement.
with open("graph.gpickle", "wb") as graph_file:
  pickle.dump(G, graph_file, protocol=pickle.HIGHEST_PROTOCOL)