In [1]:
!pip install scikit-network
import pandas as pd
import numpy as np
import sknetwork.clustering
import sknetwork.utils
from scipy.sparse import csr_matrix

In [2]:
# Download the data
!rm -f *.tsv.gz
!wget  https://datasets.imdbws.com/name.basics.tsv.gz
!wget  https://datasets.imdbws.com/title.principals.tsv.gz
!wget  https://datasets.imdbws.com/title.basics.tsv.gz
!wget  https://datasets.imdbws.com/title.akas.tsv.gz
!ls -la

In [3]:
# Load the titles (all title types; filtering to movies happens in a later cell).
# Built as a single chain so no column is assigned on a filtered slice
# (which raises SettingWithCopyWarning) and the cell is safe to re-run.
title = (
    pd.read_csv(
        'title.basics.tsv.gz',
        sep='\t',
        low_memory=False,
        # Only parse the columns that are actually used below.
        usecols=['tconst', 'titleType', 'primaryTitle', 'startYear'],
    )
    .set_index('tconst')
    # Missing years are stored as the literal string "\N"; coerce them
    # (and any other non-numeric value) to NaN.
    .assign(startYear=lambda df: pd.to_numeric(df['startYear'], errors='coerce'))
    # Consider only titles released in 2005 or later. NaN compares False,
    # so rows without a year are dropped here as well.
    .loc[lambda df: df['startYear'] >= 2005]
    # Every remaining year is a real number, so the int cast cannot fail.
    .astype({'startYear': 'int'})
    [['titleType', 'primaryTitle', 'startYear']]
)

title.head()

In [4]:
# Load the cast of each film.
# `usecols` keeps the parser from materialising the columns that were
# previously loaded and immediately discarded.
cast = pd.read_csv(
    'title.principals.tsv.gz',
    sep='\t',
    usecols=['tconst', 'nconst', 'category'],
)[['tconst', 'nconst', 'category']]
# Only consider actors, not directors, composers, etc. Shrinks data to about 40%
cast = cast[cast.category.isin({'actor', 'actress'})]
cast.head()

In [5]:
# Keep feature films only (no TV series, etc.), then restrict the cast table
# to credits on those films. Shrinks data to ~5%
movies = title.loc[title['titleType'].eq('movie')]
cast = cast.loc[cast['tconst'].isin(movies.index)]
# Preview of the (film, person) edges the network is built from
cast.head()

In [6]:
# Explore the regions we have data for (e.g. IN, US, etc).
# `region` is a Series of region codes indexed by titleId; a title can appear
# several times, once per alternative-title row.
region = pd.read_csv(
    'title.akas.tsv.gz',
    sep='\t',
    low_memory=False,
    # Only the two columns used below are parsed.
    usecols=['titleId', 'region'],
    # pandas treats the literal string "NA" as missing by default, which would
    # silently erase the region code "NA"; keep every value as-is.
    # The file's own missing marker "\N" stays a plain string, as before.
    keep_default_na=False,
).set_index('titleId')['region']
region.value_counts().head(10)

In [7]:
# Load the name data along with birth year ("\N" is read as missing).
# Only the columns that are kept are parsed.
name = pd.read_csv(
    'name.basics.tsv.gz',
    sep='\t',
    na_values='\\N',
    dtype={'birthYear': float},
    usecols=['nconst', 'primaryName', 'birthYear'],
).set_index('nconst')[['primaryName', 'birthYear']]
# Number of credits per person in `cast` as it stands when this cell runs
# (after the actor/actress and movie filters applied in the cells above).
name_freq = cast['nconst'].value_counts()

In [8]:
def get_pairs(lang=None, min_acted=25, min_pairings=4):
    '''
    Build the co-acting edge list between frequently credited actors.

    Reads the notebook-level objects `cast`, `region`, `name_freq` and `name`.

    Parameters:
    - lang: optional region code (IN, US, etc). When given, only titles with
      at least one entry in `region` equal to `lang` are considered.
    - min_acted: minimum number of credits an actor must have in `name_freq`.
      `name_freq` is computed from the whole `cast` table, so this threshold
      is not restricted to `lang`.
    - min_pairings: minimum number of films two actors must share.

    Returns a tuple (pairs, actors):
    - pairs: DataFrame with columns `row`, `col`, `n`. `row` and `col` are
      positions into `actors`; `n` is the number of shared films. The
      co-occurrence matrix is symmetric, so each pair appears in both orders.
    - actors: the rows of `name` for the selected actors, ordered so that
      position i corresponds to id i in `pairs`.
    '''
    graph = cast
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]
    top_names = name_freq[name_freq >= min_acted]
    top_actors = graph[graph['nconst'].isin(top_names.index)]

    # csr_matrix sums repeated (row, col) entries, so a person credited more
    # than once on the same film would inflate every count for that film.
    # Keep a single credit per (film, person).
    p = top_actors.drop_duplicates(subset=['tconst', 'nconst']).copy()
    p['title'] = p['tconst'].astype('category')
    p['name'] = p['nconst'].astype('category')
    actors = name.reindex(p['name'].cat.categories)

    # Nothing matched (e.g. unknown region): return an empty edge list instead
    # of failing when the sparse matrix shape cannot be inferred.
    if p.empty:
        no_ids = np.array([], dtype='int')
        return pd.DataFrame({'row': no_ids, 'col': no_ids, 'n': no_ids}), actors

    row = p['title'].cat.codes.values
    col = p['name'].cat.codes.values
    data = np.ones(len(p), dtype='int')

    # films x actors incidence matrix; its Gram matrix counts, for every pair
    # of actors, the films they both appear in.
    shape = (len(p['title'].cat.categories), len(p['name'].cat.categories))
    matrix = csr_matrix((data, (row, col)), shape=shape)
    square = matrix.T @ matrix
    # The diagonal holds each actor's own film count, not a pairing.
    square.setdiag(0)
    square = square.tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    pairs = pairs[pairs.n >= min_pairings].reset_index(drop=True)
    return pairs, actors

def lookup(pairs, cat):
    '''
    Replace the positional ids in `pairs` with actor details.

    - pairs: DataFrame with columns `row`, `col`, `n`, where `row`/`col` are
      positions into `cat`. It is expected to carry a 0..n-1 index, because
      the looked-up frames are re-indexed from zero before being joined.
    - cat: two-column DataFrame (name, birth year) addressed by position.

    Returns a DataFrame with columns `count`, `name1`, `year1`, `name2`,
    `year2`, sorted by `count` in descending order.
    '''
    first_actor = cat.iloc[pairs['row']].reset_index(drop=True)
    second_actor = cat.iloc[pairs['col']].reset_index(drop=True)

    merged = pd.concat([pairs, first_actor, second_actor], axis=1)
    merged = merged.drop(columns=['row', 'col'])
    merged.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return merged.sort_values('count', ascending=False)

In [None]:
# Co-acting pairs for titles that have an 'IN' region entry: actors need at
# least 3 credits in `name_freq`, and a pair needs at least 1 shared film.
pairs_in, cat_in = get_pairs(lang='IN', min_acted=3, min_pairings=1)
# Swap the positional ids for names and birth years, sorted by shared-film count.
# NOTE(review): get_pairs builds a symmetric co-occurrence matrix, so every
# pair is listed in both orders here — confirm that is intended for the export.
ForKumu = lookup(pairs_in, cat_in)
ForKumu

In [None]:
# Reduce the table to the three columns exported below: From, To, Strength.
# Renaming first and selecting by the new names keeps this cell idempotent:
# on a second run the rename is a no-op and the selection still succeeds,
# whereas selecting by the old names would raise KeyError.
ForKumu = ForKumu.rename(
    columns={'name1': 'From', 'name2': 'To', 'count': 'Strength'}
)[['From', 'To', 'Strength']]
ForKumu

In [None]:
# Write the edge list to pairs.csv without the index column.
# to_csv returns None when it is given a path, so `jp` is always None and the
# last line displays nothing; the name is kept only so existing references
# to it keep working.
jp = ForKumu.to_csv('pairs.csv', index=False)
jp