In [None]:
import numpy as np
import pandas as pd

In [None]:
# Download the data
!rm *.tsv.gz
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz
!ls -la

total 1050404
drwxr-xr-x 1 root root      4096 Jul 15 04:25 .
drwxr-xr-x 1 root root      4096 Jul 15 03:15 ..
drwxr-xr-x 4 root root      4096 Jul 13 13:42 .config
-rw-r--r-- 1 root root   5424123 Jul 15 03:21 movies.csv
-rw-r--r-- 1 root root 230369854 Jul 13 13:21 name.basics.tsv.gz
drwxr-xr-x 1 root root      4096 Jul 13 13:43 sample_data
-rw-r--r-- 1 root root 276711383 Jul 13 13:21 title.akas.tsv.gz
-rw-r--r-- 1 root root 158544953 Jul 14 13:21 title.basics.tsv.gz
-rw-r--r-- 1 root root 404534101 Jul 14 13:21 title.principals.tsv.gz


In [None]:
# Load the titles
# Only the id, type and start year are read; the literal \N is parsed as missing,
# which is why startYear uses the nullable Int64 dtype.
title = pd.read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    na_values="\\N",
    low_memory=True,
    usecols=['tconst', 'titleType', 'startYear'],
    dtype={'tconst': 'str', 'titleType': "str", 'startYear': 'Int64'},
).set_index('tconst')
# Keep rows typed "movie" with a start year of 2005 or later, then save them
title = title.loc[(title['titleType'] == "movie") & (title['startYear'] >= 2005)]
title.to_csv("movies.csv")

In [None]:
# Load the cast of each film
cast = pd.read_csv(
    'title.principals.tsv.gz',
    sep='\t',
    usecols=["tconst", "nconst", "category"],
)
# Only consider actors, not directors, composers, etc. Shrinks data to about 40%
# The category column is dropped once it has served as the filter.
cast = cast.loc[cast['category'].isin({'actor', 'actress'})].drop(columns='category')
cast.head()


Unnamed: 0,tconst,nconst
11,tt0000005,nm0443482
12,tt0000005,nm0653042
16,tt0000007,nm0179163
17,tt0000007,nm0183947
21,tt0000008,nm0653028


In [None]:
# Load the name data along with birth year
# Indexed by nconst so that actor ids can be looked up directly.
name = pd.read_csv(
    'name.basics.tsv.gz',
    sep='\t',
    na_values='\\N',
    usecols=['nconst', 'primaryName', 'birthYear'],
).set_index('nconst')

In [None]:
# Preview the first rows of the name table (indexed by nconst)
name.head()

Unnamed: 0_level_0,primaryName,birthYear
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1
nm0000001,Fred Astaire,1899.0
nm0000002,Lauren Bacall,1924.0
nm0000003,Brigitte Bardot,1934.0
nm0000004,John Belushi,1949.0
nm0000005,Ingmar Bergman,1918.0


In [None]:
from scipy.sparse import csr_matrix

In [None]:
def get_pairs(lang=None, min_acted=3, min_pairings=1):
    '''
    Returns an edge list of co-acting pairs and the matching actor lookup table.

    Reads the notebook-level frames `cast` (columns `tconst`, `nconst`) and
    `name` (indexed by `nconst`). When `lang` is not None it also reads
    `movies`, which therefore has to exist before the call.

    Parameters:
    - lang: switch only. Any non-None value (e.g. 'IN') restricts `cast` to the
      titles listed in `movies.tconst`; the value itself is never inspected, so
      any region filtering must already be reflected in `movies`.
    - min_acted: keep actors credited in at least this many distinct titles
      (counted after the optional `movies` restriction).
    - min_pairings: keep pairs that share at least this many titles.

    Returns (pairs, actors):
    - pairs: DataFrame with columns `row`, `col`, `n`. `row` and `col` are
      positions into `actors`, `n` is the number of shared titles. Each pair
      appears in both directions, (a, b) and (b, a).
    - actors: `name` reindexed to the kept `nconst` values, so position i
      describes actor code i. Ids missing from `name` become all-NaN rows.
    '''
    graph = cast
    if lang is not None:
        graph = graph[graph['tconst'].isin(movies.tconst)]
    # One credit per (title, actor): repeated rows would otherwise be summed
    # into the matrix and inflate both the film counts and the pair counts.
    graph = graph.drop_duplicates(subset=['tconst', 'nconst'])

    name_freq = graph['nconst'].value_counts()
    top_names = name_freq[name_freq >= min_acted]

    p = graph[graph['nconst'].isin(top_names.index)].copy()
    p['title'] = p['tconst'].astype('category')
    p['name'] = p['nconst'].astype('category')
    # Category order defines the integer actor codes used in `row` / `col`
    actors = name.reindex(p['name'].cat.categories)

    if p.empty:
        # csr_matrix cannot infer a shape from zero-length index arrays,
        # so return a well-formed empty edge list instead of crashing.
        no_pairs = pd.DataFrame({
            'row': np.array([], dtype='int32'),
            'col': np.array([], dtype='int32'),
            'n': np.array([], dtype='int'),
        })
        return no_pairs, actors

    row = p['title'].cat.codes.values
    col = p['name'].cat.codes.values
    data = np.ones(len(p), dtype='int')

    # titles x actors incidence matrix (1 where the actor is in the title)
    shape = (len(p['title'].cat.categories), len(p['name'].cat.categories))
    matrix = csr_matrix((data, (row, col)), shape=shape)
    # actors x actors: entry (i, j) is the number of titles shared by i and j
    square = matrix.T @ matrix
    # The diagonal holds each actor's own film count, not a pairing
    square.setdiag(0)
    # Drop the explicit zeros left by setdiag so self-pairs never reach the output
    square.eliminate_zeros()
    square = square.tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    pairs = pairs[pairs.n >= min_pairings].reset_index(drop=True)
    return pairs, actors

def lookup(pairs, cat):
    '''
    Replaces the integer actor codes in `pairs` with names and birth years.

    Parameters:
    - pairs: DataFrame with columns `row`, `col` (positions into `cat`) and
      `n` (pair count), as returned by get_pairs. Its index may be arbitrary.
    - cat: two-column actor table (name, birth year) addressed by position.

    Returns a DataFrame with columns count, name1, year1, name2, year2,
    sorted by count in descending order.
    '''
    # pd.concat(axis=1) aligns on index labels, so bring every piece onto the
    # same 0..n-1 index first; otherwise a filtered `pairs` yields NaN rows.
    edges = pairs.reset_index(drop=True)
    first = cat.iloc[edges['row'].to_numpy()].reset_index(drop=True)
    second = cat.iloc[edges['col'].to_numpy()].reset_index(drop=True)

    out = pd.concat([edges[['n']], first, second], axis=1)
    out.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return out.sort_values('count', ascending=False)

In [None]:
# Pairs with at least 3 shared titles, with no minimum film count per actor.
# NOTE(review): with lang set, get_pairs reads the global `movies`, which in the
# visible notebook is only assigned in a later cell. On a fresh kernel run top to
# bottom this call would raise NameError; run the `movies` cells first.
pairs, cat = get_pairs(lang='IN', min_acted=1, min_pairings=3)

In [None]:
# (rows, columns) of the actor lookup table returned by get_pairs
cat.shape

(7038, 2)

In [None]:
# Show the edge list: row / col are positions into `cat`, n is the pair count
pairs

Unnamed: 0,row,col,n
0,494,5,3
1,199,5,4
2,2696,5,5
3,89,5,7
4,22,5,4
...,...,...,...
403,7030,6955,12
404,1488,6955,18
405,2665,6958,3
406,6955,7030,12


In [None]:
# Resolve the integer actor codes into names and birth years, sorted by count
ForKumu = lookup(pairs, cat)
ForKumu

Unnamed: 0,count,name1,year1,name2,year2
251,18,Uttar Kumar,1973.0,Kavita Joshi,
404,18,Kavita Joshi,,Uttar Kumar,1973.0
399,12,Dev Sharma,,Uttar Kumar,1973.0
240,12,Uttar Kumar,1973.0,Dev Sharma,
406,12,Uttar Kumar,1973.0,Raju Maan,
...,...,...,...,...,...
91,3,Anil Nagrath,,Ramesh Goyal,
233,3,John Abraham,1972.0,Lara Dutta,1978.0
90,3,Vinod Tripathi,,Ramesh Goyal,
235,3,Monu Dhankad,,Vikas Balian,


In [None]:
# Build the From / To / Strength edge list straight from `lookup` rather than
# re-slicing `ForKumu`: the cell is then idempotent, whereas overwriting the
# frame with renamed columns made a second run fail with KeyError on 'name1'.
ForKumu = (
    lookup(pairs, cat)[['name1', 'name2', 'count']]
    .rename(columns={'name1': 'From',
                     'name2': 'To',
                     'count': 'Strength'})
)
ForKumu

Unnamed: 0,From,To,Strength
251,Uttar Kumar,Kavita Joshi,18
404,Kavita Joshi,Uttar Kumar,18
399,Dev Sharma,Uttar Kumar,12
240,Uttar Kumar,Dev Sharma,12
406,Uttar Kumar,Raju Maan,12
...,...,...,...
91,Anil Nagrath,Ramesh Goyal,3
233,John Abraham,Lara Dutta,3
90,Vinod Tripathi,Ramesh Goyal,3
235,Monu Dhankad,Vikas Balian,3


In [None]:
# Write the edge list to an Excel workbook, leaving out the DataFrame index
ForKumu.to_excel("pairs.xlsx", index = False)

In [None]:
# Read the title table again, this time keeping originalTitle and leaving
# tconst as a regular column; the literal \N is parsed as missing.
movies = pd.read_csv(
    'title.basics.tsv.gz',
    sep="\t",
    na_values="\\N",
    usecols=["tconst", "titleType", "originalTitle", "startYear"],
    dtype={
        "tconst": "str",
        'titleType': 'str',
        'originalTitle': 'str',
        'startYear': 'Int64',
    },
)

Unnamed: 0,tconst,titleType,originalTitle,startYear
0,tt0000001,short,Carmencita,1894
1,tt0000002,short,Le clown et ses chiens,1892
2,tt0000003,short,Pauvre Pierrot,1892
3,tt0000004,short,Un bon bock,1892
4,tt0000005,short,Blacksmith Scene,1893


In [None]:
# Spreadsheet whose column `D` is matched against tconst in a later cell
# (presumably a curated list of Indian titles; verify against the file).
# NOTE(review): the absolute /content path looks Colab-specific; confirm the
# file location before running elsewhere.
indian_titles = pd.read_excel("/content/India_movies_cleaned.xlsx")

In [None]:
# Keep titles that are listed in column D of the spreadsheet, started after
# 2004 and are typed "movie"
movies = movies.loc[
    movies['tconst'].isin(indian_titles['D'])
    & (movies['startYear'] > 2004)
    & (movies['titleType'] == "movie")
]

In [None]:
# (rows, columns) of the filtered movie table
movies.shape

(3747, 4)