Python script manipulating publicly available IMDB name_basics dataset in order to fit COP3530 Project 3 parameters. 

Available here: https://datasets.imdbws.com/

In [2]:
#importing useful libraries
import pandas as pd
import numpy as np

In [3]:
#read imdb into dataframe 
name_basics = pd.read_csv("name.basics.tsv/data.tsv", sep='\t')
name_basics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0072308,tt0050419,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0071877,tt0037382,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0049189,tt0054452,tt0056404,tt0057345"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0060827,tt0050986,tt0050976"


In [4]:
#initial shape of dataframe
print(name_basics.shape)

(11365349, 6)


In [5]:
#only keep entries with valid birth years
name_basics = name_basics[name_basics['birthYear'] != '\\N']
print(name_basics.shape)

(537394, 6)


In [6]:
#casting bday to int for future comparison
print(name_basics['birthYear'].dtypes)
name_basics['birthYear'] = name_basics['birthYear'].astype(int)
print(name_basics['birthYear'].dtypes)

object
int32


In [7]:
#filter actors by age
name_basics = name_basics[name_basics['birthYear'] > 1950]
name_basics.shape

(305660, 6)

In [8]:
#cast popular appearances to string
print(name_basics['knownForTitles'].dtypes)
name_basics['knownForTitles'] = name_basics['knownForTitles'].astype(str)
print(name_basics['knownForTitles'].dtypes)

object
object


In [9]:
#filter out those with no notable titles
name_basics = name_basics[name_basics['knownForTitles'] != '\\N']
print(name_basics.shape)

(297830, 6)


In [10]:
#keep only those who are primarily actors/actresses
name_basics['primaryProfession'] = name_basics['primaryProfession'].str.slice(0,3)
name_basics = name_basics[name_basics['primaryProfession'] == 'act']
print(name_basics.shape)

(164604, 6)


In [11]:
#filter out actors/actresses with less than 4 notable appearances
name_basics['len'] = name_basics['knownForTitles'].str.len()
name_basics = name_basics[name_basics['len'] == np.int64(39)]
print(name_basics.shape)

(99095, 7)


In [12]:
#top of dataframe
name_basics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,len
28,nm0000029,Margaux Hemingway,1954,1996,act,"tt0102122,tt0074802,tt0110138,tt0077800",39
83,nm0000084,Gong Li,1965,\N,act,"tt0101640,tt0397535,tt0473444,tt0430357",39
86,nm0000087,Elena Koreneva,1953,\N,act,"tt7529350,tt0080912,tt0122969,tt5847740",39
92,nm0000093,Brad Pitt,1963,\N,act,"tt1210166,tt2935510,tt0114746,tt0356910",39
95,nm0000096,Gillian Anderson,1968,\N,act,"tt0106179,tt0442632,tt2294189,tt0455590",39


In [13]:
#bottom of dataframe
name_basics.tail()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,len
11331959,nm9956087,Kiara Cole,1999,\N,act,"tt4381512,tt5917524,tt4384468,tt5946088",39
11332770,nm9956985,Mariya Krylova,1994,\N,act,"tt5665954,tt8668488,tt3447270,tt5148210",39
11342666,nm9968037,Takashi Yamanaka,1978,\N,act,"tt2412560,tt1542840,tt3345472,tt1244666",39
11354556,nm9981451,Jessica Night,1992,\N,act,"tt9501928,tt6642458,tt6669116,tt9497336",39
11357932,nm9985332,Michael Iacono,2005,\N,act,"tt8110232,tt4158110,tt2372162,tt8591260",39


In [14]:
#filter out dead actors
name_basics = name_basics[name_basics['deathYear'] == '\\N']

In [15]:
#drop irrelevant columns
name_basics.drop('nconst', axis=1, inplace=True)
name_basics.drop('birthYear', axis=1, inplace=True)
name_basics.drop('deathYear', axis=1, inplace=True)
name_basics.drop('primaryProfession', axis=1, inplace=True)
name_basics.drop('len', axis=1, inplace=True)

print(name_basics.shape)

(93968, 2)


In [16]:
#updated top of dataframe
name_basics.head()

Unnamed: 0,primaryName,knownForTitles
83,Gong Li,"tt0101640,tt0397535,tt0473444,tt0430357"
86,Elena Koreneva,"tt7529350,tt0080912,tt0122969,tt5847740"
92,Brad Pitt,"tt1210166,tt2935510,tt0114746,tt0356910"
95,Gillian Anderson,"tt0106179,tt0442632,tt2294189,tt0455590"
96,Pamela Anderson,"tt0426592,tt0115624,tt0267913,tt0306047"


In [17]:
#write updated database to csv
name_basics.to_csv('graph.csv', index=False, header=True)