Python script manipulating publicly available IMDB name_basics dataset in order to fit COP3530 Project 3 parameters. 

Available here: https://datasets.imdbws.com/

In [83]:
#importing useful libraries
import pandas as pd
import numpy as np

In [84]:
#read imdb into dataframe 
name_basics = pd.read_csv("name.basics.tsv/data.tsv", sep='\t')
name_basics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0072308,tt0050419,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0071877,tt0037382,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0049189,tt0054452,tt0056404,tt0057345"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0060827,tt0050986,tt0050976"


In [85]:
#initial shape of dataframe
print(name_basics.shape)

(11365349, 6)


In [86]:
#only keep entries with valid birth years
name_basics = name_basics[name_basics['birthYear'] != '\\N']
print(name_basics.shape)

(537394, 6)


In [87]:
#casting bday to int for future comparison
print(name_basics['birthYear'].dtypes)
name_basics['birthYear'] = name_basics['birthYear'].astype(int)
print(name_basics['birthYear'].dtypes)

object
int32


In [88]:
#filter out "ancienct" actors
name_basics = name_basics[name_basics['birthYear'] > 1899]
name_basics.shape

(498336, 6)

In [89]:
#cast popular appearances to string
print(name_basics['knownForTitles'].dtypes)
name_basics['knownForTitles'] = name_basics['knownForTitles'].astype(str)
print(name_basics['knownForTitles'].dtypes)

object
object


In [90]:
#filter out those with no notable titles
name_basics = name_basics[name_basics['knownForTitles'] != '\\N']
print(name_basics.shape)

(486449, 6)


In [91]:
#keep only those who are primarily actors/actresses
name_basics['primaryProfession'] = name_basics['primaryProfession'].str.slice(0,3)
name_basics = name_basics[name_basics['primaryProfession'] == 'act']
print(name_basics.shape)

(261105, 6)


In [92]:
#filter out actors/actresses with less than 4 notable appearances
name_basics['len'] = name_basics['knownForTitles'].str.len()
name_basics = name_basics[name_basics['len'] == np.int64(39)]
print(name_basics.shape)

(168330, 7)


In [93]:
#top of dataframe
name_basics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,len
1,nm0000002,Lauren Bacall,1924,2014,act,"tt0038355,tt0071877,tt0037382,tt0117057",39
2,nm0000003,Brigitte Bardot,1934,\N,act,"tt0049189,tt0054452,tt0056404,tt0057345",39
3,nm0000004,John Belushi,1949,1982,act,"tt0072562,tt0077975,tt0080455,tt0078723",39
5,nm0000006,Ingrid Bergman,1915,1982,act,"tt0038109,tt0077711,tt0036855,tt0034583",39
7,nm0000008,Marlon Brando,1924,2004,act,"tt0078788,tt0068646,tt0047296,tt0070849",39


In [94]:
#bottom of dataframe
name_basics.tail()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,len
11342666,nm9968037,Takashi Yamanaka,1978,\N,act,"tt2412560,tt1542840,tt3345472,tt1244666",39
11354154,nm9981009,Lynn Taylor,1938,\N,act,"tt0054518,tt0058846,tt0129685,tt0196257",39
11354254,nm9981115,Anatoliy Vasilyev,1945,\N,act,"tt0252016,tt5332324,tt8715760,tt0308680",39
11354556,nm9981451,Jessica Night,1992,\N,act,"tt9501928,tt6642458,tt6669116,tt9497336",39
11357932,nm9985332,Michael Iacono,2005,\N,act,"tt8110232,tt4158110,tt2372162,tt8591260",39


In [95]:
#drop irrelevant columns
name_basics.drop('nconst', axis=1, inplace=True)
name_basics.drop('birthYear', axis=1, inplace=True)
name_basics.drop('deathYear', axis=1, inplace=True)
name_basics.drop('primaryProfession', axis=1, inplace=True)
name_basics.drop('len', axis=1, inplace=True)

print(name_basics.shape)

(168330, 2)


In [96]:
#updated top of dataframe
name_basics.head()

Unnamed: 0,primaryName,knownForTitles
1,Lauren Bacall,"tt0038355,tt0071877,tt0037382,tt0117057"
2,Brigitte Bardot,"tt0049189,tt0054452,tt0056404,tt0057345"
3,John Belushi,"tt0072562,tt0077975,tt0080455,tt0078723"
5,Ingrid Bergman,"tt0038109,tt0077711,tt0036855,tt0034583"
7,Marlon Brando,"tt0078788,tt0068646,tt0047296,tt0070849"


In [97]:
#write updated database to csv
name_basics.to_csv('graph.csv', index=False, header=True)