In [31]:
import os.path
import pandas as pd


def select_plots(output_file, min_year=1990, max_rows=1000, seed=42,
                countries=('American', 'British'),
                genres=('drama', 'comedy', 'romance', 'action', 'romantic comedy', 'animation', 'crime drama', 'fantasy', 'science fiction')
                ):
    folder = 'raw_data'
    file = 'wiki_movie_plots_deduped.csv'
    csv_path = os.path.join(folder, file)
    
    df = pd.read_csv(csv_path)
    
    df = df[df['Release Year'] > min_year]
    df = df[df['Director'] != 'Unknown']
    
    df = df[df['Genre'].isin(genres)]
    df = df[df['Origin/Ethnicity'].isin(countries)]
    
    # count director -> film count
    film_count = df.groupby(['Director']).count().Title.to_dict()
    # map director -> film count
    df['movies'] = df['Director'].map(film_count)
    
    df = df.sort_values('movies', ascending=False)[:max_rows]
    
    # for compatibility with current code
    df = df.rename(columns={
        'Title': 'title',
        'Director': 'author',
        'Wiki Page': 'link',
        'Plot': 'content'
    })

    df.to_csv(output_file, header=True, index=False)
    print('articles saved to', output_file)
    return df

In [32]:
select_plots('data/plots.csv')

articles saved to data/plots.csv


Unnamed: 0,Release Year,title,Origin/Ethnicity,author,Cast,Genre,link,content,movies
14716,2005,Match Point,American,Woody Allen,"Jonathan Rhys Meyers, Scarlett Johansson, Emil...",crime drama,https://en.wikipedia.org/wiki/Match_Point,"Chris Wilton, a recently retired tennis profes...",18
11811,1992,Husbands and Wives,American,Woody Allen,"Woody Allen, Mia Farrow, Judy Davis, Sydney Po...",drama,https://en.wikipedia.org/wiki/Husbands_and_Wives,The film is about two couples: Jack (Pollack) ...,18
12030,1993,Manhattan Murder Mystery,American,Woody Allen,"Diane Keaton, Anjelica Huston, Alan Alda, Wood...",comedy,https://en.wikipedia.org/wiki/Manhattan_Murder...,Larry Lipton (Woody Allen) and his wife Carol ...,18
14068,2002,Hollywood Ending,American,Woody Allen,"Woody Allen, George Hamilton, Téa Leoni, Debra...",comedy,https://en.wikipedia.org/wiki/Hollywood_Ending,Val Waxman is a once prestigious film director...,18
12502,1995,Mighty Aphrodite,American,Woody Allen,"Woody Allen, Mira Sorvino, Helena Bonham Carte...",comedy,https://en.wikipedia.org/wiki/Mighty_Aphrodite,The film opens on ancient Greek ruins where a ...,18
...,...,...,...,...,...,...,...,...,...
11962,1993,Cannibal! The Musical,American,Trey Parker,"Trey Parker, Matt Stone, Dian Bachar, Toddy Wa...",comedy,https://en.wikipedia.org/wiki/Cannibal!_The_Mu...,The film begins with a reenactment of the grue...,3
13780,2000,The Yards,American,James Gray,"Mark Wahlberg, Charlize Theron, Joaquin Phoeni...",crime drama,https://en.wikipedia.org/wiki/The_Yards,Leo Handler (Mark Wahlberg) rides the subway t...,3
14683,2005,In Her Shoes,American,Curtis Hanson,"Cameron Diaz, Toni Collette, Shirley MacLaine",comedy,https://en.wikipedia.org/wiki/In_Her_Shoes_(20...,Maggie (Cameron Diaz) and Rose Feller (Toni Co...,3
12057,1993,The Real McCoy,American,Russell Mulcahy,"Kim Basinger, Val Kilmer, Terence Stamp",crime drama,https://en.wikipedia.org/wiki/The_Real_McCoy_(...,Karen McCoy (Kim Basinger) is released from pr...,3


In [66]:
raw_df = pd.read_csv(csv_path)

In [67]:
raw_df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


In [47]:
folder = 'raw_data'
file = 'wiki_movie_plots_deduped.csv'
min_year = 1990
csv_path = os.path.join(folder, file)

df = pd.read_csv(csv_path)

df = df[df['Release Year'] > min_year]
df = df[df['Director'] != 'Unknown']

In [74]:
raw_df.groupby(['Genre']).count().sort_values('Title', ascending=False).nlargest(15, 'Title').index

Index(['unknown', 'drama', 'comedy', 'horror', 'action', 'thriller', 'romance',
       'western', 'crime', 'adventure', 'musical', 'crime drama',
       'romantic comedy', 'science fiction', 'film noir'],
      dtype='object', name='Genre')

In [5]:
df.groupby(['Director']).count().sort_values('Title', ascending=False)

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Cast,Genre,Wiki Page,Plot
Director,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P. Vasu,35,35,35,35,35,35,35
Priyadarshan,32,32,32,32,32,32,32
K. S. Ravikumar,29,29,29,29,29,29,29
Ram Gopal Varma,28,28,28,28,28,28,28
David Dhawan,27,27,27,27,27,27,27
...,...,...,...,...,...,...,...
Jameson Lam,1,1,1,1,1,1,1
James Yukich,1,1,1,1,1,1,1
James Watkins,1,1,1,1,1,1,1
James Tucker,1,1,1,1,1,1,1


In [10]:
df.groupby(['Genre']).count().sort_values('Title', ascending=False).nlargest(15, 'Title')

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Director,Cast,Wiki Page,Plot
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
unknown,3218,3218,3218,3218,3071,3218,3218
drama,2320,2320,2320,2320,2275,2320,2320
comedy,1719,1719,1719,1719,1706,1719,1719
action,753,753,753,753,747,753,753
romance,650,650,650,650,647,650,650
thriller,580,580,580,580,576,580,580
horror,573,573,573,573,557,573,573
romantic comedy,255,255,255,255,251,255,255
crime drama,208,208,208,208,208,208,208
science fiction,183,183,183,183,175,183,183


In [11]:
genres = ['drama', 'comedy', 'romance', 'action', 'romantic comedy', 'animation', 'crime drama', 'fantasy', 'science fiction']

In [18]:
df.groupby(['Origin/Ethnicity']).count().sort_values('Title', ascending=False).nlargest(10, 'Title')

Unnamed: 0_level_0,Release Year,Title,Director,Cast,Genre,Wiki Page,Plot
Origin/Ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
American,5740,5740,5740,5683,5740,5740,5740
Tamil,1569,1569,1569,1539,1569,1569,1569
Bollywood,1432,1432,1432,1427,1432,1432,1432
British,1163,1163,1163,1137,1163,1163,1163
Malayalam,1059,1059,1059,1056,1059,1059,1059
Telugu,861,861,861,861,861,861,861
Japanese,693,693,693,536,693,693,693
Canadian,533,533,533,511,533,533,533
Chinese,456,456,456,436,456,456,456
Hong Kong,448,448,448,407,448,448,448


In [20]:
countries = ['American', 'British']
# countries = ['American', 'British', 'Japanese', 'Canadian']

In [49]:
df = df[df['Genre'].isin(genres)]

In [50]:
df = df[df['Origin/Ethnicity'].isin(countries)]

In [75]:
df[df.Genre=='animation'].sample(frac=1)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
15467,2008,Kung Fu Panda,American,"Mark Osborne, John Wayne Stevenson","Jack Black, Jackie Chan, Dustin Hoffman, Angel...",animation,https://en.wikipedia.org/wiki/Kung_Fu_Panda,"In the Valley of Peace, a land in ancient Chin..."
15334,2007,Shrek the Third,American,"Chris Miller, Raman Hui","Mike Myers, Eddie Murphy, Cameron Diaz",animation,https://en.wikipedia.org/wiki/Shrek_the_Third,Prince Charming vows that he will become King ...
13932,2001,Recess: School's Out,American,Chuck Sheetz,"Rickey D'Shon Collins, Jason Davis, Paul Willson",animation,https://en.wikipedia.org/wiki/Recess:_School%2...,The movie begins with a prologue set in a U.S ...
15310,2007,Persepolis,American,"Marjane Satrapi, Vincent Paronnaud","Chiara Mastroianni, Catherine Deneuve, Daniell...",animation,https://en.wikipedia.org/wiki/Persepolis_(film),"At an airport in France, an Iranian woman, Mar..."
14771,2005,Thru the Moebius Strip,American,"Glenn Chaika, Kelvin Lee","Andrea Miller, Michelle Ruff, Mark Hamill",animation,https://en.wikipedia.org/wiki/Thru_the_Moebius...,The story is about the coming of age of a 14-y...
...,...,...,...,...,...,...,...,...
15055,2006,A Scanner Darkly,American,Richard Linklater,"Keanu Reeves, Robert Downey, Jr., Woody Harrel...",animation,https://en.wikipedia.org/wiki/A_Scanner_Darkly...,The United States has lost the war on drugs. S...
14987,2006,Live Freaky! Die Freaky!,American,John Roecker,"Jason Schmidt, Tim Armstrong",animation,https://en.wikipedia.org/wiki/Live_Freaky!_Die...,The film starts out with a futuristic Nomad fr...
14959,2006,Ice Age: The Meltdown,American,Carlos Saldanha,"Ray Romano, John Leguizamo, Denis Leary, Queen...",animation,https://en.wikipedia.org/wiki/Ice_Age:_The_Mel...,"In the opening scene, Scrat, the saber-toothed..."
20587,1993,The Wrong Trousers,British,Nick Park,Peter Sallis,animation,https://en.wikipedia.org/wiki/The_Wrong_Trousers,Eccentric inventor Wallace (Peter Sallis) has ...


In [23]:
df.shape

(3376, 8)

In [24]:
df.groupby(['Director']).count().sort_values('Title', ascending=False)

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Cast,Genre,Wiki Page,Plot
Director,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Woody Allen,18,18,18,18,18,18,18
Dennis Dugan,12,12,12,12,12,12,12
Michael Winterbottom,10,10,10,10,10,10,10
Clint Eastwood,10,10,10,10,10,10,10
Garry Marshall,9,9,9,9,9,9,9
...,...,...,...,...,...,...,...
James Rogers,1,1,1,1,1,1,1
James Ricardo,1,1,1,1,1,1,1
James Ponsoldt,1,1,1,1,1,1,1
James Orr,1,1,1,1,1,1,1


In [25]:
# director -> film count
film_count = df.groupby(['Director']).count().Title.to_dict()

In [26]:
# map director with film count
df['movies'] = df['Director'].map(film_count)

In [28]:
max_rows = 1000

In [29]:
df.sort_values('movies', ascending=False)[:max_rows]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,movies
14716,2005,Match Point,American,Woody Allen,"Jonathan Rhys Meyers, Scarlett Johansson, Emil...",crime drama,https://en.wikipedia.org/wiki/Match_Point,"Chris Wilton, a recently retired tennis profes...",18
11811,1992,Husbands and Wives,American,Woody Allen,"Woody Allen, Mia Farrow, Judy Davis, Sydney Po...",drama,https://en.wikipedia.org/wiki/Husbands_and_Wives,The film is about two couples: Jack (Pollack) ...,18
12030,1993,Manhattan Murder Mystery,American,Woody Allen,"Diane Keaton, Anjelica Huston, Alan Alda, Wood...",comedy,https://en.wikipedia.org/wiki/Manhattan_Murder...,Larry Lipton (Woody Allen) and his wife Carol ...,18
14068,2002,Hollywood Ending,American,Woody Allen,"Woody Allen, George Hamilton, Téa Leoni, Debra...",comedy,https://en.wikipedia.org/wiki/Hollywood_Ending,Val Waxman is a once prestigious film director...,18
12502,1995,Mighty Aphrodite,American,Woody Allen,"Woody Allen, Mira Sorvino, Helena Bonham Carte...",comedy,https://en.wikipedia.org/wiki/Mighty_Aphrodite,The film opens on ancient Greek ruins where a ...,18
...,...,...,...,...,...,...,...,...,...
11962,1993,Cannibal! The Musical,American,Trey Parker,"Trey Parker, Matt Stone, Dian Bachar, Toddy Wa...",comedy,https://en.wikipedia.org/wiki/Cannibal!_The_Mu...,The film begins with a reenactment of the grue...,3
13780,2000,The Yards,American,James Gray,"Mark Wahlberg, Charlize Theron, Joaquin Phoeni...",crime drama,https://en.wikipedia.org/wiki/The_Yards,Leo Handler (Mark Wahlberg) rides the subway t...,3
14683,2005,In Her Shoes,American,Curtis Hanson,"Cameron Diaz, Toni Collette, Shirley MacLaine",comedy,https://en.wikipedia.org/wiki/In_Her_Shoes_(20...,Maggie (Cameron Diaz) and Rose Feller (Toni Co...,3
12057,1993,The Real McCoy,American,Russell Mulcahy,"Kim Basinger, Val Kilmer, Terence Stamp",crime drama,https://en.wikipedia.org/wiki/The_Real_McCoy_(...,Karen McCoy (Kim Basinger) is released from pr...,3
