In [226]:
import os.path
import pandas as pd


def select_plots(output_file, min_year=1990, max_rows=1000, seed=42):
    folder = 'raw_data'
    file = 'wiki_movie_plots_deduped.csv'
    csv_path = os.path.join(folder, file)
    
    df = pd.read_csv(csv_path)
    
    df = df[df['Release Year'] > min_year]
    df = df[df['Director'] != 'Unknown']
    
    countries = ['American', 'British', 'Japanese', 'Canadian']
    genres = ['drama', 'comedy', 'romance', 'action', 'romantic comedy', 'comedy-drama', 'crime drama', 'thriller', 'science fiction']
    
    df = df[df['Genre'].isin(genres)]
    df = df[df['Origin/Ethnicity'].isin(countries)]
    
    # count director -> film count
    film_count = df.groupby(['Director']).count().Title.to_dict()
    # map director -> film count
    df['movies'] = df['Director'].map(film_count)
    
    df = df.sort_values('movies', ascending=False)[:max_rows]
    
    # for compatibility with current code
    df.rename(columns={
        'Title': 'title',
        'Director': 'author',
        'Wiki Page': 'link',
        'Plot': 'content'
    })

    df.to_csv(output_file, header=True, index=False)
    print('articles saved to', output_file)
    return df

In [225]:
select_plots('data/plots.csv')

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,movies
14716,2005,Match Point,American,Woody Allen,"Jonathan Rhys Meyers, Scarlett Johansson, Emil...",crime drama,https://en.wikipedia.org/wiki/Match_Point,"Chris Wilton, a recently retired tennis profes...",18
16454,2013,Blue Jasmine,American,Woody Allen,"Alec Baldwin, Cate Blanchett, Louis C.K., Bobb...",drama,https://en.wikipedia.org/wiki/Blue_Jasmine,Jasmine Francis (Cate Blanchett) disembarks in...,18
15171,2007,Cassandra's Dream,American,Woody Allen,"Hayley Atwell, Colin Farrell, Sally Hawkins, E...",drama,https://en.wikipedia.org/wiki/Cassandra%27s_Dream,Brothers Terry (Colin Farrell) and Ian (Ewan M...,18
11811,1992,Husbands and Wives,American,Woody Allen,"Woody Allen, Mia Farrow, Judy Davis, Sydney Po...",drama,https://en.wikipedia.org/wiki/Husbands_and_Wives,The film is about two couples: Jack (Pollack) ...,18
13740,2000,Small Time Crooks,American,Woody Allen,"Woody Allen, Hugh Grant, Tracey Ullman",comedy,https://en.wikipedia.org/wiki/Small_Time_Crooks,Career criminal Ray (Woody Allen) and his cron...,18
...,...,...,...,...,...,...,...,...,...
12291,1994,Renaissance Man,American,Penny Marshall,"Danny DeVito, Gregory Hines, Mark Wahlberg, Cl...",comedy,https://en.wikipedia.org/wiki/Renaissance_Man_...,Bill Rago (DeVito) is a divorced advertising e...,4
12304,1994,Serial Mom,American,John Waters,"Kathleen Turner, Sam Waterston, Ricki Lake, Ma...",comedy,https://en.wikipedia.org/wiki/Serial_Mom,Beverly Sutphin appears to be a typical suburb...,4
12305,1994,The Shadow,American,Russell Mulcahy,"Alec Baldwin, John Lone, Penelope Ann Miller",action,https://en.wikipedia.org/wiki/The_Shadow_(1994...,"In Tibet, following the First World War, an Am...",4
16298,2012,Hope Springs,American,David Frankel,"Meryl Streep, Tommy Lee Jones, Steve Carell",comedy,https://en.wikipedia.org/wiki/Hope_Springs_(20...,"Although a devoted couple, empty nesters Kay a...",4


In [217]:
folder = 'raw_data'
file = 'wiki_movie_plots_deduped.csv'
csv_path = os.path.join(folder, file)

df = pd.read_csv(csv_path)

df = df[df['Release Year'] > min_year]
df = df[df['Director'] != 'Unknown']

In [104]:
df.groupby(['Director']).count().sort_values('Title', ascending=False)

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Cast,Genre,Wiki Page,Plot
Director,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P. Vasu,35,35,35,35,35,35,35
Priyadarshan,32,32,32,32,32,32,32
K. S. Ravikumar,29,29,29,29,29,29,29
Ram Gopal Varma,28,28,28,28,28,28,28
David Dhawan,27,27,27,27,27,27,27
...,...,...,...,...,...,...,...
Jameson Lam,1,1,1,1,1,1,1
James Yukich,1,1,1,1,1,1,1
James Watkins,1,1,1,1,1,1,1
James Tucker,1,1,1,1,1,1,1


In [105]:
df.groupby(['Genre']).count().sort_values('Title', ascending=False).nlargest(10, 'Title')

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Director,Cast,Wiki Page,Plot
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
unknown,3218,3218,3218,3218,3071,3218,3218
drama,2320,2320,2320,2320,2275,2320,2320
comedy,1719,1719,1719,1719,1706,1719,1719
action,753,753,753,753,747,753,753
romance,650,650,650,650,647,650,650
thriller,580,580,580,580,576,580,580
horror,573,573,573,573,557,573,573
romantic comedy,255,255,255,255,251,255,255
crime drama,208,208,208,208,208,208,208
science fiction,183,183,183,183,175,183,183


In [106]:
df.groupby(['Origin/Ethnicity']).count().sort_values('Title', ascending=False).nlargest(10, 'Title')

Unnamed: 0_level_0,Release Year,Title,Director,Cast,Genre,Wiki Page,Plot
Origin/Ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
American,5740,5740,5740,5683,5740,5740,5740
Tamil,1569,1569,1569,1539,1569,1569,1569
Bollywood,1432,1432,1432,1427,1432,1432,1432
British,1163,1163,1163,1137,1163,1163,1163
Malayalam,1059,1059,1059,1056,1059,1059,1059
Telugu,861,861,861,861,861,861,861
Japanese,693,693,693,536,693,693,693
Canadian,533,533,533,511,533,533,533
Chinese,456,456,456,436,456,456,456
Hong Kong,448,448,448,407,448,448,448


In [124]:
countries = ['American', 'British', 'Japanese', 'Canadian']

In [125]:
genres = ['drama', 'comedy', 'romance', 'action', 'romantic comedy', 'comedy-drama', 'crime drama', 'thriller', 'science fiction']

In [126]:
df = df[df['Genre'].isin(genres)]

In [127]:
df = df[df['Origin/Ethnicity'].isin(countries)]

In [128]:
df.shape

(4021, 8)

In [193]:
df.groupby(['Director']).count().sort_values('Title', ascending=False)

Unnamed: 0_level_0,Release Year,Title,Origin/Ethnicity,Cast,Genre,Wiki Page,Plot,movies
Director,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Woody Allen,18,18,18,18,18,18,18,0
Dennis Dugan,12,12,12,12,12,12,12,0
Garry Marshall,10,10,10,10,10,10,10,0
Clint Eastwood,10,10,10,10,10,10,10,0
Spike Lee,10,10,10,10,10,10,10,0
...,...,...,...,...,...,...,...,...
James Wong,1,1,1,1,1,1,1,0
James Wan,1,1,1,1,1,1,1,0
James Rogers,1,1,1,1,1,1,1,0
James Ricardo,1,1,1,1,1,1,1,0


In [194]:
# director -> film count
film_count = df.groupby(['Director']).count().Title.to_dict()

In [202]:
# map director with film count
df['movies'] = df['Director'].map(film_count)

In [212]:
max_rows = 1000

In [214]:
df.sort_values('movies', ascending=False)[:max_rows]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,movies
14716,2005,Match Point,American,Woody Allen,"Jonathan Rhys Meyers, Scarlett Johansson, Emil...",crime drama,https://en.wikipedia.org/wiki/Match_Point,"Chris Wilton, a recently retired tennis profes...",18
16454,2013,Blue Jasmine,American,Woody Allen,"Alec Baldwin, Cate Blanchett, Louis C.K., Bobb...",drama,https://en.wikipedia.org/wiki/Blue_Jasmine,Jasmine Francis (Cate Blanchett) disembarks in...,18
15171,2007,Cassandra's Dream,American,Woody Allen,"Hayley Atwell, Colin Farrell, Sally Hawkins, E...",drama,https://en.wikipedia.org/wiki/Cassandra%27s_Dream,Brothers Terry (Colin Farrell) and Ian (Ewan M...,18
11811,1992,Husbands and Wives,American,Woody Allen,"Woody Allen, Mia Farrow, Judy Davis, Sydney Po...",drama,https://en.wikipedia.org/wiki/Husbands_and_Wives,The film is about two couples: Jack (Pollack) ...,18
13740,2000,Small Time Crooks,American,Woody Allen,"Woody Allen, Hugh Grant, Tracey Ullman",comedy,https://en.wikipedia.org/wiki/Small_Time_Crooks,Career criminal Ray (Woody Allen) and his cron...,18
...,...,...,...,...,...,...,...,...,...
12291,1994,Renaissance Man,American,Penny Marshall,"Danny DeVito, Gregory Hines, Mark Wahlberg, Cl...",comedy,https://en.wikipedia.org/wiki/Renaissance_Man_...,Bill Rago (DeVito) is a divorced advertising e...,4
12304,1994,Serial Mom,American,John Waters,"Kathleen Turner, Sam Waterston, Ricki Lake, Ma...",comedy,https://en.wikipedia.org/wiki/Serial_Mom,Beverly Sutphin appears to be a typical suburb...,4
12305,1994,The Shadow,American,Russell Mulcahy,"Alec Baldwin, John Lone, Penelope Ann Miller",action,https://en.wikipedia.org/wiki/The_Shadow_(1994...,"In Tibet, following the First World War, an Am...",4
16298,2012,Hope Springs,American,David Frankel,"Meryl Streep, Tommy Lee Jones, Steve Carell",comedy,https://en.wikipedia.org/wiki/Hope_Springs_(20...,"Although a devoted couple, empty nesters Kay a...",4
