In [2]:
import pandas as pd
import numpy as np

In [3]:
# The path to our CSV file
file = "Resources/IMDb movies.csv"

# Read our Kickstarter data into pandas
movies_df = pd.read_csv(file, low_memory=False)
len(movies_df)

85855

In [4]:
# The path to our CSV file
file = "Resources/IMDb ratings.csv"

# Read our Kickstarter data into pandas
ratings_df = pd.read_csv(file)
len(ratings_df)

85855

In [5]:
merged_df = pd.merge(movies_df, ratings_df, on='imdb_title_id')
len(merged_df)

85855

In [6]:
merged_df.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics', 'weighted_average_vote',
       'total_votes', 'mean_vote', 'median_vote', 'votes_10', 'votes_9',
       'votes_8', 'votes_7', 'votes_6', 'votes_5', 'votes_4', 'votes_3',
       'votes_2', 'votes_1', 'allgenders_0age_avg_vote',
       'allgenders_0age_votes', 'allgenders_18age_avg_vote',
       'allgenders_18age_votes', 'allgenders_30age_avg_vote',
       'allgenders_30age_votes', 'allgenders_45age_avg_vote',
       'allgenders_45age_votes', 'males_allages_avg_vote',
       'males_allages_votes', 'males_0age_avg_vote', 'males_0age_votes',
       'males_18age_avg_vote', 'males_18age_votes', 'males_30age_avg_vote',
       'males_30age_votes'

In [7]:
merged_df = merged_df[['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics', 'weighted_average_vote',
       'total_votes', 'mean_vote', 'median_vote', 'males_allages_avg_vote',
       'males_allages_votes','females_allages_avg_vote','us_voters_rating',
       'us_voters_votes']]
merged_df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,reviews_from_critics,weighted_average_vote,total_votes,mean_vote,median_vote,males_allages_avg_vote,males_allages_votes,females_allages_avg_vote,us_voters_rating,us_voters_votes
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,2.0,5.9,154,5.9,6.0,6.2,97.0,6.0,6.4,51.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,7.0,6.1,589,6.3,6.0,6.1,425.0,6.2,6.0,96.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,2.0,5.8,188,6.0,6.0,5.9,146.0,5.7,6.2,31.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,3.0,5.2,446,5.3,5.0,5.1,299.0,5.9,5.5,207.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,14.0,7.0,2237,6.9,7.0,7.0,1607.0,7.2,7.0,488.0


In [8]:
# Get only 
usa_df = merged_df[merged_df['country'] == 'USA']
len(usa_df)

28511

In [29]:
# Get only movies that were released past 2005
recent_df = usa_df[usa_df['year'].str.len() == 4]
recent_df = recent_df[recent_df['year'].astype(int) > 2005]
len(recent_df)

10476

In [30]:
# get rid of Reality TV
recent_df = recent_df[recent_df['genre'].str.contains('Reality-TV') == False]

# Fix budget string to number
recent_df['budget'] = recent_df['budget'].str.extract('(\d+)', expand=False)
recent_df['budget'] = recent_df['budget'].fillna(0)
recent_df['budget'] = recent_df['budget'].astype(int)

recent_df['lead_actor'] = recent_df['actors'].str.split(',').str[0]
recent_df['lead_director'] = recent_df['director'].str.split(',').str[0]
recent_df['lead_writer'] = recent_df['actors'].str.split(',').str[0]
recent_df['lead_actor'] = recent_df['lead_actor'].fillna(0)
recent_df['lead_director'] = recent_df['lead_director'].fillna(0)
recent_df['lead_writer'] = recent_df['lead_writer'].fillna(0)

recent_df.dropna(axis=1, inplace=True)
len(recent_df)

10475

In [54]:
recent_df.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'avg_vote', 'votes', 'budget',
       'weighted_average_vote', 'total_votes', 'mean_vote', 'median_vote',
       'females_allages_avg_vote', 'lead_actor', 'lead_director',
       'lead_writer'],
      dtype='object')

In [57]:
actor_df = pd.DataFrame(recent_df['lead_actor'].value_counts()).reset_index().reset_index()
actor_df.drop('lead_actor', axis=1, inplace=True)
actor_df.rename(columns={"level_0": "actor_number", "index": "lead_actor"}, inplace=True)
recent_df.to_csv('people_codes/actors.csv')
actor_df.head()

Unnamed: 0,actor_number,lead_actor
0,0,Eric Roberts
1,1,Adam Sandler
2,2,Nicolas Cage
3,3,Danny Trejo
4,4,James Franco


In [58]:

recent_df = pd.merge(recent_df, actor_df, on="lead_actor")
recent_df.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'avg_vote', 'votes', 'budget',
       'weighted_average_vote', 'total_votes', 'mean_vote', 'median_vote',
       'females_allages_avg_vote', 'lead_actor', 'lead_director',
       'lead_writer', 'actor_number'],
      dtype='object')

In [59]:
recent_df['original_title'].value_counts()

Home                    5
Alone                   4
Dreamland               4
Shelter                 4
Delirium                3
                       ..
Sound of My Voice       1
Ninja Cheerleaders      1
Hot Pursuit             1
A Life Not to Follow    1
Blood Brother           1
Name: original_title, Length: 10258, dtype: int64

In [60]:
test_df = recent_df[recent_df['original_title'] == 'Home']
test_df

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,avg_vote,votes,budget,weighted_average_vote,total_votes,mean_vote,median_vote,females_allages_avg_vote,lead_actor,lead_director,lead_writer,actor_number
2484,tt0822388,Home,Home,2008,2008-08-23,Drama,84,USA,5.8,175,500000,5.8,175,6.1,6.0,5.6,Reathel Bean,Mary Haverstick,Reathel Bean,1615
5766,tt2224026,Home - A casa,Home,2015,2015-03-26,"Animation, Adventure, Comedy",94,USA,6.6,90310,135000000,6.6,90310,6.9,7.0,7.1,Jim Parsons,Tim Johnson,Jim Parsons,723
7263,tt2393825,Home,Home,2013,2013-11-22,Drama,112,USA,7.0,145,0,7.0,145,7.5,8.0,7.7,Isiah Whitlock Jr.,Jono Oliver,Isiah Whitlock Jr.,2007
7448,tt2545384,Home,Home,2016,2016-03-01,"Drama, Horror, Thriller",87,USA,3.9,344,0,3.9,344,4.2,4.0,4.1,Heather Langenkamp,Frank Lin,Heather Langenkamp,5910
7512,tt2597242,Oltre il male,Home,2014,2014-08-08,Horror,91,USA,4.8,6758,0,4.8,6758,4.9,5.0,4.7,Ashley Rickards,Nicholas McCarthy,Ashley Rickards,1359


In [61]:
recent_df.to_csv('Sample/sample.csv')