# Exploratory Data Analysis on Global Movies (1921-2024)

### 1. Understand the dataset

In [1]:
import pandas as pd

In [71]:
# Location of the raw CSV, relative to the notebook's working directory
file_path = "../data/imdb_top_movies.csv"
# Load the raw CSV into the DataFrame that the cleaning cells below work on
movies_df = pd.read_csv(filepath_or_buffer=file_path)

In [72]:
# Preview the first five rows (five is also head()'s default)
print(movies_df.head(n=5))

                         Title  Year      Rating  \
0  1. The Shawshank Redemption  1994    9.3 (3M)   
1             2. The Godfather  1972  9.2 (2.1M)   
2           3. The Dark Knight  2008    9.0 (3M)   
3     4. The Godfather Part II  1974  9.0 (1.4M)   
4              5. 12 Angry Men  1957  9.0 (917K)   

                                               Genre  \
0            Epic, Period Drama, Prison Drama, Drama   
1              Epic, Gangster, Tragedy, Crime, Drama   
2  Action Epic, Epic, Superhero, Tragedy, Action,...   
3              Epic, Gangster, Tragedy, Crime, Drama   
4     Legal Drama, Psychological Drama, Crime, Drama   

                                         Director(s)  \
0  Jack Lawrence, Frank Darabont, Bob Gunton, Han...   
1  Marlon Brando, Kay Adams, Al Pacino, Francis F...   
2  Aaron Eckhart, Christopher Nolan, Jack Nichols...   
3  Al Pacino, Robert Duvall, Robert De Niro, Fran...   
4  Martin Balsam, E.G. Marshall, Juror #10, Lee J...   

             

In [73]:
# Check the shape of the dataset: a (row_count, column_count) tuple
movies_df.shape

(250, 7)

In [74]:
# View the column names as read from the CSV header (before they are renamed)
movies_df.columns

Index(['Title', 'Year', 'Rating', 'Genre', 'Director(s)', 'Box Office Revenue',
       'Lead Actors'],
      dtype='object')

In [75]:
# Count the missing (NaN/None) entries in each column
movies_df.isna().sum()

Title                 0
Year                  0
Rating                0
Genre                 0
Director(s)           0
Box Office Revenue    0
Lead Actors           0
dtype: int64

In [76]:
# Summary statistics. describe() covers numeric columns only by default,
# so the text (object) columns are left out of the result.
movies_df.describe()

Unnamed: 0,Year
count,250.0
mean,1988.432
std,25.601153
min,1921.0
25%,1972.25
50%,1995.0
75%,2009.0
max,2024.0


In [77]:
# Check for duplicates: count the rows that exactly repeat an earlier row
movies_df.duplicated().sum()

np.int64(0)

In [78]:
# Replace the CSV headers with short snake_case names.
# The assignment is positional, so the order must match the existing columns.
movies_df.columns = [
    "title",
    "year",
    "rating",
    "genre",
    "directors",
    "revenue",
    "lead_actors",
]
print(movies_df.head(n=5))

                         title  year      rating  \
0  1. The Shawshank Redemption  1994    9.3 (3M)   
1             2. The Godfather  1972  9.2 (2.1M)   
2           3. The Dark Knight  2008    9.0 (3M)   
3     4. The Godfather Part II  1974  9.0 (1.4M)   
4              5. 12 Angry Men  1957  9.0 (917K)   

                                               genre  \
0            Epic, Period Drama, Prison Drama, Drama   
1              Epic, Gangster, Tragedy, Crime, Drama   
2  Action Epic, Epic, Superhero, Tragedy, Action,...   
3              Epic, Gangster, Tragedy, Crime, Drama   
4     Legal Drama, Psychological Drama, Crime, Drama   

                                           directors  \
0  Jack Lawrence, Frank Darabont, Bob Gunton, Han...   
1  Marlon Brando, Kay Adams, Al Pacino, Francis F...   
2  Aaron Eckhart, Christopher Nolan, Jack Nichols...   
3  Al Pacino, Robert Duvall, Robert De Niro, Fran...   
4  Martin Balsam, E.G. Marshall, Juror #10, Lee J...   

             

In [79]:
# Divide the rating column (values look like "9.3 (3M)") into rating and votes.
# n=1 splits at the first "(" only, so the result never has more than two
# columns, even if a value were to contain a second "(".
rating_parts = movies_df['rating'].str.split("(", n=1, expand=True)
movies_df['rating'] = rating_parts[0]  # e.g. "9.3 " - the trailing space is kept
movies_df['votes'] = rating_parts[1]   # e.g. "3M)" - the closing ")" is still attached
print(movies_df.head())

                         title  year rating  \
0  1. The Shawshank Redemption  1994   9.3    
1             2. The Godfather  1972   9.2    
2           3. The Dark Knight  2008   9.0    
3     4. The Godfather Part II  1974   9.0    
4              5. 12 Angry Men  1957   9.0    

                                               genre  \
0            Epic, Period Drama, Prison Drama, Drama   
1              Epic, Gangster, Tragedy, Crime, Drama   
2  Action Epic, Epic, Superhero, Tragedy, Action,...   
3              Epic, Gangster, Tragedy, Crime, Drama   
4     Legal Drama, Psychological Drama, Crime, Drama   

                                           directors  \
0  Jack Lawrence, Frank Darabont, Bob Gunton, Han...   
1  Marlon Brando, Kay Adams, Al Pacino, Francis F...   
2  Aaron Eckhart, Christopher Nolan, Jack Nichols...   
3  Al Pacino, Robert Duvall, Robert De Niro, Fran...   
4  Martin Balsam, E.G. Marshall, Juror #10, Lee J...   

                         revenue           

In [80]:
# Remove the closing parenthesis left on the votes column by the split.
# regex=False treats ")" as a literal on every pandas version (the default of
# `regex` changed in pandas 2.0) and matches the explicit regex= arguments
# used when cleaning the revenue column.
movies_df['votes'] = movies_df['votes'].str.replace(")", "", regex=False)
print(movies_df.head())

                         title  year rating  \
0  1. The Shawshank Redemption  1994   9.3    
1             2. The Godfather  1972   9.2    
2           3. The Dark Knight  2008   9.0    
3     4. The Godfather Part II  1974   9.0    
4              5. 12 Angry Men  1957   9.0    

                                               genre  \
0            Epic, Period Drama, Prison Drama, Drama   
1              Epic, Gangster, Tragedy, Crime, Drama   
2  Action Epic, Epic, Superhero, Tragedy, Action,...   
3              Epic, Gangster, Tragedy, Crime, Drama   
4     Legal Drama, Psychological Drama, Crime, Drama   

                                           directors  \
0  Jack Lawrence, Frank Darabont, Bob Gunton, Han...   
1  Marlon Brando, Kay Adams, Al Pacino, Francis F...   
2  Aaron Eckhart, Christopher Nolan, Jack Nichols...   
3  Al Pacino, Robert Duvall, Robert De Niro, Fran...   
4  Martin Balsam, E.G. Marshall, Juror #10, Lee J...   

                         revenue           

In [81]:
# Inspect the dtype of every column after the rating/votes split
movies_df.dtypes

title          object
year            int64
rating         object
genre          object
directors      object
revenue        object
lead_actors    object
votes          object
dtype: object

In [82]:
# Cast the rating strings to floating-point numbers, then re-check the dtypes
movies_df['rating'] = movies_df['rating'].astype('float64')
movies_df.dtypes

title           object
year             int64
rating         float64
genre           object
directors       object
revenue         object
lead_actors     object
votes           object
dtype: object

In [54]:
# Replace the "Unknown" values in the revenue column with "0".
# The replacement is the string "0" rather than the integer 0: the revenue
# cleaning further down goes through the .str accessor, which returns NaN for
# non-string entries, so an integer placeholder would end up as NaN there.
movies_df['revenue'] = movies_df['revenue'].replace('Unknown', '0')

In [83]:
# Peek at the revenue values before the cleaning step
print(movies_df['revenue'].head())

0       Gross worldwide$29,332,133
1      Gross worldwide$250,342,198
2    Gross worldwide$1,009,057,329
3       Gross worldwide$47,964,222
4            Gross worldwide$2,945
Name: revenue, dtype: object


In [84]:
# Remove non-numeric characters and keep only the numeric figure
movies_df['revenue'] = (
    movies_df['revenue']
    .astype(str)                                      # Keep .str usable for non-string entries (e.g. when the cell is re-run)
    .str.replace('Gross worldwide', '', regex=False)  # Remove the 'Gross worldwide' text
    .str.replace(r'[$,]', '', regex=True)             # Remove dollar signs and commas (raw string, no invalid '\$' escape)
    .replace('Unknown', '0')                          # Replace 'Unknown' with '0'
)

# Convert to numeric, setting invalid values to NaN
movies_df['revenue'] = pd.to_numeric(movies_df['revenue'], errors='coerce')

# Display the cleaned column
print(movies_df[['revenue']])

        revenue
0      29332133
1     250342198
2    1009057329
3      47964222
4          2945
..          ...
245    30819442
246   221802186
247    20908467
248      113328
249           0

[250 rows x 1 columns]


  .str.replace('[\$,]', '', regex=True)             # Remove dollar signs and commas


In [88]:
# Preview the first ten rows of the working DataFrame.
# NOTE(review): this cell's execution count (88) is higher than that of the
# votes-cleaning cell that follows it in the file (87), so it was presumably
# last run after that cell - confirm the intended cell order.
movies_df.head(10)

Unnamed: 0,title,year,rating,genre,directors,revenue,lead_actors,votes
0,1. The Shawshank Redemption,1994,9.3,"Epic, Period Drama, Prison Drama, Drama","Jack Lawrence, Frank Darabont, Bob Gunton, Han...",29332133,"Bob Gunton, Tim Robbins, Morgan Freeman",3000000
1,2. The Godfather,1972,9.2,"Epic, Gangster, Tragedy, Crime, Drama","Marlon Brando, Kay Adams, Al Pacino, Francis F...",250342198,"James Caan, Marlon Brando, Al Pacino",2100000
2,3. The Dark Knight,2008,9.0,"Action Epic, Epic, Superhero, Tragedy, Action,...","Aaron Eckhart, Christopher Nolan, Jack Nichols...",1009057329,"Aaron Eckhart, Heath Ledger, Christian Bale",3000000
3,4. The Godfather Part II,1974,9.0,"Epic, Gangster, Tragedy, Crime, Drama","Al Pacino, Robert Duvall, Robert De Niro, Fran...",47964222,"Robert Duvall, Robert De Niro, Al Pacino",1400000
4,5. 12 Angry Men,1957,9.0,"Legal Drama, Psychological Drama, Crime, Drama","Martin Balsam, E.G. Marshall, Juror #10, Lee J...",2945,"Lee J. Cobb, Martin Balsam, Henry Fonda",917000
5,6. The Lord of the Rings: The Return of the King,2003,9.0,"Action Epic, Adventure Epic, Epic, Fantasy Epi...","John Rhys-Davies, Fran Walsh, J.R.R. Tolkien, ...",1138267561,"Viggo Mortensen, Ian McKellen, Elijah Wood",2100000
6,7. Schindler's List,1993,9.0,"Docudrama, Epic, Historical Epic, Period Drama...","Steven Spielberg, Ben Kingsley, Mimi Thoma, St...",322161245,"Liam Neeson, Ralph Fiennes, Ben Kingsley",1500000
7,8. Pulp Fiction,1994,8.9,"Dark Comedy, Drug Crime, Gangster, Crime, Drama","Vincent, Quentin Tarantino, Samuel L. Jackson,...",213928762,"Uma Thurman, Samuel L. Jackson, John Travolta",2300000
8,9. The Lord of the Rings: The Fellowship of th...,2001,8.9,"Action Epic, Adventure Epic, Epic, Fantasy Epi...","Janet Roddick, Fran Walsh, Orlando Bloom, Gand...",888171906,"Orlando Bloom, Ian McKellen, Elijah Wood",2100000
9,"10. The Good, the Bad and the Ugly",1966,8.8,"Adventure Epic, Dark Comedy, Desert Adventure,...","Blondie, Lee Van Cleef, Clint Eastwood, Lucian...",25264999,"Eli Wallach, Lee Van Cleef, Clint Eastwood",847000


In [86]:
# Re-check the dtypes after the revenue conversion
movies_df.dtypes

title           object
year             int64
rating         float64
genre           object
directors       object
revenue          int64
lead_actors     object
votes           object
dtype: object

In [87]:
# Clean the votes column: turn abbreviated counts such as "3M", "2.1M" or
# "917K" into plain integers.
# The values are parsed with a regular expression instead of being rewritten
# into arithmetic strings and evaluated with pd.eval, so text read from the
# CSV is never executed as an expression.
vote_multipliers = {'K': 1_000, 'M': 1_000_000}
vote_parts = (
    movies_df['votes']
    .astype(str)   # Also accepts already-numeric values, so the cell can be re-run
    .str.strip()
    .str.extract(r'^(?P<number>\d+(?:\.\d+)?)(?P<suffix>[KM]?)$')
)
movies_df['votes'] = (
    vote_parts['number'].astype(float)
    * vote_parts['suffix'].map(vote_multipliers).fillna(1)  # No suffix -> multiply by 1
)
# Round before casting so floating-point noise cannot truncate a count by one.
# astype(int) raises on NaN, so a value that did not match the pattern fails
# loudly instead of silently becoming a wrong number.
movies_df['votes'] = movies_df['votes'].round().astype(int)

# Display the cleaned column
print(movies_df[['votes']].head())

     votes
0  3000000
1  2100000
2  3000000
3  1400000
4   917000


In [89]:
# Preview the first ten rows again after the votes conversion
movies_df.head(10)

Unnamed: 0,title,year,rating,genre,directors,revenue,lead_actors,votes
0,1. The Shawshank Redemption,1994,9.3,"Epic, Period Drama, Prison Drama, Drama","Jack Lawrence, Frank Darabont, Bob Gunton, Han...",29332133,"Bob Gunton, Tim Robbins, Morgan Freeman",3000000
1,2. The Godfather,1972,9.2,"Epic, Gangster, Tragedy, Crime, Drama","Marlon Brando, Kay Adams, Al Pacino, Francis F...",250342198,"James Caan, Marlon Brando, Al Pacino",2100000
2,3. The Dark Knight,2008,9.0,"Action Epic, Epic, Superhero, Tragedy, Action,...","Aaron Eckhart, Christopher Nolan, Jack Nichols...",1009057329,"Aaron Eckhart, Heath Ledger, Christian Bale",3000000
3,4. The Godfather Part II,1974,9.0,"Epic, Gangster, Tragedy, Crime, Drama","Al Pacino, Robert Duvall, Robert De Niro, Fran...",47964222,"Robert Duvall, Robert De Niro, Al Pacino",1400000
4,5. 12 Angry Men,1957,9.0,"Legal Drama, Psychological Drama, Crime, Drama","Martin Balsam, E.G. Marshall, Juror #10, Lee J...",2945,"Lee J. Cobb, Martin Balsam, Henry Fonda",917000
5,6. The Lord of the Rings: The Return of the King,2003,9.0,"Action Epic, Adventure Epic, Epic, Fantasy Epi...","John Rhys-Davies, Fran Walsh, J.R.R. Tolkien, ...",1138267561,"Viggo Mortensen, Ian McKellen, Elijah Wood",2100000
6,7. Schindler's List,1993,9.0,"Docudrama, Epic, Historical Epic, Period Drama...","Steven Spielberg, Ben Kingsley, Mimi Thoma, St...",322161245,"Liam Neeson, Ralph Fiennes, Ben Kingsley",1500000
7,8. Pulp Fiction,1994,8.9,"Dark Comedy, Drug Crime, Gangster, Crime, Drama","Vincent, Quentin Tarantino, Samuel L. Jackson,...",213928762,"Uma Thurman, Samuel L. Jackson, John Travolta",2300000
8,9. The Lord of the Rings: The Fellowship of th...,2001,8.9,"Action Epic, Adventure Epic, Epic, Fantasy Epi...","Janet Roddick, Fran Walsh, Orlando Bloom, Gand...",888171906,"Orlando Bloom, Ian McKellen, Elijah Wood",2100000
9,"10. The Good, the Bad and the Ugly",1966,8.8,"Adventure Epic, Dark Comedy, Desert Adventure,...","Blondie, Lee Van Cleef, Clint Eastwood, Lucian...",25264999,"Eli Wallach, Lee Van Cleef, Clint Eastwood",847000


In [90]:
# Final dtype check before the cleaned data is saved
movies_df.dtypes

title           object
year             int64
rating         float64
genre           object
directors       object
revenue          int64
lead_actors     object
votes            int64
dtype: object

In [91]:
# Write the cleaned DataFrame to its own CSV file; index=False keeps the
# row index out of the file so no extra column is added.
cleaned_file_path = "../data/imdb_top_movies_cleaned.csv"
movies_df.to_csv(path_or_buf=cleaned_file_path, index=False)

In [92]:
# Point file_path at the cleaned CSV
file_path = "../data/imdb_top_movies_cleaned.csv"
# Load it into its own DataFrame and preview the first five rows
movies_cleaned_df = pd.read_csv(filepath_or_buffer=file_path)
movies_cleaned_df.head(n=5)

Unnamed: 0,title,year,rating,genre,directors,revenue,lead_actors,votes
0,1. The Shawshank Redemption,1994,9.3,"Epic, Period Drama, Prison Drama, Drama","Jack Lawrence, Frank Darabont, Bob Gunton, Han...",29332133,"Bob Gunton, Tim Robbins, Morgan Freeman",3000000
1,2. The Godfather,1972,9.2,"Epic, Gangster, Tragedy, Crime, Drama","Marlon Brando, Kay Adams, Al Pacino, Francis F...",250342198,"James Caan, Marlon Brando, Al Pacino",2100000
2,3. The Dark Knight,2008,9.0,"Action Epic, Epic, Superhero, Tragedy, Action,...","Aaron Eckhart, Christopher Nolan, Jack Nichols...",1009057329,"Aaron Eckhart, Heath Ledger, Christian Bale",3000000
3,4. The Godfather Part II,1974,9.0,"Epic, Gangster, Tragedy, Crime, Drama","Al Pacino, Robert Duvall, Robert De Niro, Fran...",47964222,"Robert Duvall, Robert De Niro, Al Pacino",1400000
4,5. 12 Angry Men,1957,9.0,"Legal Drama, Psychological Drama, Crime, Drama","Martin Balsam, E.G. Marshall, Juror #10, Lee J...",2945,"Lee J. Cobb, Martin Balsam, Henry Fonda",917000
