In [1]:
# Install pandas 
import pandas as pd

In [2]:
# Read data from .csv file
data = pd.read_csv('IMDB-Movie-Data.csv')

# Read data with specified explicit index.
# We will use this later in our analysis
data_indexed = pd.read_csv('IMDB-Movie-Data.csv', index_col="Title")

In [3]:
# Preview top 5 rows using head()
data.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [4]:
# Preview bottom 5 rows using tail()
data.tail()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
995,996,Secret in Their Eyes,"Crime,Drama,Mystery","A tight-knit team of rising investigators, alo...",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,27585,,45.0
996,997,Hostel: Part II,Horror,Three American college students studying abroa...,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,73152,17.54,46.0
997,998,Step Up 2: The Streets,"Drama,Music,Romance",Romantic sparks occur between two dance studen...,Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,70699,58.01,50.0
998,999,Search Party,"Adventure,Comedy",A pair of friends embark on a mission to reuni...,Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,4881,,22.0
999,1000,Nine Lives,"Comedy,Family,Fantasy",A stuffy businessman finds himself trapped ins...,Barry Sonnenfeld,"Kevin Spacey, Jennifer Garner, Robbie Amell,Ch...",2016,87,5.3,12435,19.64,11.0


In [5]:
#Lets first understand the basic information about this data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


In [6]:
# shape can be used to get the shape of dataframe

data.shape



(1000, 12)

This function tells us that there are 1000 rows and 12 columns in the dataset

In [7]:
# columns gives us the list of columns in the dataframe
data.columns

Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore'],
      dtype='object')

In [8]:
# describe( ) method gives the basic statistical summaries of all numerical attributes in the dataframe.
data.describe()

Unnamed: 0,Rank,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
count,1000.0,1000.0,1000.0,1000.0,1000.0,872.0,936.0
mean,500.5,2012.783,113.172,6.7232,169808.3,82.956376,58.985043
std,288.819436,3.205962,18.810908,0.945429,188762.6,103.25354,17.194757
min,1.0,2006.0,66.0,1.9,61.0,0.0,11.0
25%,250.75,2010.0,100.0,6.2,36309.0,13.27,47.0
50%,500.5,2014.0,111.0,6.8,110799.0,47.985,59.5
75%,750.25,2016.0,123.0,7.4,239909.8,113.715,72.0
max,1000.0,2016.0,191.0,9.0,1791916.0,936.63,100.0


Some Insights from the Description Table

- The min and max values in ‘Year’ depict the minimum and maximum release years. We can see that the dataset contains movies from 2006 to 2016.
- The average rating for the movies in this dataset is about 6.7 and the minimum rating is 1.9 and the maximum rating is 9.0
- The maximum revenue earned by a movie is 936.6 million

In [9]:
# Extract data as series
genre = data['Genre']
print(genre.head())
# This operation will retrieve all the data from the ‘Genre’ column as Series. If we want to retrieve this data as a dataframe, then indexing must be done using double square brackets as below:

# Extract data as dataframe
genre_df = data[['Genre']]
print(genre_df.head())

#If we want to extract multiple columns from the data, simply add the column names to the list.
some_cols = data[['Title','Genre','Actors','Director','Rating']]
print(some_cols.head())

0     Action,Adventure,Sci-Fi
1    Adventure,Mystery,Sci-Fi
2             Horror,Thriller
3     Animation,Comedy,Family
4    Action,Adventure,Fantasy
Name: Genre, dtype: object
                      Genre
0   Action,Adventure,Sci-Fi
1  Adventure,Mystery,Sci-Fi
2           Horror,Thriller
3   Animation,Comedy,Family
4  Action,Adventure,Fantasy
                     Title                     Genre  \
0  Guardians of the Galaxy   Action,Adventure,Sci-Fi   
1               Prometheus  Adventure,Mystery,Sci-Fi   
2                    Split           Horror,Thriller   
3                     Sing   Animation,Comedy,Family   
4            Suicide Squad  Action,Adventure,Fantasy   

                                              Actors              Director  \
0  Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...            James Gunn   
1  Noomi Rapace, Logan Marshall-Green, Michael Fa...          Ridley Scott   
2  James McAvoy, Anya Taylor-Joy, Haley Lu Richar...    M. Night Shyamalan   
3  Mat

In [10]:
# We will use the loc function to index and slice that dataframe using the specified ‘Title’.
data_indexed.loc[['Suicide Squad']][['Genre','Actors','Director','Rating','Revenue (Millions)']]

Unnamed: 0_level_0,Genre,Actors,Director,Rating,Revenue (Millions)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Suicide Squad,"Action,Adventure,Fantasy","Will Smith, Jared Leto, Margot Robbie, Viola D...",David Ayer,6.2,325.02


In [11]:
# Here, iloc is used to slice data using integer indexes.
data.iloc[10:15][['Title','Rating','Revenue (Millions)']]

Unnamed: 0,Title,Rating,Revenue (Millions)
10,Fantastic Beasts and Where to Find Them,7.5,234.02
11,Hidden Figures,7.8,169.27
12,Rogue One,7.9,532.17
13,Moana,7.7,248.75
14,Colossal,6.4,2.87


In [12]:
# pick only movies that are released from 2010 to 2016, have a rating of less than 6.0 but topped in terms of revenue

data[((data['Year'] >= 2010) & (data['Year'] <= 2016))
      & (data['Rating'] < 6.0)
      & (data['Revenue (Millions)'] > data['Revenue (Millions)'].quantile(0.95))]

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
941,942,The Twilight Saga: Eclipse,"Adventure,Drama,Fantasy",As a string of mysterious killings grips Seatt...,David Slade,"Kristen Stewart, Robert Pattinson, Taylor Laut...",2010,124,4.9,192740,300.52,58.0


‘The Twilight Saga: Breaking Dawn – Part 2′ and ‘The Twilight Saga: Eclipse’ are the movies that topped in the box office, despite having lower ratings.

In [13]:
# Data can be grouped and operations can be performed on top of grouped data by using the groupby( ) method. This comes in handy when we want to apply aggregations and functions on top of grouped data.
data.groupby('Director')[['Rating']].mean().head()

Unnamed: 0_level_0,Rating
Director,Unnamed: 1_level_1
Aamir Khan,8.5
Abdellatif Kechiche,7.8
Adam Leon,6.5
Adam McKay,7.0
Adam Shankman,6.3


In [14]:
# In the above example, where we have listed the average rating for each ‘Director’, if we want to sort them from highly rated to lowest, we can perform the sorting operation.

data.groupby('Director')[['Rating']].mean().sort_values(['Rating'], ascending=False).head()

Unnamed: 0_level_0,Rating
Director,Unnamed: 1_level_1
Nitesh Tiwari,8.8
Christopher Nolan,8.68
Olivier Nakache,8.6
Makoto Shinkai,8.6
Aamir Khan,8.5


We can see that Director ‘Nitesh Tiwari’ has the highest average rating in this dataset.

In [15]:
# To check null values row-wise
data.isnull().sum()

Rank                    0
Title                   0
Genre                   0
Description             0
Director                0
Actors                  0
Year                    0
Runtime (Minutes)       0
Rating                  0
Votes                   0
Revenue (Millions)    128
Metascore              64
dtype: int64

Here we know that ‘Revenue (Millions)’ and ‘Metascore’ are two columns where there are null values.

As we have seen null values in data, we can either choose to drop those or impute these values.

In [16]:
# Use drop function to drop columns
data.drop('Metascore', axis=1).head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions)
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02


Using the above code, the ‘Metascore’ column is dropped completely from data. Here axis= 1 specifies that column is to be dropped. These changes will not take place in actual data unless we specify inplace=True as a parameter in the drop( ) function.

In [17]:
# Drops all rows containing missing data
data.dropna()

# Drop all columns containing missing data
data.dropna(axis=1)

data.dropna(axis=0, thresh=6)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Secret in Their Eyes,"Crime,Drama,Mystery","A tight-knit team of rising investigators, alo...",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,27585,,45.0
996,997,Hostel: Part II,Horror,Three American college students studying abroa...,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,73152,17.54,46.0
997,998,Step Up 2: The Streets,"Drama,Music,Romance",Romantic sparks occur between two dance studen...,Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,70699,58.01,50.0
998,999,Search Party,"Adventure,Comedy",A pair of friends embark on a mission to reuni...,Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,4881,,22.0


In the above snippet, we are using thresh parameter to specify the minimum number of non-null values for the column/row to be held without dropping.

In our movies data, we know that there are some records where the Revenue is null.
We can impute these null values with mean Revenue (Millions).

In [18]:
revenue_mean = data_indexed['Revenue (Millions)'].mean()
print("The mean revenue is: ", revenue_mean)

The mean revenue is:  82.95637614678898


In [19]:
# We can fill the null values with this mean revenue
data_indexed['Revenue (Millions)'].fillna(revenue_mean, inplace=True)

# To check null values row-wise
has_nulls = data['Revenue (Millions)'].isnull().sum()

if not has_nulls:
    print("There are no null values in the Revenue column.")
else:
    print("There are null values in the Revenue column.")

There are null values in the Revenue column.


In [20]:
# Classify movies based on ratings
def rating_group(rating):
    if rating >= 7.5:
        return 'Good'
    elif rating >= 6.0:
        return 'Average'
    else:
        return 'Bad'

In [21]:
# Lets apply this function on our movies data
# creating a new variable in the dataset to hold the rating category
data['Rating_category'] = data['Rating'].apply(rating_group)

In [22]:
data[['Title','Director','Rating','Rating_category']].head(5)

Unnamed: 0,Title,Director,Rating,Rating_category
0,Guardians of the Galaxy,James Gunn,8.1,Good
1,Prometheus,Ridley Scott,7.0,Average
2,Split,M. Night Shyamalan,7.3,Average
3,Sing,Christophe Lourdelet,7.2,Average
4,Suicide Squad,David Ayer,6.2,Average
