In [184]:
# Perfom data manipulation and analysis.
import pandas as pd

# Perfoming mathematical calculations.
import numpy as np

# The two libraries below will aid in creating visualizations.
import matplotlib.pyplot as plt 
import seaborn as sns

In [185]:
# Read the CSV data file
df = pd.read_csv('Data/Marvel Vs DC New.csv', index_col = False)
df

Unnamed: 0,ID,Movie,Year,Genre,RunTime,Description,IMDB_Score
0,0,Eternals,-2021,"Action,Adventure,Drama",0,"The saga of the Eternals, a race of immortal b...",0.0
1,1,Loki,(2021– ),"Action,Adventure,Fantasy",0,A new Marvel chapter with Loki at its center.,0.0
2,2,The Falcon and the Winter Soldier,-2021,"Action,Adventure,Drama",50 min,"Following the events of 'Avengers: Endgame,' S...",7.5
3,3,WandaVision,-2021,"Action,Comedy,Drama",350 min,Blends the style of classic sitcoms with the M...,8.1
4,4,Spider-Man: No Way Home,-2021,"Action,Adventure,Sci-Fi",0,A continuation of Spider-Man: Far From Home.,0.0
...,...,...,...,...,...,...,...
1685,1685,DC's Legends of Tomorrow,(2016– ),"Action,Adventure,Drama",42 min,"Worlds lived, worlds died. Nothing will ever b...",8.5
1686,1686,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,"In the wake of Lex Luthor's return, the show f...",8.3
1687,1687,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,Kara comes face to face with Red Daughter and ...,8.1
1688,1688,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,Kara and Lena head to Kaznia to hunt down Lex....,7.4


## Data Understanding

In [186]:
# make a copy of the dataset for further analysis
entertainment_df = df.copy()
entertainment_df

Unnamed: 0,ID,Movie,Year,Genre,RunTime,Description,IMDB_Score
0,0,Eternals,-2021,"Action,Adventure,Drama",0,"The saga of the Eternals, a race of immortal b...",0.0
1,1,Loki,(2021– ),"Action,Adventure,Fantasy",0,A new Marvel chapter with Loki at its center.,0.0
2,2,The Falcon and the Winter Soldier,-2021,"Action,Adventure,Drama",50 min,"Following the events of 'Avengers: Endgame,' S...",7.5
3,3,WandaVision,-2021,"Action,Comedy,Drama",350 min,Blends the style of classic sitcoms with the M...,8.1
4,4,Spider-Man: No Way Home,-2021,"Action,Adventure,Sci-Fi",0,A continuation of Spider-Man: Far From Home.,0.0
...,...,...,...,...,...,...,...
1685,1685,DC's Legends of Tomorrow,(2016– ),"Action,Adventure,Drama",42 min,"Worlds lived, worlds died. Nothing will ever b...",8.5
1686,1686,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,"In the wake of Lex Luthor's return, the show f...",8.3
1687,1687,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,Kara comes face to face with Red Daughter and ...,8.1
1688,1688,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,Kara and Lena head to Kaznia to hunt down Lex....,7.4


In [187]:
''' Identify if column ID has any important relevance in our dataset .
    It appears it is a repetition of the row index.
    Now we should drop it permanently
'''
entertainment_df['ID'].value_counts()

1689    1
1160    1
1112    1
1114    1
1116    1
       ..
569     1
571     1
573     1
575     1
0       1
Name: ID, Length: 1690, dtype: int64

In [188]:
# Drop column ID
entertainment_df = entertainment_df.drop(columns='ID')
entertainment_df

Unnamed: 0,Movie,Year,Genre,RunTime,Description,IMDB_Score
0,Eternals,-2021,"Action,Adventure,Drama",0,"The saga of the Eternals, a race of immortal b...",0.0
1,Loki,(2021– ),"Action,Adventure,Fantasy",0,A new Marvel chapter with Loki at its center.,0.0
2,The Falcon and the Winter Soldier,-2021,"Action,Adventure,Drama",50 min,"Following the events of 'Avengers: Endgame,' S...",7.5
3,WandaVision,-2021,"Action,Comedy,Drama",350 min,Blends the style of classic sitcoms with the M...,8.1
4,Spider-Man: No Way Home,-2021,"Action,Adventure,Sci-Fi",0,A continuation of Spider-Man: Far From Home.,0.0
...,...,...,...,...,...,...
1685,DC's Legends of Tomorrow,(2016– ),"Action,Adventure,Drama",42 min,"Worlds lived, worlds died. Nothing will ever b...",8.5
1686,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,"In the wake of Lex Luthor's return, the show f...",8.3
1687,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,Kara comes face to face with Red Daughter and ...,8.1
1688,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,Kara and Lena head to Kaznia to hunt down Lex....,7.4


In [189]:
# Understand construction of columns in the dataset
entertainment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1690 entries, 0 to 1689
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie        1690 non-null   object 
 1   Year         1657 non-null   object 
 2   Genre        1683 non-null   object 
 3   RunTime      1690 non-null   object 
 4   Description  1690 non-null   object 
 5   IMDB_Score   1690 non-null   float64
dtypes: float64(1), object(5)
memory usage: 79.3+ KB


In [190]:
# From the Observation Above:
'''The dataset has a total of 6 columns
   5 columns are of Dtype object -> Movie, Year, Genre, RunTime, Description
   1 column is of Dtype float -> IMDB_SCORE
'''
print(' ')

 


**NOTE**

Columns to convert from objects to int -> RunTime

Columns to convert from objects to Datetime -> Year

In [191]:
# Clean RunTime column
# Strip additional spaces first before replacing data
entertainment_df['RunTime'] = entertainment_df['RunTime'].str.strip().str.replace(' min', '')
entertainment_df

Unnamed: 0,Movie,Year,Genre,RunTime,Description,IMDB_Score
0,Eternals,-2021,"Action,Adventure,Drama",0,"The saga of the Eternals, a race of immortal b...",0.0
1,Loki,(2021– ),"Action,Adventure,Fantasy",0,A new Marvel chapter with Loki at its center.,0.0
2,The Falcon and the Winter Soldier,-2021,"Action,Adventure,Drama",50,"Following the events of 'Avengers: Endgame,' S...",7.5
3,WandaVision,-2021,"Action,Comedy,Drama",350,Blends the style of classic sitcoms with the M...,8.1
4,Spider-Man: No Way Home,-2021,"Action,Adventure,Sci-Fi",0,A continuation of Spider-Man: Far From Home.,0.0
...,...,...,...,...,...,...
1685,DC's Legends of Tomorrow,(2016– ),"Action,Adventure,Drama",42,"Worlds lived, worlds died. Nothing will ever b...",8.5
1686,Supergirl,(2015–2021),"Action,Adventure,Drama",42,"In the wake of Lex Luthor's return, the show f...",8.3
1687,Supergirl,(2015–2021),"Action,Adventure,Drama",42,Kara comes face to face with Red Daughter and ...,8.1
1688,Supergirl,(2015–2021),"Action,Adventure,Drama",42,Kara and Lena head to Kaznia to hunt down Lex....,7.4


In [192]:
# convert RunTime column from object to integer type
entertainment_df['RunTime'] = entertainment_df['RunTime'].astype(int)

In [193]:
# Cleaning column Year
entertainment_df['Year']

0             -2021
1          (2021– )
2             -2021
3             -2021
4             -2021
           ...     
1685       (2016– )
1686    (2015–2021)
1687    (2015–2021)
1688    (2015–2021)
1689    (2015–2021)
Name: Year, Length: 1690, dtype: object

In [194]:
# Remove parentheses
entertainment_df['Year'] = entertainment_df['Year'].str.replace(r'[()]', '', regex=True)  
entertainment_df['Year']

0           -2021
1          2021– 
2           -2021
3           -2021
4           -2021
          ...    
1685       2016– 
1686    2015–2021
1687    2015–2021
1688    2015–2021
1689    2015–2021
Name: Year, Length: 1690, dtype: object

In [195]:
# Remove leading dashes
entertainment_df['Year'] = entertainment_df['Year'].str.replace(r'^-+', '', regex=True)    
entertainment_df['Year']

0            2021
1          2021– 
2            2021
3            2021
4            2021
          ...    
1685       2016– 
1686    2015–2021
1687    2015–2021
1688    2015–2021
1689    2015–2021
Name: Year, Length: 1690, dtype: object

In [196]:
# Remove trailing dashes
entertainment_df['Year'] = entertainment_df['Year'].str.replace(r'[–-]\s*$', '', regex=True)
entertainment_df['Year']

0            2021
1            2021
2            2021
3            2021
4            2021
          ...    
1685         2016
1686    2015–2021
1687    2015–2021
1688    2015–2021
1689    2015–2021
Name: Year, Length: 1690, dtype: object

In [197]:
# Statistical information
entertainment_df.describe()

Unnamed: 0,RunTime,IMDB_Score
count,1690.0,1690.0
mean,34.124852,6.877337
std,32.333141,2.690564
min,0.0,0.0
25%,21.0,6.9
50%,30.0,7.8
75%,43.0,8.4
max,527.0,9.6


## Data Preparation

In [198]:
entertainment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1690 entries, 0 to 1689
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie        1690 non-null   object 
 1   Year         1657 non-null   object 
 2   Genre        1683 non-null   object 
 3   RunTime      1690 non-null   int32  
 4   Description  1690 non-null   object 
 5   IMDB_Score   1690 non-null   float64
dtypes: float64(1), int32(1), object(4)
memory usage: 72.7+ KB


**Check for null values**

In [199]:
# Find the total number of null values in each column
entertainment_df.isna().sum()

Movie           0
Year           33
Genre           7
RunTime         0
Description     0
IMDB_Score      0
dtype: int64

In [200]:
entertainment_df = entertainment_df.dropna().reset_index()
entertainment_df

Unnamed: 0,index,Movie,Year,Genre,RunTime,Description,IMDB_Score
0,0,Eternals,2021,"Action,Adventure,Drama",0,"The saga of the Eternals, a race of immortal b...",0.0
1,1,Loki,2021,"Action,Adventure,Fantasy",0,A new Marvel chapter with Loki at its center.,0.0
2,2,The Falcon and the Winter Soldier,2021,"Action,Adventure,Drama",50,"Following the events of 'Avengers: Endgame,' S...",7.5
3,3,WandaVision,2021,"Action,Comedy,Drama",350,Blends the style of classic sitcoms with the M...,8.1
4,4,Spider-Man: No Way Home,2021,"Action,Adventure,Sci-Fi",0,A continuation of Spider-Man: Far From Home.,0.0
...,...,...,...,...,...,...,...
1645,1685,DC's Legends of Tomorrow,2016,"Action,Adventure,Drama",42,"Worlds lived, worlds died. Nothing will ever b...",8.5
1646,1686,Supergirl,2015–2021,"Action,Adventure,Drama",42,"In the wake of Lex Luthor's return, the show f...",8.3
1647,1687,Supergirl,2015–2021,"Action,Adventure,Drama",42,Kara comes face to face with Red Daughter and ...,8.1
1648,1688,Supergirl,2015–2021,"Action,Adventure,Drama",42,Kara and Lena head to Kaznia to hunt down Lex....,7.4
