In [657]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import KNNImputer

# BOM.MOVIE_GROSS

In [658]:
#loading data
bom_gross = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
bom_gross.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


cleaning the data

In [659]:
# checking for duplicates
bom_gross.duplicated().sum()

0

In [660]:
bom_gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [661]:
bom_gross.isna().mean()*100

title              0.000000
studio             0.147623
domestic_gross     0.826690
foreign_gross     39.858282
year               0.000000
dtype: float64

In [662]:
bom_gross.describe()

Unnamed: 0,domestic_gross,year
count,3359.0,3387.0
mean,28745850.0,2013.958075
std,66982500.0,2.478141
min,100.0,2010.0
25%,120000.0,2012.0
50%,1400000.0,2014.0
75%,27900000.0,2016.0
max,936700000.0,2018.0


In [663]:
bom_gross['foreign_gross']

0       652000000
1       691300000
2       664300000
3       535700000
4       513900000
          ...    
3382          NaN
3383          NaN
3384          NaN
3385          NaN
3386          NaN
Name: foreign_gross, Length: 3387, dtype: object

In [664]:
#Remove commas and convert 'foreign_gross' to numeric
bom_gross['foreign_gross'] = pd.to_numeric(bom_gross['foreign_gross'].str.replace(',', ''), errors='coerce')

In [665]:
bom_gross.describe()

Unnamed: 0,domestic_gross,foreign_gross,year
count,3359.0,2037.0,3387.0
mean,28745850.0,74872810.0,2013.958075
std,66982500.0,137410600.0,2.478141
min,100.0,600.0,2010.0
25%,120000.0,3700000.0,2012.0
50%,1400000.0,18700000.0,2014.0
75%,27900000.0,74900000.0,2016.0
max,936700000.0,960500000.0,2018.0


In [666]:
# creating a variable named  numeric columns for KNN Imputer
numeric_columns = ['domestic_gross', 'foreign_gross']
imputer = KNNImputer(n_neighbors=5)
bom_gross[numeric_columns] = imputer.fit_transform(bom_gross[numeric_columns])

In [667]:
bom_gross.isna().sum()

title             0
studio            5
domestic_gross    0
foreign_gross     0
year              0
dtype: int64

In [668]:
bom_gross.dropna(subset=['studio'], inplace=True)


In [669]:
bom_gross.isna().sum()

title             0
studio            0
domestic_gross    0
foreign_gross     0
year              0
dtype: int64

In [670]:
bom_gross.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3382 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3382 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3382 non-null   float64
 3   foreign_gross   3382 non-null   float64
 4   year            3382 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 158.5+ KB


In [671]:
# Creating a copy of the DataFrame
bom_gross_copy = bom_gross.copy()


# IM.DB DATABASE

#

In [672]:
#loading the data from the database
conn=sqlite3.connect("zippedData/im.db/im.db")
cur= conn.cursor()


In [673]:
# JOINING THE TABLES AND CONVERTING THE DATABASE FILE INTO DF
query = """ 
SELECT 
    mb.movie_id,
    mb.primary_title,
    mb.original_title,
    mb.genres,
    mb.start_year AS release_year,
    mb.runtime_minutes AS film_runtime,
    mr.averagerating AS imdb_average_rating,
    mr.numvotes AS imdb_number_of_votes,
    GROUP_CONCAT(DISTINCT(p.primary_name)) AS directors
FROM movie_basics AS mb
LEFT JOIN movie_ratings AS mr ON mb.movie_id = mr.movie_id
JOIN
directors AS d
USING (movie_id)
JOIN
persons AS p
USING (person_id)
GROUP BY mb.movie_id;
"""

imdb_df=pd.read_sql(query,conn)

DATA CLEANING

In [674]:
imdb_df.head()

Unnamed: 0,movie_id,primary_title,original_title,genres,release_year,film_runtime,imdb_average_rating,imdb_number_of_votes,directors
0,tt0063540,Sunghursh,Sunghursh,"Action,Crime,Drama",2013,175.0,7.0,77.0,Harnam Singh Rawail
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,"Biography,Drama",2019,114.0,7.2,43.0,Mani Kaul
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,Drama,2018,122.0,6.9,4517.0,Orson Welles
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,"Comedy,Drama",2018,,6.1,13.0,Hrishikesh Mukherjee
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,"Comedy,Drama,Fantasy",2017,80.0,6.5,119.0,"Raoul Ruiz,Valeria Sarmiento"


In [675]:
imdb_df.tail()

Unnamed: 0,movie_id,primary_title,original_title,genres,release_year,film_runtime,imdb_average_rating,imdb_number_of_votes,directors
140411,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,Drama,2019,123.0,,,Azhar Kinoi Lubis
140412,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,Documentary,2015,,,,"Angela Gurgel,Ana Célia de Oliveira"
140413,tt9916706,Dankyavar Danka,Dankyavar Danka,Comedy,2013,,,,Kanchan Nayak
140414,tt9916730,6 Gunn,6 Gunn,,2017,116.0,,,Kiran Gawade
140415,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,Documentary,2013,,,,"Vinicius Augusto Bozzo,Angela Gurgel"


In [676]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140416 entries, 0 to 140415
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   movie_id              140416 non-null  object 
 1   primary_title         140416 non-null  object 
 2   original_title        140414 non-null  object 
 3   genres                137238 non-null  object 
 4   release_year          140416 non-null  int64  
 5   film_runtime          112019 non-null  float64
 6   imdb_average_rating   73103 non-null   float64
 7   imdb_number_of_votes  73103 non-null   float64
 8   directors             140416 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 9.6+ MB


In [677]:
imdb_df.duplicated().sum()

0

In [678]:
imdb_df.isna().mean()*100

movie_id                 0.000000
primary_title            0.000000
original_title           0.001424
genres                   2.263275
release_year             0.000000
film_runtime            20.223479
imdb_average_rating     47.938269
imdb_number_of_votes    47.938269
directors                0.000000
dtype: float64

In [679]:
imdb_df.describe()

Unnamed: 0,release_year,film_runtime,imdb_average_rating,imdb_number_of_votes
count,140416.0,112019.0,73103.0,73103.0
mean,2014.61003,86.316741,6.329267,3559.32
std,2.732922,167.896407,1.47341,30447.56
min,2010.0,1.0,1.0,5.0
25%,2012.0,70.0,5.5,14.0
50%,2015.0,87.0,6.5,50.0
75%,2017.0,99.0,7.3,288.0
max,2115.0,51420.0,10.0,1841066.0


In [680]:
# Creating a copy of the DataFrame
imdb_df_copy = imdb_df.copy()


In [681]:
# handling missing values in numerical_columns
for i in ["film_runtime", "imdb_average_rating", "imdb_number_of_votes"]:
    imdb_df[i].fillna(imdb_df[i].median(), inplace=True)



In [682]:
imdb_df.columns

Index(['movie_id', 'primary_title', 'original_title', 'genres', 'release_year',
       'film_runtime', 'imdb_average_rating', 'imdb_number_of_votes',
       'directors'],
      dtype='object')

In [683]:
imdb_df.dropna(subset=['original_title', 'genres'], inplace=True)


In [684]:
imdb_df.isna().mean()*100

movie_id                0.0
primary_title           0.0
original_title          0.0
genres                  0.0
release_year            0.0
film_runtime            0.0
imdb_average_rating     0.0
imdb_number_of_votes    0.0
directors               0.0
dtype: float64

In [685]:
imdb_df=imdb_df.copy()

In [686]:
imdb= imdb_df[['movie_id','primary_title','genres','directors']]

In [687]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137237 entries, 0 to 140415
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   movie_id       137237 non-null  object
 1   primary_title  137237 non-null  object
 2   genres         137237 non-null  object
 3   directors      137237 non-null  object
dtypes: object(4)
memory usage: 5.2+ MB


In [688]:
# Creating a copy of the DataFrame
imdb = imdb.copy()


In [689]:
imdb=imdb.rename(columns={'directors': 'director'})
imdb= imdb.rename(columns={'genres': 'genre'})
imdb = imdb.rename(columns={'primary_title': 'title'})

# MERGING THE FIRST 2 DATASET

In [690]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137237 entries, 0 to 140415
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   movie_id  137237 non-null  object
 1   title     137237 non-null  object
 2   genre     137237 non-null  object
 3   director  137237 non-null  object
dtypes: object(4)
memory usage: 5.2+ MB


In [691]:
bom_gross.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3382 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3382 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3382 non-null   float64
 3   foreign_gross   3382 non-null   float64
 4   year            3382 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 158.5+ KB


In [692]:
movie_merged=pd.merge(imdb,bom_gross,on='title',how='inner')

In [693]:
movie_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3294 entries, 0 to 3293
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   movie_id        3294 non-null   object 
 1   title           3294 non-null   object 
 2   genre           3294 non-null   object 
 3   director        3294 non-null   object 
 4   studio          3294 non-null   object 
 5   domestic_gross  3294 non-null   float64
 6   foreign_gross   3294 non-null   float64
 7   year            3294 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 231.6+ KB


In [694]:
movie_merged.isna().mean()*100

movie_id          0.0
title             0.0
genre             0.0
director          0.0
studio            0.0
domestic_gross    0.0
foreign_gross     0.0
year              0.0
dtype: float64

In [695]:
movie_merged.duplicated().sum()

0

In [696]:
movie_merged=movie_merged.copy()

# MOVIE_INFO

In [697]:


# loading data for movie information
movie_info = pd.read_csv('zippedData/rt.movie_info.tsv.gz', delimiter='\t')
movie_info.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [698]:
movie_info.duplicated().sum()

0

In [699]:
movie_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   currency      340 non-null    object
 9   box_office    340 non-null    object
 10  runtime       1530 non-null   object
 11  studio        494 non-null    object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


In [700]:
movie_info.isna().mean()*100

id               0.000000
synopsis         3.974359
rating           0.192308
genre            0.512821
director        12.756410
writer          28.782051
theater_date    23.012821
dvd_date        23.012821
currency        78.205128
box_office      78.205128
runtime          1.923077
studio          68.333333
dtype: float64

In [701]:
# dropping columns
movie_info.drop(columns=['currency', 'box_office','studio'],inplace=True)

In [702]:
categorical_columns = movie_info.select_dtypes(include="object").columns

for i in categorical_columns:
    movie_info[i].fillna('unknown', inplace=True)


In [703]:
# selecting useful columns
new_movie = movie_info[['id', 'genre', 'director', 'writer','rating']]
new_movie.head()

Unnamed: 0,id,genre,director,writer,rating
0,1,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,R
1,3,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,R
2,5,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,R
3,6,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,R
4,7,Drama|Romance,Rodney Bennett,Giles Cooper,NR


In [704]:
new_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        1560 non-null   int64 
 1   genre     1560 non-null   object
 2   director  1560 non-null   object
 3   writer    1560 non-null   object
 4   rating    1560 non-null   object
dtypes: int64(1), object(4)
memory usage: 61.1+ KB


In [705]:
new_movie.isna().sum()

id          0
genre       0
director    0
writer      0
rating      0
dtype: int64

In [706]:
new_movie=new_movie.copy()

In [707]:
new_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        1560 non-null   int64 
 1   genre     1560 non-null   object
 2   director  1560 non-null   object
 3   writer    1560 non-null   object
 4   rating    1560 non-null   object
dtypes: int64(1), object(4)
memory usage: 61.1+ KB


In [708]:
movie_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3294 entries, 0 to 3293
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   movie_id        3294 non-null   object 
 1   title           3294 non-null   object 
 2   genre           3294 non-null   object 
 3   director        3294 non-null   object 
 4   studio          3294 non-null   object 
 5   domestic_gross  3294 non-null   float64
 6   foreign_gross   3294 non-null   float64
 7   year            3294 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 231.6+ KB


# 2nd MERGING

In [709]:

# Merging `movie_merged` and `new_movie` on 'genre' and 'director'
movie_merged2 = pd.merge(movie_merged, new_movie, on='genre', how='inner')

# Display info to check the merged dataset
print("Merged Data (movie_merged + new_movie):")


movie_merged2.head()

Merged Data (movie_merged + new_movie):


Unnamed: 0,movie_id,title,genre,director_x,studio,domestic_gross,foreign_gross,year,id,director_y,writer,rating
0,tt2404548,On the Road,Drama,Harutyun Azaryan,IFC,744000.0,8000000.0,2012,13,Ray Lawrence,Raymond Carver|Beatrix Christian,R
1,tt2404548,On the Road,Drama,Harutyun Azaryan,IFC,744000.0,8000000.0,2012,14,Taylor Hackford,Mark Jacobson,R
2,tt2404548,On the Road,Drama,Harutyun Azaryan,IFC,744000.0,8000000.0,2012,23,unknown,unknown,R
3,tt2404548,On the Road,Drama,Harutyun Azaryan,IFC,744000.0,8000000.0,2012,35,Michael Polish,Michael Polish|Mark Polish,R
4,tt2404548,On the Road,Drama,Harutyun Azaryan,IFC,744000.0,8000000.0,2012,40,John Sayles,John Sayles,R


In [710]:
movie_merged2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72630 entries, 0 to 72629
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   movie_id        72630 non-null  object 
 1   title           72630 non-null  object 
 2   genre           72630 non-null  object 
 3   director_x      72630 non-null  object 
 4   studio          72630 non-null  object 
 5   domestic_gross  72630 non-null  float64
 6   foreign_gross   72630 non-null  float64
 7   year            72630 non-null  int64  
 8   id              72630 non-null  int64  
 9   director_y      72630 non-null  object 
 10  writer          72630 non-null  object 
 11  rating          72630 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 7.2+ MB


In [711]:
movie_merged2.isna().sum()

movie_id          0
title             0
genre             0
director_x        0
studio            0
domestic_gross    0
foreign_gross     0
year              0
id                0
director_y        0
writer            0
rating            0
dtype: int64

In [712]:
movie_merged2.duplicated().sum()

0

In [713]:
movie_merged2=movie_merged2.copy()

# MOVIE_BUDGET

In [714]:
# loading data for movie budget
movie_budget = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')
movie_budget.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [715]:
movie_budget.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [716]:
movie_budget.describe()

Unnamed: 0,id
count,5782.0
mean,50.372363
std,28.821076
min,1.0
25%,25.0
50%,50.0
75%,75.0
max,100.0


In [717]:
movie_budget.columns

Index(['id', 'release_date', 'movie', 'production_budget', 'domestic_gross',
       'worldwide_gross'],
      dtype='object')

In [718]:
# Convert budget columns in movie_budget to numeric after removing $ and commas
for col in ['production_budget', 'domestic_gross', 'worldwide_gross']:
    movie_budget[col] = movie_budget[col].replace('[\$,]', '', regex=True).astype(float)

# Rename columns in movie_budget to match movie_merged2 if necessary
movie_budget = movie_budget.rename(columns={'movie': 'title'})



In [719]:
movie_budget=movie_budget.rename(columns={' release_date': 'year'})
movie_budget= movie_budget.rename(columns={'worldwide_gross':'foreign_gross'})

In [720]:
movie_budget.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5782 non-null   int64  
 1   release_date       5782 non-null   object 
 2   title              5782 non-null   object 
 3   production_budget  5782 non-null   float64
 4   domestic_gross     5782 non-null   float64
 5   foreign_gross      5782 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 271.2+ KB


In [721]:
movie_budget.isna().sum()

id                   0
release_date         0
title                0
production_budget    0
domestic_gross       0
foreign_gross        0
dtype: int64

In [722]:
movie_budget.duplicated().sum()

0

In [723]:
movie_merged2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72630 entries, 0 to 72629
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   movie_id        72630 non-null  object 
 1   title           72630 non-null  object 
 2   genre           72630 non-null  object 
 3   director_x      72630 non-null  object 
 4   studio          72630 non-null  object 
 5   domestic_gross  72630 non-null  float64
 6   foreign_gross   72630 non-null  float64
 7   year            72630 non-null  int64  
 8   id              72630 non-null  int64  
 9   director_y      72630 non-null  object 
 10  writer          72630 non-null  object 
 11  rating          72630 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 7.2+ MB


In [724]:
movie_budget.columns

Index(['id', 'release_date', 'title', 'production_budget', 'domestic_gross',
       'foreign_gross'],
      dtype='object')

In [725]:
movie_merged3=pd.merge(movie_merged,movie_budget,on='title',how='inner')
movie_merged3.head(20)

Unnamed: 0,movie_id,title,genre,director,studio,domestic_gross_x,foreign_gross_x,year,id,release_date,production_budget,domestic_gross_y,foreign_gross_y
0,tt0337692,On the Road,"Adventure,Drama,Romance",Walter Salles,IFC,744000.0,8000000.0,2012,17,"Mar 22, 2013",25000000.0,720828.0,9313302.0
1,tt2404548,On the Road,Drama,Harutyun Azaryan,IFC,744000.0,8000000.0,2012,17,"Mar 22, 2013",25000000.0,720828.0,9313302.0
2,tt3872966,On the Road,Documentary,Marvin Dau,IFC,744000.0,8000000.0,2012,17,"Mar 22, 2013",25000000.0,720828.0,9313302.0
3,tt4339118,On the Road,Drama,Poong-ki Kim,IFC,744000.0,8000000.0,2012,17,"Mar 22, 2013",25000000.0,720828.0,9313302.0
4,tt5389486,On the Road,Documentary,Maxim Sliussarev,IFC,744000.0,8000000.0,2012,17,"Mar 22, 2013",25000000.0,720828.0,9313302.0
5,tt5647250,On the Road,Drama,Michael Winterbottom,IFC,744000.0,8000000.0,2012,17,"Mar 22, 2013",25000000.0,720828.0,9313302.0
6,tt0359950,The Secret Life of Walter Mitty,"Adventure,Comedy,Drama",Ben Stiller,Fox,58200000.0,129900000.0,2013,37,"Dec 25, 2013",91000000.0,58236838.0,187861200.0
7,tt0365907,A Walk Among the Tombstones,"Action,Crime,Drama",Scott Frank,Uni.,26300000.0,26900000.0,2014,67,"Sep 19, 2014",28000000.0,26017685.0,62108590.0
8,tt0369610,Jurassic World,"Action,Adventure,Sci-Fi",Colin Trevorrow,Uni.,652300000.0,1019.4,2015,34,"Jun 12, 2015",215000000.0,652270625.0,1648855000.0
9,tt0376136,The Rum Diary,"Comedy,Drama",Bruce Robinson,FD,13100000.0,10800000.0,2011,16,"Oct 28, 2011",45000000.0,13109815.0,21544730.0


In [726]:
movie_merged3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1558 entries, 0 to 1557
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movie_id           1558 non-null   object 
 1   title              1558 non-null   object 
 2   genre              1558 non-null   object 
 3   director           1558 non-null   object 
 4   studio             1558 non-null   object 
 5   domestic_gross_x   1558 non-null   float64
 6   foreign_gross_x    1558 non-null   float64
 7   year               1558 non-null   int64  
 8   id                 1558 non-null   int64  
 9   release_date       1558 non-null   object 
 10  production_budget  1558 non-null   float64
 11  domestic_gross_y   1558 non-null   float64
 12  foreign_gross_y    1558 non-null   float64
dtypes: float64(5), int64(2), object(6)
memory usage: 170.4+ KB


In [727]:
movie_merged3=movie_merged3.copy()

In [728]:
# Remove exact duplicate rows
movie_merged3.drop_duplicates(inplace=True)


In [729]:
# Keep only the first occurrence of each title
movie_merged3.drop_duplicates(subset='title', keep='first',inplace=True)


In [730]:
movie_merged3

Unnamed: 0,movie_id,title,genre,director,studio,domestic_gross_x,foreign_gross_x,year,id,release_date,production_budget,domestic_gross_y,foreign_gross_y
0,tt0337692,On the Road,"Adventure,Drama,Romance",Walter Salles,IFC,744000.0,8000000.0,2012,17,"Mar 22, 2013",25000000.0,720828.0,9.313302e+06
6,tt0359950,The Secret Life of Walter Mitty,"Adventure,Comedy,Drama",Ben Stiller,Fox,58200000.0,129900000.0,2013,37,"Dec 25, 2013",91000000.0,58236838.0,1.878612e+08
7,tt0365907,A Walk Among the Tombstones,"Action,Crime,Drama",Scott Frank,Uni.,26300000.0,26900000.0,2014,67,"Sep 19, 2014",28000000.0,26017685.0,6.210859e+07
8,tt0369610,Jurassic World,"Action,Adventure,Sci-Fi",Colin Trevorrow,Uni.,652300000.0,1019.4,2015,34,"Jun 12, 2015",215000000.0,652270625.0,1.648855e+09
9,tt0376136,The Rum Diary,"Comedy,Drama",Bruce Robinson,FD,13100000.0,10800000.0,2011,16,"Oct 28, 2011",45000000.0,13109815.0,2.154473e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553,tt7334528,Uncle Drew,"Comedy,Sport",Charles Stone III,LG/S,42500000.0,4200000.0,2018,85,"Jun 29, 2018",18000000.0,42469946.0,4.652716e+07
1554,tt7349662,BlacKkKlansman,"Biography,Crime,Drama",Spike Lee,Focus,49300000.0,44000000.0,2018,21,"Aug 10, 2018",15000000.0,49275340.0,9.301734e+07
1555,tt7388562,"Paul, Apostle of Christ","Adventure,Biography,Drama",Andrew Hyatt,Affirm,17600000.0,5500000.0,2018,95,"Mar 23, 2018",5000000.0,17547999.0,2.552950e+07
1556,tt7401588,Instant Family,"Comedy,Drama",Sean Anders,Par.,67400000.0,53200000.0,2018,20,"Nov 16, 2018",48000000.0,67363237.0,1.197362e+08


In [731]:
movie_merged3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1158 entries, 0 to 1557
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movie_id           1158 non-null   object 
 1   title              1158 non-null   object 
 2   genre              1158 non-null   object 
 3   director           1158 non-null   object 
 4   studio             1158 non-null   object 
 5   domestic_gross_x   1158 non-null   float64
 6   foreign_gross_x    1158 non-null   float64
 7   year               1158 non-null   int64  
 8   id                 1158 non-null   int64  
 9   release_date       1158 non-null   object 
 10  production_budget  1158 non-null   float64
 11  domestic_gross_y   1158 non-null   float64
 12  foreign_gross_y    1158 non-null   float64
dtypes: float64(5), int64(2), object(6)
memory usage: 126.7+ KB
