In [2]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os
%matplotlib inline

In [4]:
## TN MOVIE BUDGETS

In [5]:
movie_budgets_df = pd.read_csv('./zippedData/tn.movie_budgets.csv.gz', encoding='utf-8')
movie_budgets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [6]:
# Rename 'movie' column to 'movie_name'
movie_budgets_df.rename(columns={'movie': 'movie_name'}, inplace=True)

# Convert 'release_date' to datetime format
movie_budgets_df['release_date'] = pd.to_datetime(movie_budgets_df['release_date'])

# Standardize the 'movie_name' column
movie_budgets_df['movie_name'] = movie_budgets_df['movie_name'].str.strip().str.lower()

# Display the updated column names
print(movie_budgets_df.columns)
# Display the updated DataFrame
print(movie_budgets_df.dtypes)

Index(['id', 'release_date', 'movie_name', 'production_budget',
       'domestic_gross', 'worldwide_gross'],
      dtype='object')
id                            int64
release_date         datetime64[ns]
movie_name                   object
production_budget            object
domestic_gross               object
worldwide_gross              object
dtype: object


In [7]:
movie_budgets_df.isna().sum()

id                   0
release_date         0
movie_name           0
production_budget    0
domestic_gross       0
worldwide_gross      0
dtype: int64

### Explanation:
1. **Extract Year**: The `dt.year` attribute extracts the year from the `release_date` column.
2. **Identify Duplicates**: The `duplicated()` method identifies duplicate `movie_name` entries.
3. **Update Names**: For duplicates, the `movie_name` is updated by appending the release year in parentheses.

This ensures that duplicate movie names are uniquely identified by their release year.

In [8]:
# Extract the year from the 'release_date' column
movie_budgets_df['release_year'] = movie_budgets_df['release_date'].dt.year

# Check for duplicates in 'movie_name'
duplicates = movie_budgets_df[movie_budgets_df.duplicated(subset=['movie_name'], keep=False)]

# Update 'movie_name' for duplicates by appending the release year
movie_budgets_df.loc[duplicates.index, 'movie_name'] = (
    movie_budgets_df.loc[duplicates.index, 'movie_name'] + 
    " (" + movie_budgets_df.loc[duplicates.index, 'release_year'].astype(str) + ")"
)

# Display the updated DataFrame
print(movie_budgets_df[['movie_name', 'release_year']].head())

                                    movie_name  release_year
0                                       avatar          2009
1  pirates of the caribbean: on stranger tides          2011
2                                 dark phoenix          2019
3                      avengers: age of ultron          2015
4            star wars ep. viii: the last jedi          2017


In [9]:
# Filter movie names containing the word 'avenger'
avenger_movies = movie_budgets_df[movie_budgets_df['movie_name'].str.contains('the avenger', case=False, na=False)]

# Display the filtered DataFrame
print(f"Number of movies with 'avenger' in the name: {avenger_movies.shape[0]}")
avenger_movies[['movie_name', 'release_year']].head()

Number of movies with 'avenger' in the name: 2


Unnamed: 0,movie_name,release_year
26,the avengers (2012),2012
934,the avengers (1998),1998


In [10]:
# Remove special characters ('$' and ',') and convert to Int64
movie_budgets_df['production_budget'] = pd.to_numeric(
    movie_budgets_df['production_budget'].astype(str).str.replace('[\$,]', '', regex=True), errors='coerce'
).astype('Int64')

movie_budgets_df['domestic_gross'] = pd.to_numeric(
    movie_budgets_df['domestic_gross'].astype(str).str.replace('[\$,]', '', regex=True), errors='coerce'
).astype('Int64')

movie_budgets_df['worldwide_gross'] = pd.to_numeric(
    movie_budgets_df['worldwide_gross'].astype(str).str.replace('[\$,]', '', regex=True), errors='coerce'
).astype('Int64')

# Display the updated DataFrame
print(movie_budgets_df.dtypes)
movie_budgets_df.head()

id                            int64
release_date         datetime64[ns]
movie_name                   object
production_budget             Int64
domestic_gross                Int64
worldwide_gross               Int64
release_year                  int64
dtype: object


Unnamed: 0,id,release_date,movie_name,production_budget,domestic_gross,worldwide_gross,release_year
0,1,2009-12-18,avatar,425000000,760507625,2776345279,2009
1,2,2011-05-20,pirates of the caribbean: on stranger tides,410600000,241063875,1045663875,2011
2,3,2019-06-07,dark phoenix,350000000,42762350,149762350,2019
3,4,2015-05-01,avengers: age of ultron,330600000,459005868,1403013963,2015
4,5,2017-12-15,star wars ep. viii: the last jedi,317000000,620181382,1316721747,2017


In [11]:
# Drop rows where 'production_budget' is less than 0
movie_budgets_df = movie_budgets_df[movie_budgets_df['production_budget'] >= 0]

# Display the updated DataFrame
print(f"Shape after dropping rows with production_budget < 0: {movie_budgets_df.shape}")

Shape after dropping rows with production_budget < 0: (5782, 7)


In [24]:
# Drop rows where both 'domestic_gross' and 'worldwide_gross' are 0
movie_budgets_df = movie_budgets_df[
    ~((movie_budgets_df['domestic_gross'] == 0) & (movie_budgets_df['worldwide_gross'] == 0))
]

# Display the updated DataFrame shape
print(f"Shape after dropping rows with zero gross: {movie_budgets_df.shape}")

Shape after dropping rows with zero gross: (5415, 7)


In [31]:
output_folder = './cleaned_data'
os.makedirs(output_folder, exist_ok=True)

# Save movie_basics_df to a CSV file
movie_budgets_df.to_csv(f'{output_folder}/movie_budgets_df.csv', index=False)

print("DataFrames have been successfully saved to the 'extracted' folder.")

DataFrames have been successfully saved to the 'extracted' folder.


## COMBINE SQL DATA WITH MOVIE BUDGETS

To combine movies_df.csv and movie_budgets_df, you can use the following code:

In [34]:
# Load the cleaned movies_df.csv
movies_df = pd.read_csv('./cleaned_data/movies_df.csv')
movie_budgets_df = pd.read_csv('./cleaned_data/movie_budgets_df.csv')

movies_df.info()
print('-----------------------------')
movie_budgets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10010 entries, 0 to 10009
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   movie_id         10010 non-null  object 
 1   primary_title    10010 non-null  object 
 2   start_year       10010 non-null  int64  
 3   runtime_minutes  10010 non-null  int64  
 4   genres           10010 non-null  object 
 5   averagerating    10010 non-null  float64
 6   numvotes         10010 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 547.5+ KB
-----------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5415 entries, 0 to 5414
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5415 non-null   int64 
 1   release_date       5415 non-null   object
 2   movie_name         5415 non-null   object
 3   production_budget  5415 non-null   int

In [39]:
movies_df.head()
# print('-----------------------------')
# movies_df.head()

Unnamed: 0,movie_id,primary_title,start_year,runtime_minutes,genres,averagerating,numvotes
0,tt0069049,the other side of the wind,2018,122,drama,6.9,4517
1,tt0249516,foodfight!,2012,91,"action,animation,comedy",1.9,8248
2,tt0315642,wazir,2016,103,"action,crime,drama",7.1,15378
3,tt0323808,the wicker tree,2011,96,"drama,horror",3.9,2328
4,tt0326965,in my sleep,2010,104,"drama,mystery,thriller",5.5,1889


In [40]:
# change primary_title to movie_name, start_year to release_year
movies_df.rename(columns={'primary_title': 'movie_name', 'start_year': 'release_year'}, inplace=True)

# Convert 'release_year' to datetime format
movies_df['release_year'] = pd.to_datetime(movies_df['release_year'], format='%Y', errors='coerce')

In [41]:
# Merge movies_df and movie_budgets_df on 'movie_name'
combined_data = pd.merge(
    movies_df,
    movie_budgets_df,
    on='movie_name',
    how='inner',  # Use 'inner' to keep only matching rows
    suffixes=('_movies', '_budgets')  # Add suffixes to distinguish columns
)

# Display the shape and the first few rows of the combined DataFrame
print(f"Shape of the combined DataFrame: {combined_data.shape}")
combined_data.head()

Shape of the combined DataFrame: (1426, 13)


Unnamed: 0,movie_id,movie_name,release_year_movies,runtime_minutes,genres,averagerating,numvotes,id,release_date,production_budget,domestic_gross,worldwide_gross,release_year_budgets
0,tt0249516,foodfight!,2012-01-01,91,"action,animation,comedy",1.9,8248,26,2012-12-31,45000000,0,73706,2012
1,tt0359950,the secret life of walter mitty,2013-01-01,114,"adventure,comedy,drama",7.3,275300,37,2013-12-25,91000000,58236838,187861183,2013
2,tt0365907,a walk among the tombstones,2014-01-01,114,"action,crime,drama",6.5,105116,67,2014-09-19,28000000,26017685,62108587,2014
3,tt0369610,jurassic world,2015-01-01,124,"action,adventure,sci-fi",7.0,539338,34,2015-06-12,215000000,652270625,1648854864,2015
4,tt0376136,the rum diary,2011-01-01,119,"comedy,drama",6.2,94787,16,2011-10-28,45000000,13109815,21544732,2011


In [42]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1426 entries, 0 to 1425
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   movie_id              1426 non-null   object        
 1   movie_name            1426 non-null   object        
 2   release_year_movies   1426 non-null   datetime64[ns]
 3   runtime_minutes       1426 non-null   int64         
 4   genres                1426 non-null   object        
 5   averagerating         1426 non-null   float64       
 6   numvotes              1426 non-null   int64         
 7   id                    1426 non-null   int64         
 8   release_date          1426 non-null   object        
 9   production_budget     1426 non-null   int64         
 10  domestic_gross        1426 non-null   int64         
 11  worldwide_gross       1426 non-null   int64         
 12  release_year_budgets  1426 non-null   int64         
dtypes: datetime64[ns](

In [43]:
# check for duplicates in the combined DataFrame
duplicates_combined = combined_data[combined_data.duplicated(subset=['movie_name'], keep=False)]
duplicates_combined

Unnamed: 0,movie_id,movie_name,release_year_movies,runtime_minutes,genres,averagerating,numvotes,id,release_date,production_budget,domestic_gross,worldwide_gross,release_year_budgets


In [45]:
# check if release_years match in the combined DataFrame
release_years_match = combined_data[
    combined_data['release_year_movies'] != combined_data['release_year_budgets']
]
release_years_match[['movie_name', 'release_year_movies', 'release_year_budgets']].head(10)

Unnamed: 0,movie_name,release_year_movies,release_year_budgets
0,foodfight!,2012-01-01,2012
1,the secret life of walter mitty,2013-01-01,2013
2,a walk among the tombstones,2014-01-01,2014
3,jurassic world,2015-01-01,2015
4,the rum diary,2011-01-01,2011
5,the three stooges,2012-01-01,2012
6,tangled,2010-01-01,2010
7,john carter,2012-01-01,2012
8,action jackson,2014-01-01,1988
9,dinner for schmucks,2010-01-01,2010


In [50]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1426 entries, 0 to 1425
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movie_id              1426 non-null   object 
 1   movie_name            1426 non-null   object 
 2   release_year_movies   1426 non-null   int64  
 3   runtime_minutes       1426 non-null   int64  
 4   genres                1426 non-null   object 
 5   averagerating         1426 non-null   float64
 6   numvotes              1426 non-null   int64  
 7   id                    1426 non-null   int64  
 8   release_date          1426 non-null   object 
 9   production_budget     1426 non-null   int64  
 10  domestic_gross        1426 non-null   int64  
 11  worldwide_gross       1426 non-null   int64  
 12  release_year_budgets  1426 non-null   int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 156.0+ KB


In [51]:
# Convert 'release_year_movies' and 'release_year_budgets' to year type
combined_data['release_year_movies'] = pd.to_datetime(combined_data['release_year_movies'], errors='coerce').dt.year
combined_data['release_year_budgets'] = pd.to_datetime(combined_data['release_year_budgets'], errors='coerce').dt.year

# Display the updated DataFrame
print(combined_data[['movie_name', 'release_year_movies', 'release_year_budgets']].head())

                        movie_name  release_year_movies  release_year_budgets
0                       foodfight!                 1970                  1970
1  the secret life of walter mitty                 1970                  1970
2      a walk among the tombstones                 1970                  1970
3                   jurassic world                 1970                  1970
4                    the rum diary                 1970                  1970


In [52]:
# Check if 'release_year_movies' and 'release_year_budgets' are not equal
mismatched_years = combined_data[combined_data['release_year_movies'] != combined_data['release_year_budgets']]

# Display the count and the mismatched rows
print(f"Number of rows with mismatched release years: {mismatched_years.shape[0]}")
mismatched_years[['movie_name', 'release_year_movies', 'release_year_budgets']].head()

Number of rows with mismatched release years: 0


Unnamed: 0,movie_name,release_year_movies,release_year_budgets


In [53]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1426 entries, 0 to 1425
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movie_id              1426 non-null   object 
 1   movie_name            1426 non-null   object 
 2   release_year_movies   1426 non-null   int64  
 3   runtime_minutes       1426 non-null   int64  
 4   genres                1426 non-null   object 
 5   averagerating         1426 non-null   float64
 6   numvotes              1426 non-null   int64  
 7   id                    1426 non-null   int64  
 8   release_date          1426 non-null   object 
 9   production_budget     1426 non-null   int64  
 10  domestic_gross        1426 non-null   int64  
 11  worldwide_gross       1426 non-null   int64  
 12  release_year_budgets  1426 non-null   int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 156.0+ KB


In [None]:
# drop irelevant columns
drop_columns = [ 'id', 'release_year_budgets']
combined_data.drop(columns=drop_columns, inplace=True)
# Display the updated DataFrame


    movie_id                       movie_name  release_year_movies  \
0  tt0249516                       foodfight!                 1970   
1  tt0359950  the secret life of walter mitty                 1970   
2  tt0365907      a walk among the tombstones                 1970   
3  tt0369610                   jurassic world                 1970   
4  tt0376136                    the rum diary                 1970   

   runtime_minutes                   genres  averagerating  numvotes  \
0               91  action,animation,comedy            1.9      8248   
1              114   adventure,comedy,drama            7.3    275300   
2              114       action,crime,drama            6.5    105116   
3              124  action,adventure,sci-fi            7.0    539338   
4              119             comedy,drama            6.2     94787   

  release_date  production_budget  domestic_gross  worldwide_gross  
0   2012-12-31           45000000               0            73706  
1   2013

In [55]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1426 entries, 0 to 1425
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   movie_id             1426 non-null   object 
 1   movie_name           1426 non-null   object 
 2   release_year_movies  1426 non-null   int64  
 3   runtime_minutes      1426 non-null   int64  
 4   genres               1426 non-null   object 
 5   averagerating        1426 non-null   float64
 6   numvotes             1426 non-null   int64  
 7   release_date         1426 non-null   object 
 8   production_budget    1426 non-null   int64  
 9   domestic_gross       1426 non-null   int64  
 10  worldwide_gross      1426 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 133.7+ KB


In [56]:
output_folder = './cleaned_data'
os.makedirs(output_folder, exist_ok=True)

# Save movie_basics_df to a CSV file
combined_data.to_csv(f'{output_folder}/cleaned_movie_budgets_df.csv', index=False)

print("DataFrames have been successfully saved to the 'extracted' folder.")

DataFrames have been successfully saved to the 'extracted' folder.


To find the excluded movie_name values (i.e., those present in movie_budgets_df but not in movies_df), you can use the following code:

In [None]:
# Identify excluded movie_names
# excluded_movie_names = movie_budgets_df[~movie_budgets_df['movie_name'].isin(movies_df['movie_name'])]

# # Display the excluded movie names
# print(f"Number of excluded movie names: {excluded_movie_names.shape[0]}")
# excluded_movie_names[['movie_name']].head()