# Persons

In [312]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os

%matplotlib inline

In [313]:
# connect to the SQLite database
conn = sqlite3.connect('./extractedData/im.db')
cursor = conn.cursor()

In [314]:
# read the persons table into a pandas DataFrame
# and display its information
persons_df = pd.read_sql("SELECT * FROM persons;", conn)
persons_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606648 entries, 0 to 606647
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   person_id           606648 non-null  object 
 1   primary_name        606648 non-null  object 
 2   birth_year          82736 non-null   float64
 3   death_year          6783 non-null    float64
 4   primary_profession  555308 non-null  object 
dtypes: float64(2), object(3)
memory usage: 23.1+ MB


In [315]:
# Check for missing values
persons_df.isnull().sum()

person_id                  0
primary_name               0
birth_year            523912
death_year            599865
primary_profession     51340
dtype: int64

## Data Preparation for Analysis
### Data Cleaning

We drop rows where death_year is not null.

Drop irrelevant columns

In [317]:
# Drop rows where death_year is not null
persons_df = persons_df[persons_df['death_year'].isna()]

# Drop the 'birth_year', 'death_year' columns
persons_df = persons_df.drop(columns=['birth_year', 'death_year'])

# Reset the index of the DataFrame
persons_df.reset_index(drop=True, inplace=True)


In [318]:
# Display the updated DataFrame
persons_df.dtypes

person_id             object
primary_name          object
primary_profession    object
dtype: object

duplicate_persons_primary_name helps identify and preview individuals with non-unique primary names in the dataset. We can say that many people can share the same name. We can not use primary_name to clean our data

In [319]:
# view duplicates using primary_name
duplicate_persons_primary_name = persons_df[persons_df.duplicated(subset=['primary_name'], keep=False)]
duplicate_persons_primary_name = duplicate_persons_primary_name.sort_values('primary_name')
duplicate_persons_primary_name.head()

Unnamed: 0,person_id,primary_name,primary_profession
381053,nm8956236,A. Venkatesh,producer
151115,nm1701176,A. Venkatesh,"cinematographer,camera_department,editor"
124660,nm10275444,A. Venkatesh,director
273669,nm4062141,A. Venkatesh,"director,actor,writer"
429984,nm6758318,A.J. Khan,producer


In [235]:
# Drop rows where primary_profession is 'miscellaneous' only
persons_df = persons_df[persons_df['primary_profession'].str.strip().str.lower() != 'miscellaneous']

# Display the updated DataFrame
print(f"Shape after dropping rows with primary_profession as 'miscellaneous': {persons_df.shape}")

Shape after dropping rows with primary_profession as 'miscellaneous': (74773, 4)


In [320]:
# Check for missing values again
persons_df.isna().sum()

person_id                 0
primary_name              0
primary_profession    50548
dtype: int64

In [321]:
# Display the updated DataFrame information
print("Updated DataFrame information after cleaning:")
persons_df.info()

Updated DataFrame information after cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599865 entries, 0 to 599864
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   person_id           599865 non-null  object
 1   primary_name        599865 non-null  object
 2   primary_profession  549317 non-null  object
dtypes: object(3)
memory usage: 13.7+ MB


In [322]:
# close the database connection
conn.close()

In [323]:
# Create 'extracted' folder if it doesn't exist
output_folder = './cleaned_data'
os.makedirs(output_folder, exist_ok=True)

# Save persons_df to a CSV file
persons_df.to_csv(f'{output_folder}/cleaned_persons_df.csv', index=False)

print("DataFrames have been successfully saved to the 'extracted' folder.")

DataFrames have been successfully saved to the 'extracted' folder.


## WRITERS

In [324]:
conn = sqlite3.connect('./extractedData/im.db')
cursor = conn.cursor()

In [325]:
writers_df = pd.read_sql("SELECT * FROM writers;", conn)
print(writers_df.shape)
writers_df.head()


(255873, 2)


Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0438973,nm0175726
2,tt0438973,nm1802864
3,tt0462036,nm1940585
4,tt0835418,nm0310087


In [326]:
# Check for missing values
writers_df.isna().sum()

movie_id     0
person_id    0
dtype: int64

In [327]:
# view duplicates using movie_id and person_id

duplicate_writers = writers_df[writers_df.duplicated(subset=['movie_id', 'person_id'], keep=False)]

duplicate_writers.count()

movie_id     104011
person_id    104011
dtype: int64

## Data Preparation for Analysis
### Data Cleaning

In [329]:
# drop duplicates from the DataFrame. retain the first occurrence
writers_df = writers_df.drop_duplicates(subset=['movie_id', 'person_id'], keep='first')

# check for duplicates in the DataFrame
duplicate_writers = writers_df.duplicated(subset=['movie_id', 'person_id'])

# count the number of duplicates
duplicate_writers.count()

178352

In [330]:
writers_df.shape

(178352, 2)

In [331]:
# Create 'cleaned_data' folder if it doesn't exist
output_folder = './cleaned_data'
os.makedirs(output_folder, exist_ok=True)

# Save writers_df to a CSV file
writers_df.to_csv(f'{output_folder}/cleaned_writers_df.csv', index=False)

print("writers_df has been successfully saved to the 'cleaned_data' folder.")

writers_df has been successfully saved to the 'cleaned_data' folder.


many movies has many writers

# Directors

In [332]:
directors_df = pd.read_sql("SELECT * FROM directors;", conn)
print(directors_df.shape)
directors_df.head()

(291174, 2)


Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0462036,nm1940585
2,tt0835418,nm0151540
3,tt0835418,nm0151540
4,tt0878654,nm0089502


## Data Preparation for Analysis
### Data Cleaning

In [333]:
# Check for missing values
directors_df.isna().sum()

movie_id     0
person_id    0
dtype: int64

In [334]:
# view duplicates using movie_id and person_id

duplicate_directors = directors_df[directors_df.duplicated(subset=['movie_id', 'person_id'], keep='first')]

duplicate_directors.count()

movie_id     127639
person_id    127639
dtype: int64

In [335]:
# drop duplicates from the DataFrame
directors_df = directors_df.drop_duplicates(subset=['movie_id', 'person_id'], keep='first')

# check for duplicates in the DataFrame
duplicate_directors = directors_df.duplicated(subset=['movie_id', 'person_id'])

# count the number of duplicates
duplicate_directors.count()

163535

In [337]:
# Create 'cleaned_data' folder if it doesn't exist
output_folder = './cleaned_data'
os.makedirs(output_folder, exist_ok=True)

# Save writers_df to a CSV file
directors_df.to_csv(f'{output_folder}/cleaned_directors_df.csv', index=False)

print("directors_df has been successfully saved to the 'cleaned_data' folder.")

directors_df has been successfully saved to the 'cleaned_data' folder.


In [338]:
#  close the database connection
conn.close()

In [339]:
cleaned_persons_df = pd.read_csv('./cleaned_data/cleaned_persons_df.csv')
cleaned_persons_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599865 entries, 0 to 599864
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   person_id           599865 non-null  object
 1   primary_name        599865 non-null  object
 2   primary_profession  549317 non-null  object
dtypes: object(3)
memory usage: 13.7+ MB


In [340]:
# get cleaned writers_df and cleaned directors_df
cleaned_writers_df = pd.read_csv('./cleaned_data/cleaned_writers_df.csv')
cleaned_writers_df.info()

print('----'*20)

cleaned_directors_df = pd.read_csv('./cleaned_data/cleaned_directors_df.csv')
cleaned_directors_df.info()
# check the shape of cleaned_writers_df and cleaned_directors_df
cleaned_writers_df.shape, cleaned_directors_df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178352 entries, 0 to 178351
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   movie_id   178352 non-null  object
 1   person_id  178352 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163535 entries, 0 to 163534
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   movie_id   163535 non-null  object
 1   person_id  163535 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


((178352, 2), (163535, 2))

GET WRITERS DETAILS USING PERSON_ID USE persons_df. GET MOVIES WRITTEN BY THE WRITER USING MOVIE_ID USE movies_df


To count the number of `person_id` values in `cleaned_writers_df` that are found in `persons_df`



In [None]:
# # Check for duplicates in writer_details using 'movie_id' and 'person_id'
# duplicate_writers = writer_details.duplicated(subset=['movie_id', 'person_id'])

# # Count the number of duplicates
# num_duplicates = duplicate_writers.sum()
# print(f"Number of duplicate rows in writer_details: {num_duplicates}")

# # Display the duplicate rows if any
# if num_duplicates > 0:
#     duplicate_rows = writer_details[duplicate_writers]
#     print("Duplicate rows:")
#     print(duplicate_rows)
# else:
#     print("No duplicate rows found.")

Number of duplicate rows in writer_details: 0
No duplicate rows found.


In [None]:
# # drop irrelevant columns from writer_details DataFrame
# writer_details = writer_details.drop(columns=['birth_year'])
# # Reset the index of the DataFrame
# writer_details.reset_index(drop=True, inplace=True)
# # Display the updated DataFrame
# writer_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34871 entries, 0 to 34870
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   movie_id            34871 non-null  object
 1   person_id           34871 non-null  object
 2   primary_name        34871 non-null  object
 3   primary_profession  34871 non-null  object
dtypes: object(4)
memory usage: 1.1+ MB


We can say that 1 person writes many movies

In [None]:
# # drop irrelevant columns from director_details DataFrame
# director_details = writer_details.drop(columns=['birth_year'])
# # Reset the index of the DataFrame
# writer_details.reset_index(drop=True, inplace=True)
# # Display the updated DataFrame
# writer_details.info()

In [341]:
# combine movie_budgets, directors_details and writers_details
movie_budgets_df = pd.read_csv('./cleaned_data/movie_budgets.csv')
movie_budgets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1381 entries, 0 to 1380
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movie_id           1381 non-null   object 
 1   movie_name         1381 non-null   object 
 2   release_year       1381 non-null   int64  
 3   runtime_minutes    1381 non-null   int64  
 4   genres             1381 non-null   object 
 5   averagerating      1381 non-null   float64
 6   numvotes           1381 non-null   int64  
 7   release_date       1381 non-null   object 
 8   production_budget  1381 non-null   int64  
 9   domestic_gross     1381 non-null   int64  
 10  worldwide_gross    1381 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 118.8+ KB


To get director and writer names from cleaned_persons_df, you can merge it with cleaned_directors_df and cleaned_writers_df on the person_id column.

In [342]:
# Merge cleaned_directors_df with cleaned_persons_df to get director names
directors_details = pd.merge(cleaned_directors_df, cleaned_persons_df, on='person_id', how='inner')

# Merge cleaned_writers_df with cleaned_persons_df to get writer names
writers_details = pd.merge(cleaned_writers_df, cleaned_persons_df, on='person_id', how='inner')

# Save the resulting DataFrames to CSV files
# output_folder = './cleaned_data'
# os.makedirs(output_folder, exist_ok=True)
# directors_details.to_csv(f'{output_folder}/directors_details.csv', index=False)
# writers_details.to_csv(f'{output_folder}/writers_details.csv', index=False)

# Display the shapes and first few rows of the resulting DataFrames

# directors_details.head(), 
writers_details.head()

Unnamed: 0,movie_id,person_id,primary_name,primary_profession
0,tt0285252,nm0899854,Tony Vitale,"producer,director,writer"
1,tt0438973,nm0175726,Steve Conrad,"writer,producer,director"
2,tt2358925,nm0175726,Steve Conrad,"writer,producer,director"
3,tt2543472,nm0175726,Steve Conrad,"writer,producer,director"
4,tt0359950,nm0175726,Steve Conrad,"writer,producer,director"


To merge `movie_budgets_df`, `directors_details`, and `writers_details` on the `movie_id` column



In [343]:
# Merge movie_budgets_df, directors_details, and writers_details on 'movie_id'
merged_df = pd.merge(movie_budgets_df, directors_details, on='movie_id', how='inner')
merged_df = pd.merge(merged_df, writers_details, on='movie_id', how='inner')

# Rename columns for clarity
merged_df.rename(columns={
    'person_id_x': 'director_person_id',  # Rename director's person_id column
    'person_id_y': 'writer_person_id',    # Rename writer's person_id column    
}, inplace=True)

# Display the resulting DataFrame
print(f"Shape of the merged DataFrame: {merged_df.shape}")
merged_df.head()

Shape of the merged DataFrame: (3998, 17)


Unnamed: 0,movie_id,movie_name,release_year,runtime_minutes,genres,averagerating,numvotes,release_date,production_budget,domestic_gross,worldwide_gross,director_person_id,primary_name_x,primary_profession_x,writer_person_id,primary_name_y,primary_profession_y
0,tt0249516,foodfight!,2012,91,"action,animation,comedy",1.9,8248,2012-12-31,45000000,0,73706,nm0440415,Lawrence Kasanoff,"producer,writer,director",nm0220297,Sean Catherine Derek,"writer,miscellaneous,producer"
1,tt0249516,foodfight!,2012,91,"action,animation,comedy",1.9,8248,2012-12-31,45000000,0,73706,nm0440415,Lawrence Kasanoff,"producer,writer,director",nm0440415,Lawrence Kasanoff,"producer,writer,director"
2,tt0249516,foodfight!,2012,91,"action,animation,comedy",1.9,8248,2012-12-31,45000000,0,73706,nm0440415,Lawrence Kasanoff,"producer,writer,director",nm0923312,Joshua Wexler,"producer,writer,director"
3,tt0249516,foodfight!,2012,91,"action,animation,comedy",1.9,8248,2012-12-31,45000000,0,73706,nm0440415,Lawrence Kasanoff,"producer,writer,director",nm0295165,Brent V. Friedman,"producer,writer,director"
4,tt0249516,foodfight!,2012,91,"action,animation,comedy",1.9,8248,2012-12-31,45000000,0,73706,nm0440415,Lawrence Kasanoff,"producer,writer,director",nm0841854,Rebecca Swanson,"writer,miscellaneous"




### Explanation:
1. **First Merge**:
   - `pd.merge(movie_budgets_df, directors_details, on='movie_id', how='inner')`: Merges `movie_budgets_df` and `directors_details` on the `movie_id` column using an inner join.
2. **Second Merge**:
   - `pd.merge(merged_df, writers_details, on='movie_id', how='inner')`: Merges the result of the first merge with `writers_details` on the `movie_id` column using an inner join.
3. **Result**:
   - The resulting `merged_df` contains data from all three DataFrames where `movie_id` matches in all.

In [344]:
merged_df.columns

Index(['movie_id', 'movie_name', 'release_year', 'runtime_minutes', 'genres',
       'averagerating', 'numvotes', 'release_date', 'production_budget',
       'domestic_gross', 'worldwide_gross', 'director_person_id',
       'primary_name_x', 'primary_profession_x', 'writer_person_id',
       'primary_name_y', 'primary_profession_y'],
      dtype='object')

In [345]:
# Rename columns for clarity
merged_df.rename(columns={    
    'primary_name_x': 'director_name',  # Rename director's name column
    'primary_name_y': 'writer_name',    # Rename writer's name column
    'primary_profession_x': 'director_profession',  # Rename director's profession column
    'primary_profession_y': 'writer_profession',    # Rename writer's profession column
}, inplace=True)

To drop duplicate values in `director_ids`, `director_names`, `writer_ids`, and `writer_names`



In [346]:
# Group directors' IDs and names by movie_id, ensuring unique values
directors_grouped = merged_df.groupby('movie_id').agg({
    'director_person_id': lambda x: ', '.join(map(str, sorted(set(x)))),
    'director_name': lambda x: ', '.join(sorted(set(x)))
}).reset_index()

# Rename columns for directors
directors_grouped.rename(columns={
    'director_person_id': 'director_ids',
    'director_name': 'director_names'
}, inplace=True)

# Group writers' IDs and names by movie_id, ensuring unique values
writers_grouped = merged_df.groupby('movie_id').agg({
    'writer_person_id': lambda x: ', '.join(map(str, sorted(set(x)))),
    'writer_name': lambda x: ', '.join(sorted(set(x)))
}).reset_index()

# Rename columns for writers
writers_grouped.rename(columns={
    'writer_person_id': 'writer_ids',
    'writer_name': 'writer_names'
}, inplace=True)

# Merge the grouped data back into a single DataFrame
director_writers_df = pd.merge(directors_grouped, writers_grouped, on='movie_id', how='inner')

# Display the resulting DataFrame
print(f"Shape of the compressed DataFrame: {director_writers_df.shape}")
director_writers_df.tail()

Shape of the compressed DataFrame: (1362, 5)


Unnamed: 0,movie_id,director_ids,director_names,writer_ids,writer_names
1357,tt8043306,nm6773153,Ahsan Rahim,"nm3773554, nm6511211, nm6773153","Ahsan Rahim, Ali Zafar, Danyal Zafar"
1358,tt8155288,nm0484907,Christopher Landon,"nm0484907, nm1245146","Christopher Landon, Scott Lobdell"
1359,tt8580348,nm1919456,Manolo Caro,"nm0002645, nm0182499, nm0712330, nm2601560, nm...","Filippo Bologna, Paola Mammini, Paolo Costella..."
1360,tt8632862,nm0601619,Michael Moore,nm0601619,Michael Moore
1361,tt9024106,"nm0465484, nm0813301","Cary Solomon, Chuck Konzelman","nm0465484, nm0813301","Cary Solomon, Chuck Konzelman"


In [347]:
# check for null values in the DataFrame
director_writers_df.isna().sum()

movie_id          0
director_ids      0
director_names    0
writer_ids        0
writer_names      0
dtype: int64

In [348]:
# check if director_ids has separeted values
# director_writers_df['director_ids'].str.split(',').apply(lambda x: len(x) > 1).any()
# #  diplay director_ids has separeted values
x = director_writers_df[director_writers_df['director_ids'].str.split(',').apply(lambda x: len(x) > 1)].head(10)
# print(director_writers_df['director_ids'].str.split(',').apply(lambda x: len(x) > 1).sum())
x.head()

Unnamed: 0,movie_id,director_ids,director_names,writer_ids,writer_names
5,tt0383010,"nm0268370, nm0268380","Bobby Farrelly, Peter Farrelly","nm0148808, nm0268370, nm0268380","Bobby Farrelly, Mike Cerrone, Peter Farrelly"
6,tt0398286,"nm0397174, nm1977355","Byron Howard, Nathan Greno",nm1557594,Dan Fogelman
27,tt0458481,"nm0001675, nm0588340","Frank Miller, Robert Rodriguez",nm0588340,Frank Miller
34,tt0475290,"nm0001053, nm0001054","Ethan Coen, Joel Coen","nm0001053, nm0001054","Ethan Coen, Joel Coen"
38,tt0480687,"nm0268370, nm0268380","Bobby Farrelly, Peter Farrelly","nm0055937, nm0268370, nm0268380, nm0971919","Bobby Farrelly, Kevin Barnett, Pete Jones, Pet..."


In [349]:
# check if nm0004056 is in the director_ids
director_writers_df['director_ids'].str.contains('nm0004056').any()

True



### Explanation:
1. **Remove Duplicates**:
   - Use `set(x)` to ensure unique values for `director_person_id`, `director_name`, `writer_person_id`, and `writer_name`.
   - Use `sorted(set(x))` to sort the unique values for consistent ordering.

2. **Aggregation**:
   - Use `', '.join(...)` to concatenate the unique values into a single string.

3. **Result**:
   - The resulting `compressed_df` will have unique and sorted values in `director_ids`, `director_names`, `writer_ids`, and `writer_names`.

Let me know if you need further clarification!

To merge `director_writers_df` and `movie_budgets_df` on `movie_id`, you can use the following code:



In [350]:
# Merge director_writers_df and movie_budgets_df on 'movie_id'
final_merged_df = pd.merge(director_writers_df, movie_budgets_df, on='movie_id', how='inner')

# Display the resulting DataFrame
print(f"Shape of the final merged DataFrame: {final_merged_df.shape}")
final_merged_df.head()

Shape of the final merged DataFrame: (1362, 15)


Unnamed: 0,movie_id,director_ids,director_names,writer_ids,writer_names,movie_name,release_year,runtime_minutes,genres,averagerating,numvotes,release_date,production_budget,domestic_gross,worldwide_gross
0,tt0249516,nm0440415,Lawrence Kasanoff,"nm0220297, nm0295165, nm0440415, nm0841854, nm...","Brent V. Friedman, Joshua Wexler, Lawrence Kas...",foodfight!,2012,91,"action,animation,comedy",1.9,8248,2012-12-31,45000000,0,73706
1,tt0359950,nm0001774,Ben Stiller,nm0175726,Steve Conrad,the secret life of walter mitty,2013,114,"adventure,comedy,drama",7.3,275300,2013-12-25,91000000,58236838,187861183
2,tt0365907,nm0291082,Scott Frank,"nm0088747, nm0291082","Lawrence Block, Scott Frank",a walk among the tombstones,2014,114,"action,crime,drama",6.5,105116,2014-09-19,28000000,26017685,62108587
3,tt0369610,nm1119880,Colin Trevorrow,"nm0415425, nm0798646, nm1119880, nm2081046","Amanda Silver, Colin Trevorrow, Derek Connolly...",jurassic world,2015,124,"action,adventure,sci-fi",7.0,539338,2015-06-12,215000000,652270625,1648854864
4,tt0376136,nm0732430,Bruce Robinson,nm0732430,Bruce Robinson,the rum diary,2011,119,"comedy,drama",6.2,94787,2011-10-28,45000000,13109815,21544732


In [353]:
# check if tt0448115 exists in the final_merged_df
cleaned_persons_df['person_id'].str.contains('nm0000127').any()

False

In [352]:
# Perform the merge with an indicator column
merged_with_indicator = pd.merge(
    director_writers_df, 
    movie_budgets_df, 
    on='movie_id', 
    how='outer', 
    indicator=True
)

# Filter rows that are only in movie_budgets_df
only_in_movie_budgets = merged_with_indicator[merged_with_indicator['_merge'] == 'right_only']

# Display the rows that are only in movie_budgets_df
print(f"Number of rows only in movie_budgets_df: {only_in_movie_budgets.shape[0]}")
only_in_movie_budgets.head()

Number of rows only in movie_budgets_df: 19


Unnamed: 0,movie_id,director_ids,director_names,writer_ids,writer_names,movie_name,release_year,runtime_minutes,genres,averagerating,numvotes,release_date,production_budget,domestic_gross,worldwide_gross,_merge
1362,tt0817230,,,,,valentine's day,2010,125,"comedy,romance",5.7,107171,2010-02-12,52000000,110485654,217569328,right_only
1363,tt0872230,,,,,my soul to take,2010,107,"horror,mystery,thriller",4.8,18381,2010-10-08,25000000,14744435,16727470,right_only
1364,tt0892318,,,,,letters to juliet,2010,105,"adventure,comedy,drama",6.6,86137,2010-05-14,30000000,53032453,82148538,right_only
1365,tt1194417,,,,,casino jack,2010,108,"biography,comedy,crime",6.2,16191,2010-12-17,12500000,2039869,2272186,right_only
1366,tt1262416,,,,,scream 4,2011,111,"horror,mystery",6.2,118779,2011-04-15,40000000,38180928,95989590,right_only




### Explanation:
1. **Merge Operation**:
   - `pd.merge(director_writers_df, movie_budgets_df, on='movie_id', how='inner')` merges the two DataFrames on the `movie_id` column using an inner join.
   - This ensures that only rows with matching `movie_id` in both DataFrames are included.

2. **Result**:
   - The resulting `final_merged_df` contains all columns from both `director_writers_df` and `movie_budgets_df`.

Let me know if you need further assistance!

In [None]:
#final_merged_df get actors from cleaned_persons_df and principals

To get actors from `cleaned_persons_df` and `principals` and merge them into `final_merged_df`, you can use the following code:



In [357]:
# connect to the SQLite database
conn = sqlite3.connect('./extractedData/im.db')
cursor = conn.cursor()

In [358]:
# Load the principals table
principals_df = pd.read_sql("SELECT * FROM principals;", conn)

# Filter principals to include only actors
actors_df = principals_df[principals_df['category'].str.lower() == 'actor']

# Merge actors with cleaned_persons_df to get actor names
actors_details = pd.merge(actors_df, cleaned_persons_df, on='person_id', how='inner')

# Group actors' IDs and names by movie_id, ensuring unique values
actors_grouped = actors_details.groupby('movie_id').agg({
    'person_id': lambda x: ', '.join(map(str, sorted(set(x)))),  # Unique and sorted actor IDs
    'primary_name': lambda x: ', '.join(sorted(set(x)))  # Unique and sorted actor names
}).reset_index()

# Rename columns for clarity
actors_grouped.rename(columns={
    'person_id': 'actor_ids',
    'primary_name': 'actor_names'
}, inplace=True)

# Merge actors_grouped with final_merged_df
final_merged_df_2 = pd.merge(final_merged_df, actors_grouped, on='movie_id', how='inner')

# Display the resulting DataFrame
print(f"Shape of the final merged DataFrame with actors: {final_merged_df_2.shape}")
final_merged_df_2.head()

Shape of the final merged DataFrame with actors: (1318, 17)


Unnamed: 0,movie_id,director_ids,director_names,writer_ids,writer_names,movie_name,release_year,runtime_minutes,genres,averagerating,numvotes,release_date,production_budget,domestic_gross,worldwide_gross,actor_ids,actor_names
0,tt0249516,nm0440415,Lawrence Kasanoff,"nm0220297, nm0295165, nm0440415, nm0841854, nm...","Brent V. Friedman, Joshua Wexler, Lawrence Kas...",foodfight!,2012,91,"action,animation,comedy",1.9,8248,2012-12-31,45000000,0,73706,nm0000221,Charlie Sheen
1,tt0359950,nm0001774,Ben Stiller,nm0175726,Steve Conrad,the secret life of walter mitty,2013,114,"adventure,comedy,drama",7.3,275300,2013-12-25,91000000,58236838,187861183,"nm0001774, nm1789985","Ben Stiller, Jon Daly"
2,tt0365907,nm0291082,Scott Frank,"nm0088747, nm0291082","Lawrence Block, Scott Frank",a walk among the tombstones,2014,114,"action,crime,drama",6.5,105116,2014-09-19,28000000,26017685,62108587,"nm0000553, nm1092086, nm1405398, nm2933542","Boyd Holbrook, Dan Stevens, David Harbour, Lia..."
3,tt0369610,nm1119880,Colin Trevorrow,"nm0415425, nm0798646, nm1119880, nm2081046","Amanda Silver, Colin Trevorrow, Derek Connolly...",jurassic world,2015,124,"action,adventure,sci-fi",7.0,539338,2015-06-12,215000000,652270625,1648854864,"nm0695435, nm1339223","Chris Pratt, Ty Simpkins"
4,tt0376136,nm0732430,Bruce Robinson,nm0732430,Bruce Robinson,the rum diary,2011,119,"comedy,drama",6.2,94787,2011-10-28,45000000,13109815,21544732,"nm0000136, nm0000610, nm0001173, nm0728346","Aaron Eckhart, Giovanni Ribisi, Johnny Depp, M..."


In [361]:
# Create 'cleaned_data' folder if it doesn't exist
output_folder = './cleaned_data'
os.makedirs(output_folder, exist_ok=True)

# Save final_merged_df_2 to a CSV file
final_merged_df_2.to_csv(f'{output_folder}/final_merged_df_2.csv', index=False)

print("final_merged_df_2 has been successfully saved to the 'cleaned_data' folder.")

final_merged_df_2 has been successfully saved to the 'cleaned_data' folder.




### Explanation:
1. **Load Principals Table**:
   - Use `pd.read_sql` to load the `principals` table from the database.

2. **Filter for Actors**:
   - Filter rows where the `category` column is `'actor'`.

3. **Merge with `cleaned_persons_df`**:
   - Merge `actors_df` with `cleaned_persons_df` on `person_id` to get actor names.

4. **Group by `movie_id`**:
   - Group actors by `movie_id` and aggregate their IDs and names, ensuring unique and sorted values.

5. **Rename Columns**:
   - Rename the grouped columns to `actor_ids` and `actor_names`.

6. **Merge with `final_merged_df`**:
   - Merge the grouped actor details with `final_merged_df` on `movie_id`.

7. **Display Results**:
   - Print the shape and preview the updated `final_merged_df`.

Let me know if you need further clarification!

In [360]:
# Perform the merge with an indicator column
merged_with_indicator = pd.merge(
    final_merged_df, 
    final_merged_df_2, 
    on='movie_id', 
    how='outer', 
    indicator=True
)

# Filter rows that are only in final_merged_df
excluded_from_final_merged_df_2 = merged_with_indicator[merged_with_indicator['_merge'] == 'left_only']

# Drop the '_merge' column for clarity
excluded_from_final_merged_df_2 = excluded_from_final_merged_df_2.drop(columns=['_merge'])

# Display the resulting DataFrame
print(f"Shape of the data in final_merged_df but not in final_merged_df_2: {excluded_from_final_merged_df_2.shape}")
excluded_from_final_merged_df_2.head()

Shape of the data in final_merged_df but not in final_merged_df_2: (44, 31)


Unnamed: 0,movie_id,director_ids_x,director_names_x,writer_ids_x,writer_names_x,movie_name_x,release_year_x,runtime_minutes_x,genres_x,averagerating_x,...,runtime_minutes_y,genres_y,averagerating_y,numvotes_y,release_date_y,production_budget_y,domestic_gross_y,worldwide_gross_y,actor_ids,actor_names
50,tt0770802,nm0294825,Ron Fricke,"nm0294825, nm0536056","Mark Magidson, Ron Fricke",samsara,2011,102,"documentary,music",8.5,...,,,,,,,,,,
102,tt0978764,nm0811583,Zack Snyder,"nm0793122, nm0811583","Steve Shibuya, Zack Snyder",sucker punch,2011,110,"action,adventure,fantasy",6.1,...,,,,,,,,,,
118,tt1015471,nm2013046,Caryn Waechter,"nm1841035, nm2627176","Marilyn Fu, Steven Millhauser",the sisterhood of night,2014,104,"drama,mystery,thriller",6.2,...,,,,,,,,,,
127,tt1034415,nm0345174,Luca Guadagnino,"nm0000783, nm0630453, nm1738734","Daria Nicolodi, Dario Argento, David Kajganich",suspiria,2018,152,"fantasy,horror,mystery",6.8,...,,,,,,,,,,
141,tt1067774,nm0080120,Thomas Bezucha,"nm0080120, nm0086194, nm0535940, nm2719767, nm...","April Blair, Jules Bass, Kelly Bowe, Maria Mag...",monte carlo,2011,109,"adventure,comedy,family",5.8,...,,,,,,,,,,
