In [41]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os

%matplotlib inline

In [42]:
conn = sqlite3.connect('./extractedData/im.db')
cursor = conn.cursor()

In [43]:
movie_writers = pd.read_csv('./cleaned_data/movie_writers.csv')
movie_writers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movie_id           957 non-null    object 
 1   movie_name         957 non-null    object 
 2   release_year       957 non-null    int64  
 3   runtime_minutes    957 non-null    int64  
 4   genres             957 non-null    object 
 5   averagerating      957 non-null    float64
 6   numvotes           957 non-null    int64  
 7   release_date       957 non-null    object 
 8   production_budget  957 non-null    int64  
 9   domestic_gross     957 non-null    int64  
 10  worldwide_gross    957 non-null    int64  
 11  writers            957 non-null    object 
 12  writer_ids         957 non-null    object 
dtypes: float64(1), int64(6), object(6)
memory usage: 97.3+ KB


In [44]:
# Check for duplicates in the DataFrame
duplicates = movie_writers.duplicated(subset=['movie_id'])
duplicates_count = duplicates.sum()
print(f"Number of duplicate rows: {duplicates_count}")

Number of duplicate rows: 0


In [45]:
directors_df = pd.read_sql("SELECT * FROM directors;", conn)
directors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291174 entries, 0 to 291173
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   movie_id   291174 non-null  object
 1   person_id  291174 non-null  object
dtypes: object(2)
memory usage: 4.4+ MB


In [None]:
# check for duplicates in the DataFrame
duplicate_directors = directors_df.duplicated(subset=['movie_id', 'person_id'])
# sort the duplicates IN ascending order
duplicate_directors = directors_df[duplicate_directors].sort_values(by=['movie_id', 'person_id'])
# count the number of duplicates
duplicate_directors.count()

movie_id     127639
person_id    127639
dtype: int64

In [None]:
# drop duplicates from the DataFrame
directors_df = directors_df.drop_duplicates(subset=['movie_id', 'person_id'])
# check for duplicates in the DataFrame
duplicate_directors = directors_df.duplicated(subset=['movie_id', 'person_id'])

# count the number of duplicates
duplicate_directors.count()

movie_id     0
person_id    0
dtype: int64

To combine `movie_writers` and `directors_df` based on the `movie_id` column, you can use the `merge` function in pandas. Here's how you can do it:



In [54]:
# Merge movie_writers and directors_df on movie_id
combined_df = pd.merge(movie_writers, directors_df, on='movie_id', how='inner')

# Display the resulting DataFrame
print(f"Number of rows in the combined DataFrame: {len(combined_df)}")
combined_df.head()

Number of rows in the combined DataFrame: 1060


Unnamed: 0,movie_id,movie_name,release_year,runtime_minutes,genres,averagerating,numvotes,release_date,production_budget,domestic_gross,worldwide_gross,writers,writer_ids,person_id
0,tt0249516,foodfight!,2012,91,"action,animation,comedy",1.9,8248,2012-12-31,45000000,0,73706,"Sean Catherine Derek, Lawrence Kasanoff, Joshu...","nm0220297, nm0440415, nm0923312, nm0295165",nm0440415
1,tt0359950,the secret life of walter mitty,2013,114,"adventure,comedy,drama",7.3,275300,2013-12-25,91000000,58236838,187861183,Steve Conrad,nm0175726,nm0001774
2,tt0365907,a walk among the tombstones,2014,114,"action,crime,drama",6.5,105116,2014-09-19,28000000,26017685,62108587,"Lawrence Block, Scott Frank","nm0088747, nm0291082",nm0291082
3,tt0369610,jurassic world,2015,124,"action,adventure,sci-fi",7.0,539338,2015-06-12,215000000,652270625,1648854864,"Rick Jaffa, Amanda Silver, Colin Trevorrow","nm0415425, nm0798646, nm1119880",nm1119880
4,tt0376136,the rum diary,2011,119,"comedy,drama",6.2,94787,2011-10-28,45000000,13109815,21544732,Bruce Robinson,nm0732430,nm0732430




### Explanation:
1. `pd.merge(movie_writers, directors_df, on='movie_id', how='inner')`: This merges the two DataFrames on the `movie_id` column using an inner join, which keeps only the rows with matching `movie_id` values in both DataFrames.
2. The resulting DataFrame, `combined_df`, will contain columns from both `movie_writers` and `directors_df`.

In [56]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1060 entries, 0 to 1059
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movie_id           1060 non-null   object 
 1   movie_name         1060 non-null   object 
 2   release_year       1060 non-null   int64  
 3   runtime_minutes    1060 non-null   int64  
 4   genres             1060 non-null   object 
 5   averagerating      1060 non-null   float64
 6   numvotes           1060 non-null   int64  
 7   release_date       1060 non-null   object 
 8   production_budget  1060 non-null   int64  
 9   domestic_gross     1060 non-null   int64  
 10  worldwide_gross    1060 non-null   int64  
 11  writers            1060 non-null   object 
 12  writer_ids         1060 non-null   object 
 13  person_id          1060 non-null   object 
dtypes: float64(1), int64(6), object(7)
memory usage: 124.2+ KB


In [None]:
# Rename person_id column to director_id
combined_df.rename(columns={ 'person_id': 'director_id'}, inplace=True)

In [59]:
# check for duplicates in the DataFrame
duplicate_movie_id = combined_df.duplicated(subset=['movie_id'])
# sort the duplicates IN ascending order
duplicate_movie_id = combined_df[duplicate_movie_id].sort_values(by=['movie_id'])
# count the number of duplicates
duplicate_movie_id.count()

movie_id             103
movie_name           103
release_year         103
runtime_minutes      103
genres               103
averagerating        103
numvotes             103
release_date         103
production_budget    103
domestic_gross       103
worldwide_gross      103
writers              103
writer_ids           103
director_id          103
dtype: int64

In [61]:
persons_df = pd.read_csv('./cleaned_data/cleaned_persons_df.csv')
persons_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74773 entries, 0 to 74772
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   person_id           74773 non-null  object
 1   primary_name        74773 non-null  object
 2   birth_year          74773 non-null  int64 
 3   primary_profession  74773 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


Create a `director_names` column in `combined_df` by mapping `director_id` (from `combined_df`) to `primary_name` (from `persons_df`)



In [62]:
# Create a mapping of person_id to primary_name from persons_df
person_id_to_name = persons_df.set_index('person_id')['primary_name'].to_dict()

# Map the director_id in combined_df to primary_name using the mapping
combined_df['director_names'] = combined_df['director_id'].map(person_id_to_name)

# Display the updated DataFrame
combined_df.head()

Unnamed: 0,movie_id,movie_name,release_year,runtime_minutes,genres,averagerating,numvotes,release_date,production_budget,domestic_gross,worldwide_gross,writers,writer_ids,director_id,director_names
0,tt0249516,foodfight!,2012,91,"action,animation,comedy",1.9,8248,2012-12-31,45000000,0,73706,"Sean Catherine Derek, Lawrence Kasanoff, Joshu...","nm0220297, nm0440415, nm0923312, nm0295165",nm0440415,Lawrence Kasanoff
1,tt0359950,the secret life of walter mitty,2013,114,"adventure,comedy,drama",7.3,275300,2013-12-25,91000000,58236838,187861183,Steve Conrad,nm0175726,nm0001774,Ben Stiller
2,tt0365907,a walk among the tombstones,2014,114,"action,crime,drama",6.5,105116,2014-09-19,28000000,26017685,62108587,"Lawrence Block, Scott Frank","nm0088747, nm0291082",nm0291082,Scott Frank
3,tt0369610,jurassic world,2015,124,"action,adventure,sci-fi",7.0,539338,2015-06-12,215000000,652270625,1648854864,"Rick Jaffa, Amanda Silver, Colin Trevorrow","nm0415425, nm0798646, nm1119880",nm1119880,Colin Trevorrow
4,tt0376136,the rum diary,2011,119,"comedy,drama",6.2,94787,2011-10-28,45000000,13109815,21544732,Bruce Robinson,nm0732430,nm0732430,Bruce Robinson


In [69]:
# chech for duplicates in the DataFrame
x = combined_df.duplicated(subset=['director_id',])
# sort the duplicates IN ascending order
x = combined_df[x].sort_values(by ='director_id', ascending = True)
# count the number of duplicates
x.head(10)

Unnamed: 0,movie_id,movie_name,release_year,runtime_minutes,genres,averagerating,numvotes,release_date,production_budget,domestic_gross,worldwide_gross,writers,writer_ids,director_id,director_names
717,tt2334873,blue jasmine,2013,98,drama,7.3,179453,2013-07-26,18000000,33404871,102912961,Woody Allen,nm0000095,nm0000095,Woody Allen
426,tt1605783,midnight in paris,2011,94,"comedy,fantasy,romance",7.7,356500,2011-05-20,30000000,56816662,162502774,Woody Allen,nm0000095,nm0000095,Woody Allen
570,tt1859650,to rome with love,2012,112,"comedy,music,romance",6.3,79381,2012-06-22,21500000,16684352,74326015,Woody Allen,nm0000095,nm0000095,Woody Allen
811,tt2872732,lucy,2014,89,"action,sci-fi,thriller",6.4,403194,2014-07-25,40000000,126573960,457507776,Luc Besson,nm0000108,nm0000108,Luc Besson
166,tt1205537,jack ryan: shadow recruit,2014,105,"action,drama,thriller",6.2,114913,2014-01-17,60000000,50577412,131377412,David Koepp,nm0462895,nm0000110,Kenneth Branagh
1041,tt6802308,the 15:17 to paris,2018,94,"biography,drama,thriller",5.2,21880,2018-02-09,30000000,36276286,56096200,Dorothy Blyskal,nm2980113,nm0000142,Clint Eastwood
433,tt1616195,j. edgar,2011,137,"biography,crime,drama",6.6,115925,2011-11-09,35000000,37306030,84606030,Dustin Lance Black,nm0085257,nm0000142,Clint Eastwood
515,tt1742044,jersey boys,2014,134,"biography,drama,music",6.8,30410,2014-06-20,40000000,47047013,65282732,Marshall Brickman,nm0108613,nm0000142,Clint Eastwood
675,tt2179136,american sniper,2014,133,"action,biography,drama",7.3,401915,2014-12-25,58000000,350126372,547326372,Jason Hall,nm0355699,nm0000142,Clint Eastwood
849,tt3263904,sully,2016,96,"biography,drama",7.5,202718,2016-09-09,60000000,125070033,238524556,Chesley Sullenberger,nm3314810,nm0000142,Clint Eastwood




### Explanation:
1. `set_index('person_id')['primary_name'].to_dict()`: Creates a dictionary where `person_id` is the key and `primary_name` is the value.
2. `map(person_id_to_name)`: Maps the `director_id` in `combined_df` to the corresponding `primary_name` using the dictionary.
3. The new column `director_names` is added to `combined_df`.

Let me know if you need further assistance!

MANY MOVIES HAVE MANY directors_df



### Explanation:
1. `directors_df['movie_id'].isin(movie_writers['movie_id'])` checks if each `movie_id` in `directors_df` exists in the `movie_writers` DataFrame.
2. The result is filtered to include only the rows from `directors_df` where the condition is `True`.
3. The `len(common_movies)` gives the count of common movies, and `common_movies.head()` displays the first few rows.

In [35]:
persons_df = pd.read_csv('./cleaned_data/cleaned_persons_df.csv')
persons_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74773 entries, 0 to 74772
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   person_id           74773 non-null  object
 1   primary_name        74773 non-null  object
 2   birth_year          74773 non-null  int64 
 3   primary_profession  74773 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [38]:
# Merge common_movies with persons_df to fetch primary_name and primary_profession
common_movies_with_directors_details = pd.merge(
    common_movies, 
    persons_df[['person_id', 'primary_name', 'primary_profession']], 
    left_on='person_id', 
    right_on='person_id',
    how='inner'
)

# Display the resulting DataFrame
print(f"Number of rows in the merged DataFrame: {len(common_movies_with_directors_details)}")
common_movies_with_directors_details.head()

Number of rows in the merged DataFrame: 2691


Unnamed: 0,movie_id,person_id,primary_name,primary_profession
0,tt0999913,nm0527109,Rod Lurie,"writer,director,producer"
1,tt0999913,nm0527109,Rod Lurie,"writer,director,producer"
2,tt0999913,nm0527109,Rod Lurie,"writer,director,producer"
3,tt0999913,nm0527109,Rod Lurie,"writer,director,producer"
4,tt1125929,nm0000431,Taylor Hackford,"producer,director,writer"


To combine the `directors_df` DataFrame by grouping directors for each movie, you can use the `groupby` method and aggregate the directors into a single column. Here's how you can do it:



In [22]:
# Group directors by movie_id and combine their names into a single string
grouped_directors = directors_df.groupby('movie_id')['person_id'].apply(lambda x: ', '.join(x)).reset_index()

# Rename the columns for clarity
grouped_directors.columns = ['movie_id', 'directors']

# Display the resulting DataFrame
print(grouped_directors.head())

    movie_id                                   directors
0  tt0063540  nm0712540, nm0712540, nm0712540, nm0712540
1  tt0066787                                   nm0002411
2  tt0069049                        nm0000080, nm0000080
3  tt0069204                                   nm0611531
4  tt0100275  nm0765384, nm0749914, nm0765384, nm0749914


In [23]:
grouped_directors.head(10)

Unnamed: 0,movie_id,directors
0,tt0063540,"nm0712540, nm0712540, nm0712540, nm0712540"
1,tt0066787,nm0002411
2,tt0069049,"nm0000080, nm0000080"
3,tt0069204,nm0611531
4,tt0100275,"nm0765384, nm0749914, nm0765384, nm0749914"
5,tt0111414,nm0398271
6,tt0112502,nm6883878
7,tt0137204,nm0365480
8,tt0139613,"nm0518037, nm0023406"
9,tt0144449,nm0309428


In [25]:
grouped_directors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140417 entries, 0 to 140416
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   movie_id   140417 non-null  object
 1   directors  140417 non-null  object
dtypes: object(2)
memory usage: 2.1+ MB




### Explanation:
1. `groupby('movie_id')`: Groups the `directors_df` DataFrame by the `movie_id` column.
2. `apply(lambda x: ', '.join(x))`: Combines all director names for each `movie_id` into a single string, separated by commas.
3. `reset_index()`: Resets the index to turn the grouped data back into a DataFrame.
4. The resulting DataFrame will have two columns: `movie_id` and `directors`.

Let me know if you need further modifications!

To check if `grouped_directors` is in `movie_writers` using the `movie_id` column, you can use the `merge` function or the `isin` method. Here's how you can do it:



In [26]:
# Check if movie_id in grouped_directors exists in movie_writers
common_movies = grouped_directors[grouped_directors['movie_id'].isin(movie_writers['movie_id'])]

# Display the result
print(f"Number of common movies: {len(common_movies)}")
common_movies.head()

Number of common movies: 957


Unnamed: 0,movie_id,directors
19,tt0249516,"nm0440415, nm0440415, nm0440415, nm0440415, nm..."
52,tt0359950,"nm0001774, nm0001774"
56,tt0365907,"nm0291082, nm0291082"
58,tt0369610,"nm1119880, nm1119880, nm1119880, nm1119880, nm..."
60,tt0376136,"nm0732430, nm0732430"


In [None]:
# re

To merge `movie_id` and `grouped_directors` with `movie_writers` based on the `movie_id` column, you can use the `merge` function. Here's how you can do it:



In [27]:
# Merge grouped_directors with movie_writers on movie_id
merged_df = pd.merge(grouped_directors, movie_writers, on='movie_id', how='inner')

# Display the resulting DataFrame
print(f"Number of merged rows: {len(merged_df)}")
merged_df.head()

Number of merged rows: 957


Unnamed: 0,movie_id,directors,movie_name,release_year,runtime_minutes,genres,averagerating,numvotes,release_date,production_budget,domestic_gross,worldwide_gross,writers,writer_ids
0,tt0249516,"nm0440415, nm0440415, nm0440415, nm0440415, nm...",foodfight!,2012,91,"action,animation,comedy",1.9,8248,2012-12-31,45000000,0,73706,"Sean Catherine Derek, Lawrence Kasanoff, Joshu...","nm0220297, nm0440415, nm0923312, nm0295165"
1,tt0359950,"nm0001774, nm0001774",the secret life of walter mitty,2013,114,"adventure,comedy,drama",7.3,275300,2013-12-25,91000000,58236838,187861183,Steve Conrad,nm0175726
2,tt0365907,"nm0291082, nm0291082",a walk among the tombstones,2014,114,"action,crime,drama",6.5,105116,2014-09-19,28000000,26017685,62108587,"Lawrence Block, Scott Frank","nm0088747, nm0291082"
3,tt0369610,"nm1119880, nm1119880, nm1119880, nm1119880, nm...",jurassic world,2015,124,"action,adventure,sci-fi",7.0,539338,2015-06-12,215000000,652270625,1648854864,"Rick Jaffa, Amanda Silver, Colin Trevorrow","nm0415425, nm0798646, nm1119880"
4,tt0376136,"nm0732430, nm0732430",the rum diary,2011,119,"comedy,drama",6.2,94787,2011-10-28,45000000,13109815,21544732,Bruce Robinson,nm0732430




### Explanation:
1. `pd.merge(grouped_directors, movie_writers, on='movie_id', how='inner')`: Merges the two DataFrames on the `movie_id` column using an inner join, which keeps only the rows with matching `movie_id` values in both DataFrames.
2. The resulting DataFrame, `merged_df`, will contain columns from both `grouped_directors` and `movie_writers`.

Let me know if you need further assistance!



### Explanation:
1. `grouped_directors['movie_id'].isin(movie_writers['movie_id'])` checks if each `movie_id` in `grouped_directors` exists in the `movie_writers` DataFrame.
2. The result is filtered to include only the rows from `grouped_directors` where the condition is `True`.
3. The `len(common_movies)` gives the count of common movies, and `common_movies.head()` displays the first few rows.

Let me know if you need further assistance!