# Persons

In [26]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os

%matplotlib inline

In [27]:
# Open the SQLite database that holds the extracted movie tables
conn = sqlite3.connect('./extractedData/im.db')
cursor = conn.cursor()  # NOTE(review): cursor is not used by the cells shown here; queries go through pd.read_sql

In [28]:
# Pull the complete persons table from the SQLite database into a DataFrame
persons_query = "SELECT * FROM persons;"
persons_df = pd.read_sql(persons_query, conn)

# Summarise column dtypes and non-null counts
persons_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606648 entries, 0 to 606647
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   person_id           606648 non-null  object 
 1   primary_name        606648 non-null  object 
 2   birth_year          82736 non-null   float64
 3   death_year          6783 non-null    float64
 4   primary_profession  555308 non-null  object 
dtypes: float64(2), object(3)
memory usage: 23.1+ MB


In [29]:
# Count the missing values in every column of the persons table
persons_df.isna().sum()

person_id                  0
primary_name               0
birth_year            523912
death_year            599865
primary_profession     51340
dtype: int64

In [30]:
# Keep only persons with no recorded death year, then discard anyone
# whose birth year or primary profession is missing
has_no_death_year = persons_df['death_year'].isna()
persons_df = persons_df.loc[has_no_death_year].dropna(
    subset=['birth_year', 'primary_profession']
)

# Report the remaining size and preview the result
print(f"Shape after dropping rows: {persons_df.shape}")
persons_df.head()

Shape after dropping rows: (74867, 5)


Unnamed: 0,person_id,primary_name,birth_year,death_year,primary_profession
8,nm0063618,Jeff Beal,1963.0,,"composer,music_department,soundtrack"
12,nm0065847,Michael Frost Beckner,1963.0,,"writer,producer,miscellaneous"
14,nm0066093,Ahmad Bedair,1945.0,,"actor,miscellaneous"
17,nm0066897,Mohammad-Ali Behboudi,1956.0,,actor
20,nm0067983,Krunoslav Belko,1971.0,,actor


In [31]:
# Remove the death_year column and renumber the rows from zero,
# producing a new frame instead of mutating in place
persons_df = persons_df.drop(columns=['death_year']).reset_index(drop=True)

In [32]:
# Store birth_year as an integer column instead of float
persons_df = persons_df.astype({'birth_year': int})

# Confirm the resulting column dtypes
persons_df.dtypes

person_id             object
primary_name          object
birth_year             int32
primary_profession    object
dtype: object

In [33]:
# Collect every person whose primary_name is shared with at least one
# other row, ordered by name so the repeated names sit next to each other
name_is_repeated = persons_df['primary_name'].duplicated(keep=False)
duplicate_persons_primary_name = persons_df.loc[name_is_repeated].sort_values('primary_name')
duplicate_persons_primary_name.head(10)

Unnamed: 0,person_id,primary_name,birth_year,primary_profession
55220,nm1750876,Adam Berry,1987,"actor,producer"
17951,nm0077468,Adam Berry,1966,"composer,soundtrack,music_department"
50798,nm2194534,Adam Green,1981,"soundtrack,actor,director"
44311,nm1725691,Adam Green,1977,"actor,producer,writer"
47147,nm1697112,Adam Green,1975,"writer,producer,director"
557,nm10330838,Adam Johnson,1972,actor
18192,nm0424453,Adam Johnson,1973,"actor,art_director,writer"
23457,nm0800885,Alan Simpson,1983,"actor,producer,executive"
26526,nm0800883,Alan Simpson,1931,"actor,miscellaneous"
66311,nm3416598,Alberto Iglesias,1975,"actor,writer"


In [34]:
# Normalise the profession text (trim whitespace, lower-case) and keep
# every row whose whole profession value is something other than 'miscellaneous'
profession_normalized = persons_df['primary_profession'].str.strip().str.lower()
persons_df = persons_df.loc[profession_normalized.ne('miscellaneous')]

# Report the size of the filtered DataFrame
print(f"Shape after dropping rows with primary_profession as 'miscellaneous': {persons_df.shape}")

Shape after dropping rows with primary_profession as 'miscellaneous': (74773, 4)


In [35]:
# Count the missing values per column after the cleaning steps
persons_df.isna().sum()

person_id             0
primary_name          0
birth_year            0
primary_profession    0
dtype: int64

In [41]:
# Review the structure (dtypes, non-null counts) of the cleaned persons data
persons_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74773 entries, 0 to 74866
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   person_id           74773 non-null  object
 1   primary_name        74773 non-null  object
 2   birth_year          74773 non-null  int32 
 3   primary_profession  74773 non-null  object
dtypes: int32(1), object(3)
memory usage: 2.6+ MB


In [42]:
# Close the database connection; persons_df is held in memory and no longer needs it
conn.close()

In [46]:
# Create the output folder for cleaned data if it doesn't exist
output_folder = './cleaned_data'
os.makedirs(output_folder, exist_ok=True)

# Save persons_df to a CSV file (index=False keeps the row index out of the file)
persons_df.to_csv(f'{output_folder}/cleaned_persons_df.csv', index=False)

# Report the folder that was actually written to
print(f"persons_df has been successfully saved to the '{output_folder}' folder.")

DataFrames have been successfully saved to the 'extracted' folder.


## WRITERS

In [48]:
# Open a connection to the SQLite database for the writers analysis
conn = sqlite3.connect('./extractedData/im.db')
cursor = conn.cursor()  # NOTE(review): cursor is not used by the cells shown here; queries go through pd.read_sql

In [49]:
# Load the complete writers table (one movie_id / person_id pair per row)
writers_query = "SELECT * FROM writers;"
writers_df = pd.read_sql(writers_query, conn)

# Show its size and preview the first rows
print(writers_df.shape)
writers_df.head()


(255873, 2)


Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0438973,nm0175726
2,tt0438973,nm1802864
3,tt0462036,nm1940585
4,tt0835418,nm0310087


In [50]:
# Count the missing values in each column of the writers table
writers_df.isna().sum()

movie_id     0
person_id    0
dtype: int64

In [51]:
# Collect every row whose movie_id appears on more than one row
movie_is_repeated = writers_df['movie_id'].duplicated(keep=False)
duplicate_writers_ids = writers_df.loc[movie_is_repeated]

duplicate_writers_ids.head()

Unnamed: 0,movie_id,person_id
1,tt0438973,nm0175726
2,tt0438973,nm1802864
4,tt0835418,nm0310087
5,tt0835418,nm0841532
6,tt0878654,nm0284943


Many movies have more than one writer.

Next steps: get each writer's details by matching `person_id` against `persons_df`, and get the movies written by each writer by matching `movie_id` against `movies_df`.


In [52]:
# Load the cleaned persons data that was saved to ./cleaned_data
cleaned_persons_df = pd.read_csv('./cleaned_data/cleaned_persons_df.csv')
# Inspect columns, dtypes and non-null counts of the loaded file
cleaned_persons_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74773 entries, 0 to 74772
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   person_id           74773 non-null  object
 1   primary_name        74773 non-null  object
 2   birth_year          74773 non-null  int64 
 3   primary_profession  74773 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB




### 1. **Filter `cleaned_persons_df` for Writers**


In [53]:
# Flag persons whose profession list mentions "writer" (any letter case;
# missing values count as non-matches), then keep only the flagged rows
has_writer_role = cleaned_persons_df['primary_profession'].str.contains('writer', case=False, na=False)
writers_only = cleaned_persons_df.loc[has_writer_role]

- **Purpose**: This filters the `cleaned_persons_df` DataFrame to include only rows where the `primary_profession` column contains the word "writer".
- **Key Parameters**:
  - `str.contains('writer')`: Checks if the string "writer" is present in the `primary_profession` column.
  - `case=False`: Makes the search case-insensitive (e.g., matches "Writer", "writer", or "WRITER").
  - `na=False`: Treats `NaN` values in `primary_profession` as non-matches, so those rows are excluded from the result.
- **Result**: A new DataFrame, `writers_only`, containing only persons whose primary profession includes "writer".

---

### 2. **Merge `writers_df` with `writers_only`**


In [54]:
# Attach person details to each writing credit; the inner join keeps only
# credits whose person_id is present in writers_only
writers_details = pd.merge(writers_df, writers_only, how='inner', on='person_id')

- **Purpose**: Combines the `writers_df` DataFrame (containing writer-related data) with the filtered `writers_only` DataFrame to get detailed information about writers.
- **Key Parameters**:
  - `on='person_id'`: The merge is performed using the `person_id` column, which is common to both DataFrames.
  - `how='inner'`: Performs an inner join, keeping only rows where `person_id` exists in both DataFrames.
- **Result**: A new DataFrame, `writers_details`, containing detailed information about writers, including their `primary_profession` and other attributes from `cleaned_persons_df`.

---

### Final Output
The resulting `writers_details` DataFrame provides a consolidated view of writers, combining data from both `writers_df` and `cleaned_persons_df`. This is useful for further analysis, such as identifying movies written by specific writers or exploring writer-related trends.

In [55]:
# Report the size of the merged writers/persons DataFrame
print(f"Shape of writers_details DataFrame: {writers_details.shape}")

# Display the first few rows of the merged DataFrame
writers_details.head()

Shape of writers_details DataFrame: (41102, 5)


Unnamed: 0,movie_id,person_id,primary_name,birth_year,primary_profession
0,tt0285252,nm0899854,Tony Vitale,1964,"producer,director,writer"
1,tt0438973,nm0175726,Steve Conrad,1968,"writer,producer,director"
2,tt2358925,nm0175726,Steve Conrad,1968,"writer,producer,director"
3,tt2543472,nm0175726,Steve Conrad,1968,"writer,producer,director"
4,tt0359950,nm0175726,Steve Conrad,1968,"writer,producer,director"


We can see that one person can write many movies.

In [56]:
# Load the cleaned movie budgets data that will be combined with writers_details
movie_budgets_df = pd.read_csv('./cleaned_data/movie_budgets.csv')
# Inspect columns, dtypes and non-null counts of the loaded file
movie_budgets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1381 entries, 0 to 1380
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movie_id           1381 non-null   object 
 1   movie_name         1381 non-null   object 
 2   release_year       1381 non-null   int64  
 3   runtime_minutes    1381 non-null   int64  
 4   genres             1381 non-null   object 
 5   averagerating      1381 non-null   float64
 6   numvotes           1381 non-null   int64  
 7   release_date       1381 non-null   object 
 8   production_budget  1381 non-null   int64  
 9   domestic_gross     1381 non-null   int64  
 10  worldwide_gross    1381 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 118.8+ KB


In [58]:
# Attach the budget data to every writer credit.
# The inner join on movie_id keeps only rows present in both DataFrames.
combined_movie_writers = writers_details.merge(movie_budgets_df, how='inner', on='movie_id')

# Show the size of the result and preview its first rows
print(f"Shape of the combined DataFrame: {combined_movie_writers.shape}")
combined_movie_writers.head()

Shape of the combined DataFrame: (1825, 15)


Unnamed: 0,movie_id,person_id,primary_name,birth_year,primary_profession,movie_name,release_year,runtime_minutes,genres,averagerating,numvotes,release_date,production_budget,domestic_gross,worldwide_gross
0,tt2358925,nm0175726,Steve Conrad,1968,"writer,producer,director",unfinished business,2015,91,"comedy,drama",5.4,29004,2015-03-06,35000000,10219501,13214051
1,tt2543472,nm0175726,Steve Conrad,1968,"writer,producer,director",wonder,2017,113,"drama,family",8.0,111632,2017-11-17,20000000,132422809,304604712
2,tt2543472,nm2113666,Jack Thorne,1978,"writer,producer,actor",wonder,2017,113,"drama,family",8.0,111632,2017-11-17,20000000,132422809,304604712
3,tt2543472,nm0154716,Stephen Chbosky,1970,"writer,producer,director",wonder,2017,113,"drama,family",8.0,111632,2017-11-17,20000000,132422809,304604712
4,tt0359950,nm0175726,Steve Conrad,1968,"writer,producer,director",the secret life of walter mitty,2013,114,"adventure,comedy,drama",7.3,275300,2013-12-25,91000000,58236838,187861183


To group all `person_id` values into a single column for each movie, we modify the code to add both a `writers` column and a `person_ids` column.


In [64]:
# Inner-join the budgets with the writer credits on movie_id
# (budget columns come first, writer columns after)
combined_movie_writers = movie_budgets_df.merge(writers_details, how='inner', on='movie_id')

# For every row, build the comma-separated names and ids of all rows
# that share the same movie_id
credits_by_movie = combined_movie_writers.groupby('movie_id')
writer_names = credits_by_movie['primary_name'].transform(', '.join)
writer_ids = credits_by_movie['person_id'].transform(
    lambda ids: ', '.join(str(pid) for pid in ids)
)

# Attach both aggregated columns, then keep only the first row of each movie
combined_movie_writers = combined_movie_writers.assign(
    writers=writer_names,
    person_ids=writer_ids,
).drop_duplicates(subset=['movie_id'])

# Show the size of the result and preview the aggregated columns
print(f"Shape of the combined DataFrame: {combined_movie_writers.shape}")
combined_movie_writers[['movie_id', 'movie_name', 'writers', 'person_ids']].head()

Shape of the combined DataFrame: (957, 17)


Unnamed: 0,movie_id,movie_name,writers,person_ids
0,tt0249516,foodfight!,"Sean Catherine Derek, Lawrence Kasanoff, Joshu...","nm0220297, nm0440415, nm0923312, nm0295165"
4,tt0359950,the secret life of walter mitty,Steve Conrad,nm0175726
5,tt0365907,a walk among the tombstones,"Lawrence Block, Scott Frank","nm0088747, nm0291082"
7,tt0369610,jurassic world,"Rick Jaffa, Amanda Silver, Colin Trevorrow","nm0415425, nm0798646, nm1119880"
10,tt0376136,the rum diary,Bruce Robinson,nm0732430




### Explanation:
1. **Merge DataFrames**:
   - Combines `movie_budgets_df` and `writers_details` on the `movie_id` column using an inner join.

2. **Create `writers` Column**:
   - Groups rows by `movie_id` and aggregates all writer names (`primary_name`) into a single string, separated by commas.

3. **Create `person_ids` Column**:
   - Groups rows by `movie_id` and aggregates all `person_id` values into a single string, separated by commas. The `map(str, x)` ensures that `person_id` values are converted to strings before joining.

4. **Remove Duplicates**:
   - Ensures only one row per `movie_id` remains in the DataFrame.

5. **Save to CSV**:
   - The resulting DataFrame is saved to a CSV file for further analysis in a later cell, after the leftover per-writer columns are dropped.

### Result:
The resulting DataFrame will include:
- `movie_id`: Unique identifier for each movie.
- `movie_name`: Name of the movie.
- `writers`: A single column listing all writers for the movie.
- `person_ids`: A single column listing all `person_id` values for the writers of the movie.

In [65]:
# Inspect columns, dtypes and non-null counts of the per-movie DataFrame
combined_movie_writers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 957 entries, 0 to 1824
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            957 non-null    object 
 1   movie_name          957 non-null    object 
 2   release_year        957 non-null    int64  
 3   runtime_minutes     957 non-null    int64  
 4   genres              957 non-null    object 
 5   averagerating       957 non-null    float64
 6   numvotes            957 non-null    int64  
 7   release_date        957 non-null    object 
 8   production_budget   957 non-null    int64  
 9   domestic_gross      957 non-null    int64  
 10  worldwide_gross     957 non-null    int64  
 11  person_id           957 non-null    object 
 12  primary_name        957 non-null    object 
 13  birth_year          957 non-null    int64  
 14  primary_profession  957 non-null    object 
 15  writers             957 non-null    object 
 16  person_

In [70]:
# Remove the single-writer columns; the per-movie 'writers' and 'person_ids'
# columns are kept
columns_to_drop = [
    'person_id',
    'primary_name',
    'birth_year',
    'primary_profession',
]
combined_movie_writers = combined_movie_writers.drop(columns_to_drop, axis='columns')

# Report the size of the cleaned DataFrame
print(f"Shape of the cleaned DataFrame: {combined_movie_writers.shape}")

Shape of the cleaned DataFrame: (957, 13)


In [72]:
# Convert release_date to datetime.
# errors='coerce' turns any unparseable value into NaT instead of raising.
combined_movie_writers['release_date'] = pd.to_datetime(combined_movie_writers['release_date'], errors='coerce')

# Display the columns of the combined DataFrame. This must be the cell's
# last expression: Jupyter only renders the final expression of a cell, so
# placed before the assignment it was evaluated but never shown.
combined_movie_writers.columns

In [73]:
# Close the database connection; the remaining cells work on in-memory DataFrames
conn.close()

In [75]:
# Inspect the DataFrame's columns, dtypes and non-null counts before saving
combined_movie_writers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 957 entries, 0 to 1824
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   movie_id           957 non-null    object        
 1   movie_name         957 non-null    object        
 2   release_year       957 non-null    int64         
 3   runtime_minutes    957 non-null    int64         
 4   genres             957 non-null    object        
 5   averagerating      957 non-null    float64       
 6   numvotes           957 non-null    int64         
 7   release_date       957 non-null    datetime64[ns]
 8   production_budget  957 non-null    int64         
 9   domestic_gross     957 non-null    int64         
 10  worldwide_gross    957 non-null    int64         
 11  writers            957 non-null    object        
 12  person_ids         957 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(6), object(5)
memory usa

In [76]:
# Save the combined DataFrame to a CSV file
output_folder = './cleaned_data'
os.makedirs(output_folder, exist_ok=True)  # create the folder if it is missing
combined_movie_writers.to_csv(f'{output_folder}/movie_writers.csv', index=False)  # index=False: leave the row index out of the file
print("Combined data has been saved successfully.")

Combined data has been saved successfully.
