# Module Requirements

In [2]:
import pandas as pd #for dataframes
import numpy as np #for null values and numerical transformations
import ast # In dictionaries -> for converting string into list 

# Data Load

## credits.csv

In [3]:
# Load the credits CSV file (the columns shown by df.info() further down are: cast, crew, id)
df = pd.read_csv('datasets/credits.csv')

In [4]:
# Preview the first five records of the credits table
df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


## movies_metadata.csv

In [5]:
# Load the movie metadata CSV file.
# low_memory=False makes pandas parse the whole file at once instead of in chunks,
# which avoids per-chunk mixed dtype inference.

meta = pd.read_csv('datasets/movies_metadata.csv', low_memory=False)

In [6]:
# Preview the first five records of the movie metadata table
meta.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## data.csv

In [7]:
# Load the output of the earlier preprocessing step; the new 2017 rows are appended to it in the "Data Merge" section
old_movie_data = pd.read_csv('MyData/data.csv')

In [8]:
# Column names, non-null counts and dtypes of the existing dataset (the target schema for the new rows)
old_movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   director_name  5043 non-null   object
 1   actor_1_name   5043 non-null   object
 2   actor_2_name   5043 non-null   object
 3   actor_3_name   5043 non-null   object
 4   genres         5043 non-null   object
 5   movie_title    5043 non-null   object
 6   comb           5043 non-null   object
dtypes: object(7)
memory usage: 275.9+ KB


# Data Preprocessing & Cleaning

In [9]:
# Quick overview of the credits dataset: columns, row count, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [10]:
# Quick overview of the movie metadata dataset: columns, row count, non-null counts, dtypes and memory usage
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [11]:
# Convert release_date from object to datetime.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
meta['release_date'] = pd.to_datetime(meta['release_date'],errors='coerce')

In [12]:
# Create a year column from release_date using the .dt accessor
# (rows whose date is NaT get NaN, which is why the column comes out as float)
meta['year'] = meta['release_date'].dt.year

In [13]:
# Number of movies per release year, ordered chronologically by year
meta['year'].value_counts().sort_index()

1874.0       1
1878.0       1
1883.0       1
1887.0       1
1888.0       2
          ... 
2015.0    1905
2016.0    1604
2017.0     532
2018.0       5
2020.0       1
Name: year, Length: 135, dtype: int64

In [14]:
# Keep only the movies released in 2017.
# Movies up to 2016 are already covered by the earlier preprocessing output ("data.csv");
# the years 2018, 2019 and 2020 are handled in later preprocessing files.

selected_df = meta.loc[
    meta['year'].eq(2017),
    ['genres', 'title', 'id', 'year'],
]

In [15]:
# Cast id (object in the metadata file) and year (float) to int64;
# the credits table stores id as int64, so the types must match for the merge below.
selected_df = selected_df.astype(
    {
        'id': 'int64',
        'year': 'int64',
    }
)

In [16]:
# Confirm the 2017 subset: row count, no missing values, and id/year now stored as int64
selected_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 532 entries, 26560 to 45465
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   genres  532 non-null    object
 1   title   532 non-null    object
 2   id      532 non-null    int64 
 3   year    532 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 20.8+ KB


In [17]:
# Attach the cast/crew columns to the 2017 movies with an inner join on the shared `id` key,
# then rename `title` to `movie_title` to match the column name used by the existing dataset.
data = (
    selected_df
    .merge(right=df, how='inner', on='id')
    .rename(columns={'title': 'movie_title'})
)

In [18]:
def genre_names(x):
    """
    Description : Build one space-separated string of genre names for a single movie.
    Parameters : x - iterable of dicts; only the 'name' key of each dict is read.
    Steps: collect every 'name', replacing 'Science Fiction' with 'Sci-Fi' and
           skipping entries that have no name, then join the names with a space.
    Return: str with the joined genre names, or np.nan when there is no name at all.
    """
    names = []
    for entry in x:
        name = entry.get('name')
        if name is None:
            # An entry without a name would make ' '.join fail; ignore it instead
            continue
        names.append('Sci-Fi' if name == 'Science Fiction' else name)

    # np.nan (not np.NaN, which was removed in NumPy 2.0) marks "no genre" so dropna() can discard the row
    return ' '.join(names) if names else np.nan
    
def actors_names(dataframe):
    """
    Description : Extract the first three cast member names of every movie.
    Parameters : dataframe - DataFrame with a 'cast' column whose values are strings
                 holding a Python-literal list of dicts (each dict is read via .get('name')).
    Steps: parse each 'cast' string with ast.literal_eval, take the names of the first
           three entries, and pad shorter casts with np.nan.
    Return: DataFrame with columns actor_1_name, actor_2_name, actor_3_name and the
            same index as `dataframe`, so it lines up in a column-wise concat.
    """
    columns = ['actor_1_name', 'actor_2_name', 'actor_3_name']

    cast_lists = dataframe['cast'].map(ast.literal_eval)
    print("Cast  !!!")

    rows = []
    # Iterate over the values themselves: positional lookups by label (series[0], series[1], ...)
    # only work when the index happens to be 0..n-1
    for members in cast_lists:
        names = [member.get('name') for member in members[:len(columns)]]
        names = [np.nan if name is None else name for name in names]
        # Pad to exactly three entries so the frame can be built even when
        # no movie has a full three-person cast
        names += [np.nan] * (len(columns) - len(names))
        rows.append(names)

    return pd.DataFrame(rows, columns=columns, index=dataframe.index)


def director_names(x):
    """
    Description : Build one space-separated string of the directors of a single movie.
    Parameters : x - iterable of dicts; the 'job' and 'name' keys of each dict are read.
    Steps: keep the 'name' of every entry whose 'job' equals 'Director'
           (entries without a name are skipped), then join the names with a space.
    Return: str with the joined director names, or np.nan when there is no director.
    """
    directors = [
        member.get('name')
        for member in x
        if member.get('job') == 'Director' and member.get('name') is not None
    ]

    # np.nan (not np.NaN, which was removed in NumPy 2.0) marks "no director" so dropna() can discard the row
    return ' '.join(directors) if directors else np.nan
    
    
# Create function to clean main dataframe

def clean_data(dataframe):
    """
    Description : Turn the merged metadata/credits frame into flat, readable columns.
    Parameters : dataframe - DataFrame with the columns 'genres', 'cast', 'crew',
                 'movie_title', 'year' and 'id'; 'genres', 'cast' and 'crew' hold
                 strings containing Python-literal lists of dicts.
    Steps: parse 'genres' and 'crew' with ast.literal_eval and reduce them with
           genre_names / director_names, extract the top three actors with
           actors_names, lower-case 'movie_title', then concatenate everything
           column-wise and drop the raw columns.
    Return: new DataFrame with the columns movie_title, genres_list, actor_1_name,
            actor_2_name, actor_3_name, director_name. The input frame is not modified.
    """
    # The raw column stores each genre list as a string; literal_eval turns it back into a list of dicts
    genre_data = dataframe['genres'].map(ast.literal_eval)
    genre_df = genre_data.apply(genre_names).rename('genres_list').to_frame()
    print('Genres Done !!!')

    # Separate cast list into actor 1, 2 and 3
    actor_df = actors_names(dataframe)

    # Director name(s) for each movie
    crew_data = dataframe['crew'].map(ast.literal_eval)
    director_df = crew_data.apply(director_names).rename('director_name').to_frame()
    print("Director Done !!!")

    # Lower-case the title on a copy (assign) so the caller's dataframe stays untouched
    titled = dataframe.assign(movie_title=dataframe['movie_title'].str.lower())

    # Column-wise concat aligns the pieces on the index
    combined = pd.concat([titled, genre_df, actor_df, director_df], axis=1)
    return combined.drop(columns=['cast', 'crew', 'genres', 'year', 'id'])


In [19]:
# Clean the merged 2017 data, discard every row that is missing any field,
# and give the genre column the name used by the existing dataset.
movie = (
    clean_data(data)
    .dropna(how='any')
    .rename(columns={'genres_list': 'genres'})
)

Genres Done !!!
Cast  !!!
Director Done !!!


In [20]:
# Spot-check five randomly chosen cleaned rows (no random_state is set, so the rows differ between runs)
movie.sample(5)

Unnamed: 0,movie_title,genres,actor_1_name,actor_2_name,actor_3_name,director_name
57,rings,Horror,Matilda Anna Ingrid Lutz,Alex Roe,Johnny Galecki,F. Javier Gutiérrez
137,britney ever after,Drama TV Movie,Natasha Bassett,Jenna Berman,Matthew Harrison,Leslie Libman
460,78/52,Documentary,Guillermo del Toro,Peter Bogdanovich,Karyn Kusama,Alexandre O. Philippe
385,lucid dream,Sci-Fi Thriller,Go Soo,Sol Kyung-gu,Park Yoo-chun,Kim Joon-Sung
427,tubelight,Drama History War,Salman Khan,Zhu Zhu,Sohail Khan,Kabir Khan


In [21]:
# Combined text feature: the three actors, the director(s) and the genres, separated by single spaces
movie['comb'] = movie['actor_1_name'].str.cat(
    movie[['actor_2_name', 'actor_3_name', 'director_name', 'genres']],
    sep=' ',
)

In [676]:
# Preview the cleaned frame including the new comb column.
# NOTE(review): the stored output of this cell lists `id` and `genre` columns, but clean_data drops `id`
# and the genre column is named `genres`; together with the out-of-order execution count this suggests
# the output is stale - re-run the notebook from a fresh kernel to refresh it.
movie.head()

Unnamed: 0,movie_title,id,genre,actor_1_name,actor_2_name,actor_3_name,director_name,comb
0,pirates of the caribbean: dead men tell no tales,166426,Adventure Action Fantasy Comedy,Johnny Depp,Javier Bardem,Geoffrey Rush,Joachim Rønning Espen Sandberg,Johnny Depp Javier Bardem Geoffrey Rush Joachi...
1,justice league,141052,Action Adventure Fantasy Sci-Fi,Ben Affleck,Henry Cavill,Gal Gadot,Zack Snyder,Ben Affleck Henry Cavill Gal Gadot Zack Snyder...
2,thor: ragnarok,284053,Action Adventure Fantasy Sci-Fi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Taika Waititi,Chris Hemsworth Tom Hiddleston Cate Blanchett ...
3,guardians of the galaxy vol. 2,283995,Action Adventure Comedy Sci-Fi,Chris Pratt,Zoe Saldana,Dave Bautista,James Gunn,Chris Pratt Zoe Saldana Dave Bautista James Gu...
4,the king's daughter,245842,Fantasy Action Adventure,Pierce Brosnan,William Hurt,Benjamin Walker,Sean McNamara,Pierce Brosnan William Hurt Benjamin Walker Se...


# Data Merge

In [30]:
# Append the newly cleaned 2017 movies to the existing dataset.
# DataFrame.append was removed in pandas 2.0, so the frames are stacked with pd.concat instead.
# keep='last' retains the later (2017) copy of a fully duplicated row; ignore_index=True renumbers the result.
movie_updated = (
    pd.concat([old_movie_data, movie])
    .drop_duplicates(keep='last', ignore_index=True)
)

In [31]:
# Verify the merged dataset: same seven columns as the old data and no missing values
movie_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5377 entries, 0 to 5376
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   director_name  5377 non-null   object
 1   actor_1_name   5377 non-null   object
 2   actor_2_name   5377 non-null   object
 3   actor_3_name   5377 non-null   object
 4   genres         5377 non-null   object
 5   movie_title    5377 non-null   object
 6   comb           5377 non-null   object
dtypes: object(7)
memory usage: 294.2+ KB


# Export Dataset

In [32]:
# Write the merged dataset to disk; index=False leaves the row index out of the file
movie_updated.to_csv("MyData/updated_movie_till_2017.csv",index=False)