In [1]:
import pandas as pd
import ast

In [32]:
# Read the Bollywood movies data from a CSV file into a DataFrame
df1 = pd.read_csv('/content/Bollywood_Movies1.csv',lineterminator='\n')
# Read the Hollywood movies data from a CSV file into a DataFrame
df2 = pd.read_csv('/content/HollyWood_Movies1.csv')

In [33]:
df1.shape

(1015, 16)

In [34]:
df2.shape

(2725, 16)

In [35]:
# Combine the data from df1 and df2 into a single DataFrame called combined_df
combined_df = pd.concat([df1,df2],ignore_index=True)

In [36]:
# Calculating the number of duplicate rows in the combined_df
combined_df.duplicated().sum()

0

In [37]:
# Counting the number of null values in each column of the combined_df
combined_df.isnull().sum()

Unnamed: 0           0
Title                0
Director             0
Cast                 0
id                   0
genres               0
imdb_id              1
original_language    0
overview             0
popularity           0
poster_path          0
release_date         0
runtime              0
title                0
vote_average         0
vote_count           0
dtype: int64

In [38]:
# removing rows with null values
combined_df = combined_df.dropna().reset_index(drop=True)

In [39]:
combined_df.shape

(3739, 16)

# Pre-Processing

In [40]:
# we only want release year , removing date & month
def get_year(date):
  return date[:4]

In [41]:
combined_df['release_date'] = combined_df['release_date'].apply(get_year)

In [42]:
# While retrieving movie data, I noticed that some TMDB IDs fetched information about movies released outside the range of 2012 to 2022.
# That's why removing those movies

In [43]:
combined_df['release_date'] = combined_df['release_date'].apply(ast.literal_eval)

In [44]:
len(combined_df[combined_df['release_date']>2022])

16

In [45]:
len(combined_df[combined_df['release_date']<2012])

135

In [46]:
combined_df = combined_df[combined_df['release_date']>=2012][combined_df['release_date']<=2022].reset_index(drop=True)

  combined_df = combined_df[combined_df['release_date']>=2012][combined_df['release_date']<=2022].reset_index(drop=True)


####  Since there were two titles, "Title" and "title," we retained rows/movies that had the same information in both features.

In [47]:
(combined_df['Title']!=combined_df['title']).sum()

175

In [48]:
combined_df = combined_df[combined_df['Title']==combined_df['title']].reset_index(drop=True)

# Removing rows with same "Title"
    because there are some movies with same name in hollywood and bollywood ....they might create a problem in future

In [50]:
combined_df['Title'].duplicated(keep=False).sum()

16

In [51]:
combined_df = combined_df.drop_duplicates(keep=False,subset=['Title']).reset_index(drop=True)

In [52]:
print("Shape of combined_df",combined_df.shape)

Shape of combined_df (3397, 16)


In [53]:
# getting genres in the proper format
def get_genre(text):
    cnvrt_lst = ast.literal_eval(text)
    genre = []
    for i in cnvrt_lst:
      genre.append(i['name'])

    return ",".join(genre)

In [54]:
combined_df['genres'] = combined_df['genres'].apply(get_genre)

In [55]:
# checking empty values in genre feature
(combined_df['genres']=='').sum()

37

In [56]:
# removing rows with empty genre values
combined_df = combined_df[combined_df['genres']!=''].reset_index(drop=True)

In [57]:
combined_df.shape

(3360, 16)

In [58]:
# converting runtime minutes into hours and minutes
def convert_to_hrsmin(minutes):
  hrs = minutes//60
  min = minutes%60
  return hrs+" hrs "+min+" min"

In [59]:
combined_df['runtime'] = combined_df['runtime'].apply(lambda x:str(x//60)+' hrs '+str(x%60)+' min')

#### While retrieving movie data, I noticed that the name "Sushant Singh Rajput" was incorrectly written as "Sushant Singh."

In [29]:
for i in combined_df.index:
  for j in combined_df.loc[i,'Cast'].split(','):
     if 'Sushant Singh'==j:
          combined_df.loc[i,'Cast'] = combined_df.loc[i,'Cast'].replace('Sushant Singh','Sushant Singh Rajput')

In [30]:
combined_df.to_csv('All_Movies1.csv')

In [31]:
combined_df.shape

(3360, 16)