<a href="https://colab.research.google.com/github/jagadish-samarla/imdb-bollywood-text-analysis/blob/master/IMDB_text_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installing requirements**

In [76]:
!pip install xlsxwriter



In [77]:
!pip install IMDbPY



Importing requirements


In [78]:
import numpy as np
import pandas as pd
from datetime import datetime, date
import os
import glob
from imdb import IMDb
from ast import literal_eval
from collections import Counter

Mounting google drive to access scraped output files 

In [79]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Reading raw Bollywood movies dataset

In [None]:
df = pd.read_excel('/content/drive/MyDrive/Contelligenz-assignment/Bollywood Movies Dataset.xlsx')

In [None]:
df.head()

Unnamed: 0,Movie Title
0,3 Idiots
1,Taare Zameen Par
2,PK (film)
3,Dangal (film)
4,Rang De Basanti


In [None]:
df.describe()

Unnamed: 0,Movie Title
count,4000
unique,4000
top,Ginny Aur Johnny
freq,1


Cleaning movie titles.


1.   Removing special characters from the titles
2.   Replacing unknown character 'â' with ':' .



In [None]:
df['Movie Title'] = df['Movie Title'].str.replace(r'\W'," ")
df['Movie Title'] = df['Movie Title'].str.replace('â',":")
df.head(15)

Unnamed: 0,Movie Title
0,3 Idiots
1,Taare Zameen Par
2,PK film
3,Dangal film
4,Rang De Basanti
5,Lagaan
6,My Name Is Khan
7,Swades
8,Gangs of Wasseypur
9,Gangs of Wasseypur Part 2


Exporting cleaned movie titles list as 'cleaned_bollywoodlist.xlsx'. with sheet name : 'Movie Titles'

In [None]:
df.to_excel('/content/drive/MyDrive/Contelligenz-assignment/cleaned_bollywoodlist.xlsx', sheet_name='Movie Titles', index = False, engine= 'xlsxwriter')

# After exporting cleaned_bollywoodlist.xlsx 

1.   All the required data (Title, url,	Genre, Cast,	Crew,	Plot summary,	Plot keywords,	IMDB Ratings) will be scraped from the web using IDE         ( here I used pycharm)
2.   Then all the scraped output files uploaded manually to the drive.




In [None]:
!ls drive/MyDrive/Contelligenz-assignment/scraping\ output\ files/*xlsx


ls: cannot access 'drive/MyDrive/Contelligenz-assignment/scraping output files/*xlsx': No such file or directory


Fixing working directory to access the scraped output files

In [None]:
cwd = os.path.abspath('/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/*xlsx') 
excel_files = glob.glob(cwd)

In [None]:
cwd

'/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/*xlsx'

In [None]:
excel_files

['/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/final_output_585_589.xlsx',
 '/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/final_output_589-620.xlsx',
 '/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/final_output_621-665.xlsx',
 '/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/final_output_666-670.xlsx',
 '/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/final_output_671-799.xlsx',
 '/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/final_output_800-803.xlsx',
 '/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/final_output_804-843.xlsx',
 '/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/final_output_844-850.xlsx',
 '/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/final_output_851-863.xlsx',
 '/content/drive/MyDrive/Contelligenz-assignment/scraped_output_files/final_output_864-899.xlsx',
 '/content/drive/MyD

# concatenating all scraped output files as one dataframe i.e., 'df'. 

In [None]:
df = pd.DataFrame()
for file in excel_files:
  individual_xl = pd.read_excel(file)
  df = df.append(individual_xl, ignore_index=True) 


In [None]:
df.tail()

Unnamed: 0.2,Unnamed: 0,Title,url,Genre,Cast,Crew,Plot summary,Plot keywords,IMDB Ratings,Unnamed: 0.1,Movie Title
3818,580,Eklavya,https://www.imdb.com/title/tt0459605/,"['Action', 'Drama', 'Mystery', 'Thriller']","['Amitabh Bachchan', 'Saif Ali Khan', 'Sanjay ...","['Vidhu Vinod Chopra', 'Vidhu Vinod Chopra', '...",As the kingdom of Devigarh comes apart at the ...,"['kingdom', 'king', 'fort', 'prince', 'duty', ...",6.1,580.0,Eklavya The Royal Guard
3819,581,1942: A Love Story,https://www.imdb.com/title/tt0109010/,"['Action', 'Drama', 'History', 'Musical', 'Rom...","['Anil Kapoor', 'Jackie Shroff', 'Manisha Koir...","['Vidhu Vinod Chopra', 'Sanjay Leela Bhansali'...","A young Indian couple, both from wealthy backg...","['horse riding', 'explosion', 'bomb', 'british...",7.2,581.0,1942 A Love Story
3820,582,Raaz: The Mystery Continues,https://www.imdb.com/title/tt1340838/,"['Horror', 'Mystery', 'Romance', 'Thriller']","['Kangana Ranaut', 'Adhyayan Suman', 'Emraan H...","['Mohit Suri', 'Kumaar', 'Sayeed Qadri', 'Shag...",An artist comes to realize that the woman he h...,"['ghost', 'pregnant', 'model', 'second in tril...",5.6,582.0,Raaz The Mystery Continues
3821,583,Pushpaka Vimana,https://www.imdb.com/title/tt0251355/,"['Comedy', 'Romance']","['Kamal Haasan', 'Amala Akkineni', 'Tinnu Anan...","['Singeetam Srinivasa Rao', 'Singeetam Sriniva...",A silent movie about an unemployed young man.,"['carrying someone over shoulder', 'hotel', 'h...",8.6,583.0,Pushpaka Vimana 1987 film
3822,584,Humsafar,https://www.imdb.com/title/tt2403201/,"['Drama', 'Romance']",,,,,9.0,584.0,Humsafar


In [None]:
df.isnull().sum()

Unnamed: 0          0
Title              12
url                 3
Genre              15
Cast               18
Crew               18
Plot summary       19
Plot keywords      20
IMDB Ratings       15
Unnamed: 0.1     3238
Movie Title      3238
dtype: int64

removing unnecessary columns

In [None]:
del df['Unnamed: 0']
del df['Unnamed: 0.1']
del df['Movie Title']

In [None]:
df.tail()

Unnamed: 0,Title,url,Genre,Cast,Crew,Plot summary,Plot keywords,IMDB Ratings
3818,Eklavya,https://www.imdb.com/title/tt0459605/,"['Action', 'Drama', 'Mystery', 'Thriller']","['Amitabh Bachchan', 'Saif Ali Khan', 'Sanjay ...","['Vidhu Vinod Chopra', 'Vidhu Vinod Chopra', '...",As the kingdom of Devigarh comes apart at the ...,"['kingdom', 'king', 'fort', 'prince', 'duty', ...",6.1
3819,1942: A Love Story,https://www.imdb.com/title/tt0109010/,"['Action', 'Drama', 'History', 'Musical', 'Rom...","['Anil Kapoor', 'Jackie Shroff', 'Manisha Koir...","['Vidhu Vinod Chopra', 'Sanjay Leela Bhansali'...","A young Indian couple, both from wealthy backg...","['horse riding', 'explosion', 'bomb', 'british...",7.2
3820,Raaz: The Mystery Continues,https://www.imdb.com/title/tt1340838/,"['Horror', 'Mystery', 'Romance', 'Thriller']","['Kangana Ranaut', 'Adhyayan Suman', 'Emraan H...","['Mohit Suri', 'Kumaar', 'Sayeed Qadri', 'Shag...",An artist comes to realize that the woman he h...,"['ghost', 'pregnant', 'model', 'second in tril...",5.6
3821,Pushpaka Vimana,https://www.imdb.com/title/tt0251355/,"['Comedy', 'Romance']","['Kamal Haasan', 'Amala Akkineni', 'Tinnu Anan...","['Singeetam Srinivasa Rao', 'Singeetam Sriniva...",A silent movie about an unemployed young man.,"['carrying someone over shoulder', 'hotel', 'h...",8.6
3822,Humsafar,https://www.imdb.com/title/tt2403201/,"['Drama', 'Romance']",,,,,9.0


removing rows with null values

In [None]:
df_clean = df.dropna()


In [None]:
df_clean.tail()

Unnamed: 0,Title,url,Genre,Cast,Crew,Plot summary,Plot keywords,IMDB Ratings
3817,Aakrosh,https://www.imdb.com/title/tt1708453/,"['Action', 'Crime', 'Thriller']","['Ajay Devgn', 'Akshaye Khanna', 'Bipasha Basu...","['Priyadarshan', 'Robin Bhatt', 'Aditya Dhar',...",The Central Bureau of Investigation deputes tw...,"['violence against a woman', 'well', 'cell pho...",7.0
3818,Eklavya,https://www.imdb.com/title/tt0459605/,"['Action', 'Drama', 'Mystery', 'Thriller']","['Amitabh Bachchan', 'Saif Ali Khan', 'Sanjay ...","['Vidhu Vinod Chopra', 'Vidhu Vinod Chopra', '...",As the kingdom of Devigarh comes apart at the ...,"['kingdom', 'king', 'fort', 'prince', 'duty', ...",6.1
3819,1942: A Love Story,https://www.imdb.com/title/tt0109010/,"['Action', 'Drama', 'History', 'Musical', 'Rom...","['Anil Kapoor', 'Jackie Shroff', 'Manisha Koir...","['Vidhu Vinod Chopra', 'Sanjay Leela Bhansali'...","A young Indian couple, both from wealthy backg...","['horse riding', 'explosion', 'bomb', 'british...",7.2
3820,Raaz: The Mystery Continues,https://www.imdb.com/title/tt1340838/,"['Horror', 'Mystery', 'Romance', 'Thriller']","['Kangana Ranaut', 'Adhyayan Suman', 'Emraan H...","['Mohit Suri', 'Kumaar', 'Sayeed Qadri', 'Shag...",An artist comes to realize that the woman he h...,"['ghost', 'pregnant', 'model', 'second in tril...",5.6
3821,Pushpaka Vimana,https://www.imdb.com/title/tt0251355/,"['Comedy', 'Romance']","['Kamal Haasan', 'Amala Akkineni', 'Tinnu Anan...","['Singeetam Srinivasa Rao', 'Singeetam Sriniva...",A silent movie about an unemployed young man.,"['carrying someone over shoulder', 'hotel', 'h...",8.6


In [None]:
df_clean['IMDB ID'] = df_clean['url'].apply(lambda x: ''.join(filter(lambda i: i.isdigit(), str(x))))


In [None]:
df_clean.head()

Unnamed: 0.2,Unnamed: 0,Title,url,Genre,Cast,Crew,Plot summary,Plot keywords,IMDB Ratings,Unnamed: 0.1,Movie Title,IMDB ID
0,0,Kyaa Kool Hai Hum,https://www.imdb.com/title/tt0456500/,"['Comedy', 'Mystery']","['Tusshar Kapoor', 'Riteish Deshmukh', 'Isha K...","['Sangeeth Sivan', 'Sachin Yardi', 'Pankaj Tri...",Karan and Rahul are fashion designers and suri...,"['tape over mouth', 'indian sex comedy', 'fema...",6.1,,,456500
1,1,Action Jackson,https://www.imdb.com/title/tt0403935/,"['Action', 'Crime', 'Thriller']","['Ajay Devgn', 'Shahid Kapoor', 'Prabhas', 'Ya...","['Prabhu Deva', 'A.C. Mughil', 'Shiraz Ahmed',...",In order to escape crime and clutches of a rut...,"['character name as title', 'nickname as title...",3.3,,,403935
2,2,Manto,https://www.imdb.com/title/tt6923462/,"['Biography', 'Drama']","['Nawazuddin Siddiqui', 'Rasika Dugal', 'Tahir...","['Nandita Das', 'Nandita Das', 'Sachin Agarwal...",The film is a biographical account of writer S...,['biopic'],7.4,,,6923462
3,3,Pataakha,https://www.imdb.com/title/tt8396128/,"['Action', 'Comedy', 'Drama']","['Sanya Malhotra', 'Radhika Madan', 'Sunil Gro...","['Vishal Bhardwaj', 'Vishal Bhardwaj', 'Charan...",Based on Charan Singh Pathik's short story Do ...,['based on story'],7.2,,,8396128
4,0,Kabul Express,https://www.imdb.com/title/tt0770214/,"['Adventure', 'Comedy', 'Drama', 'Thriller', '...","['John Abraham', 'Arshad Warsi', 'Salman Shahi...","['Kabir Khan', 'Kabir Khan', 'Sandeep Shrivast...",A thrilling story spanning 48 hours of five in...,"['pakistani', 'taliban', 'journalist', 'afghan...",6.8,,,770214


In [None]:
ia = IMDb()

Getting release year using IMDBPy library .
We need IMDB ID to get the release year

In [None]:
def releaseDates(id):
  try:
    release_year = ia.get_movie(id)['year']
    return release_year
  except :
    return None

df_clean['Release year'] = df_clean['IMDB ID'].apply(lambda x: releaseDates(x))


Getting vote count using IMDBPy library.

In [None]:
#ids = ['3863552', '2631186','1105733', '0986264', '0403935','0456500']
def voteCount(id):
  try:
    movie = ia.get_movie(id)
    votes = movie.data['votes']
    return votes
  except:
    return None


    
df_clean['IMDB votes'] = df_clean['IMDB ID'].apply(lambda x: voteCount(str(x).lstrip('0')))



2021-09-07 12:32:18,607 CRITICAL [imdbpy] /usr/local/lib/python3.7/dist-packages/imdb/_exceptions.py:34: IMDbParserError exception raised; args: ('invalid movieID "": invalid literal for int() with base 10: \'\'',); kwds: {}
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/imdb/parser/http/__init__.py", line 299, in _normalize_movieID
    return '%07d' % int(movieID)
ValueError: invalid literal for int() with base 10: ''
2021-09-07 12:33:34,161 CRITICAL [imdbpy] /usr/local/lib/python3.7/dist-packages/imdb/_exceptions.py:34: IMDbParserError exception raised; args: ('invalid movieID "": invalid literal for int() with base 10: \'\'',); kwds: {}
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/imdb/parser/http/__init__.py", line 299, in _normalize_movieID
    return '%07d' % int(movieID)
ValueError: invalid literal for int() with base 10: ''
2021-09-07 12:39:03,574 CRITICAL [imdbpy] /usr/local/lib/python3.7/dist-packages/imd

As IMDBpy taking too long to find vote count and release year, data is exported immediately as .xlsx. To use it for further.

In [None]:
df_clean.to_excel('/content/drive/MyDrive/Contelligenz-assignment/complete_database (1).xlsx', sheet_name='Movies', index = False, engine= 'xlsxwriter')

In [80]:
df_clean = pd.read_excel('/content/drive/MyDrive/Contelligenz-assignment/complete_database (1).xlsx')

Finding age of content

In [81]:
df_clean['age_of_content (in years)'] = df_clean['Release year'].apply(lambda x: datetime.now().year - (x))

Finding popularity of content

In [82]:
for index, row in df_clean.iterrows():
  df_clean.loc[index, 'popularity_of_content'] = ( row['IMDB votes']*row['IMDB Ratings']) 

This function is used to convert strting containing lists to a list.

In [83]:
def strToList(s):
  try:
    return literal_eval(str(s))
  except:
    return []





In [84]:
df_clean['Cast_list'] = df_clean['Cast'].apply(lambda x: strToList(x))

In [85]:
df_clean.head()

Unnamed: 0,Title,url,Genre,Cast,Crew,Plot summary,Plot keywords,IMDB Ratings,IMDB ID,Release year,IMDB votes,average_rating,age_of_content (in years),popularity_of_content,Genre_list,Cast_list,Cast_popularity_score,Crew_popularity_score
0,Kyaa Kool Hai Hum,https://www.imdb.com/title/tt0456500/,"['Comedy', 'Mystery']","['Tusshar Kapoor', 'Riteish Deshmukh', 'Isha K...","['Sangeeth Sivan', 'Sachin Yardi', 'Pankaj Tri...",Karan and Rahul are fashion designers and suri...,"['tape over mouth', 'indian sex comedy', 'fema...",6.1,456500.0,2005.0,3386.0,-NA-,16.0,20654.6,"['Comedy', 'Mystery']","[Tusshar Kapoor, Riteish Deshmukh, Isha Koppik...",28,53
1,Action Jackson,https://www.imdb.com/title/tt0403935/,"['Action', 'Crime', 'Thriller']","['Ajay Devgn', 'Shahid Kapoor', 'Prabhas', 'Ya...","['Prabhu Deva', 'A.C. Mughil', 'Shiraz Ahmed',...",In order to escape crime and clutches of a rut...,"['character name as title', 'nickname as title...",3.3,403935.0,2014.0,3600.0,-NA-,7.0,11880.0,"['Action', 'Crime', 'Thriller']","[Ajay Devgn, Shahid Kapoor, Prabhas, Yami Gaut...",29,167
2,Manto,https://www.imdb.com/title/tt6923462/,"['Biography', 'Drama']","['Nawazuddin Siddiqui', 'Rasika Dugal', 'Tahir...","['Nandita Das', 'Nandita Das', 'Sachin Agarwal...",The film is a biographical account of writer S...,['biopic'],7.4,6923462.0,2018.0,4043.0,7.7,3.0,29918.2,"['Biography', 'Drama']","[Nawazuddin Siddiqui, Rasika Dugal, Tahir Raj ...",88,119
3,Pataakha,https://www.imdb.com/title/tt8396128/,"['Action', 'Comedy', 'Drama']","['Sanya Malhotra', 'Radhika Madan', 'Sunil Gro...","['Vishal Bhardwaj', 'Vishal Bhardwaj', 'Charan...",Based on Charan Singh Pathik's short story Do ...,['based on story'],7.2,8396128.0,2018.0,3970.0,7.5,3.0,28584.0,"['Action', 'Comedy', 'Drama']","[Sanya Malhotra, Radhika Madan, Sunil Grover, ...",36,103
4,Kabul Express,https://www.imdb.com/title/tt0770214/,"['Adventure', 'Comedy', 'Drama', 'Thriller', '...","['John Abraham', 'Arshad Warsi', 'Salman Shahi...","['Kabir Khan', 'Kabir Khan', 'Sandeep Shrivast...",A thrilling story spanning 48 hours of five in...,"['pakistani', 'taliban', 'journalist', 'afghan...",6.8,770214.0,2006.0,3296.0,-NA-,15.0,22412.8,"['Adventure', 'Comedy', 'Drama', 'Thriller', '...","[John Abraham, Arshad Warsi, Salman Shahid, Ha...",40,192


Function to calculate popularity

In [86]:
def population(string):
  li = list(string.split(","))
  return len(li)

df_clean['Cast_popularity_score'] = df_clean['Cast'].apply( lambda x: population(str(x)))
df_clean['Crew_popularity_score'] = df_clean['Crew'].apply( lambda x: population(str(x)))

In [87]:
df_clean.head()

Unnamed: 0,Title,url,Genre,Cast,Crew,Plot summary,Plot keywords,IMDB Ratings,IMDB ID,Release year,IMDB votes,average_rating,age_of_content (in years),popularity_of_content,Genre_list,Cast_list,Cast_popularity_score,Crew_popularity_score
0,Kyaa Kool Hai Hum,https://www.imdb.com/title/tt0456500/,"['Comedy', 'Mystery']","['Tusshar Kapoor', 'Riteish Deshmukh', 'Isha K...","['Sangeeth Sivan', 'Sachin Yardi', 'Pankaj Tri...",Karan and Rahul are fashion designers and suri...,"['tape over mouth', 'indian sex comedy', 'fema...",6.1,456500.0,2005.0,3386.0,-NA-,16.0,20654.6,"['Comedy', 'Mystery']","[Tusshar Kapoor, Riteish Deshmukh, Isha Koppik...",28,53
1,Action Jackson,https://www.imdb.com/title/tt0403935/,"['Action', 'Crime', 'Thriller']","['Ajay Devgn', 'Shahid Kapoor', 'Prabhas', 'Ya...","['Prabhu Deva', 'A.C. Mughil', 'Shiraz Ahmed',...",In order to escape crime and clutches of a rut...,"['character name as title', 'nickname as title...",3.3,403935.0,2014.0,3600.0,-NA-,7.0,11880.0,"['Action', 'Crime', 'Thriller']","[Ajay Devgn, Shahid Kapoor, Prabhas, Yami Gaut...",29,167
2,Manto,https://www.imdb.com/title/tt6923462/,"['Biography', 'Drama']","['Nawazuddin Siddiqui', 'Rasika Dugal', 'Tahir...","['Nandita Das', 'Nandita Das', 'Sachin Agarwal...",The film is a biographical account of writer S...,['biopic'],7.4,6923462.0,2018.0,4043.0,7.7,3.0,29918.2,"['Biography', 'Drama']","[Nawazuddin Siddiqui, Rasika Dugal, Tahir Raj ...",88,119
3,Pataakha,https://www.imdb.com/title/tt8396128/,"['Action', 'Comedy', 'Drama']","['Sanya Malhotra', 'Radhika Madan', 'Sunil Gro...","['Vishal Bhardwaj', 'Vishal Bhardwaj', 'Charan...",Based on Charan Singh Pathik's short story Do ...,['based on story'],7.2,8396128.0,2018.0,3970.0,7.5,3.0,28584.0,"['Action', 'Comedy', 'Drama']","[Sanya Malhotra, Radhika Madan, Sunil Grover, ...",36,103
4,Kabul Express,https://www.imdb.com/title/tt0770214/,"['Adventure', 'Comedy', 'Drama', 'Thriller', '...","['John Abraham', 'Arshad Warsi', 'Salman Shahi...","['Kabir Khan', 'Kabir Khan', 'Sandeep Shrivast...",A thrilling story spanning 48 hours of five in...,"['pakistani', 'taliban', 'journalist', 'afghan...",6.8,770214.0,2006.0,3296.0,-NA-,15.0,22412.8,"['Adventure', 'Comedy', 'Drama', 'Thriller', '...","[John Abraham, Arshad Warsi, Salman Shahid, Ha...",40,192


In [88]:
df_final = df_clean.drop(['average_rating', 'url', 'Genre_list', 'Cast_list'],axis =1)

Re arranging columns


In [89]:
cols = df_final.columns
cols

Index(['Title', 'Genre', 'Cast', 'Crew', 'Plot summary', 'Plot keywords',
       'IMDB Ratings', 'IMDB ID', 'Release year', 'IMDB votes',
       'age_of_content (in years)', 'popularity_of_content',
       'Cast_popularity_score', 'Crew_popularity_score'],
      dtype='object')

In [90]:
df_final = df_final[['Title', 'IMDB ID', 'Release year', 'Genre', 'Cast', 'Crew', 
                    'Plot summary', 'Plot keywords', 'IMDB Ratings', 'IMDB votes',
                    'age_of_content (in years)', 'popularity_of_content', 
                    'Cast_popularity_score', 'Crew_popularity_score']]

In [91]:
df_final.head()

Unnamed: 0,Title,IMDB ID,Release year,Genre,Cast,Crew,Plot summary,Plot keywords,IMDB Ratings,IMDB votes,age_of_content (in years),popularity_of_content,Cast_popularity_score,Crew_popularity_score
0,Kyaa Kool Hai Hum,456500.0,2005.0,"['Comedy', 'Mystery']","['Tusshar Kapoor', 'Riteish Deshmukh', 'Isha K...","['Sangeeth Sivan', 'Sachin Yardi', 'Pankaj Tri...",Karan and Rahul are fashion designers and suri...,"['tape over mouth', 'indian sex comedy', 'fema...",6.1,3386.0,16.0,20654.6,28,53
1,Action Jackson,403935.0,2014.0,"['Action', 'Crime', 'Thriller']","['Ajay Devgn', 'Shahid Kapoor', 'Prabhas', 'Ya...","['Prabhu Deva', 'A.C. Mughil', 'Shiraz Ahmed',...",In order to escape crime and clutches of a rut...,"['character name as title', 'nickname as title...",3.3,3600.0,7.0,11880.0,29,167
2,Manto,6923462.0,2018.0,"['Biography', 'Drama']","['Nawazuddin Siddiqui', 'Rasika Dugal', 'Tahir...","['Nandita Das', 'Nandita Das', 'Sachin Agarwal...",The film is a biographical account of writer S...,['biopic'],7.4,4043.0,3.0,29918.2,88,119
3,Pataakha,8396128.0,2018.0,"['Action', 'Comedy', 'Drama']","['Sanya Malhotra', 'Radhika Madan', 'Sunil Gro...","['Vishal Bhardwaj', 'Vishal Bhardwaj', 'Charan...",Based on Charan Singh Pathik's short story Do ...,['based on story'],7.2,3970.0,3.0,28584.0,36,103
4,Kabul Express,770214.0,2006.0,"['Adventure', 'Comedy', 'Drama', 'Thriller', '...","['John Abraham', 'Arshad Warsi', 'Salman Shahi...","['Kabir Khan', 'Kabir Khan', 'Sandeep Shrivast...",A thrilling story spanning 48 hours of five in...,"['pakistani', 'taliban', 'journalist', 'afghan...",6.8,3296.0,15.0,22412.8,40,192


In [92]:
df_final['Genre'].dropna(axis=0)

0                                   ['Comedy', 'Mystery']
1                         ['Action', 'Crime', 'Thriller']
2                                  ['Biography', 'Drama']
3                           ['Action', 'Comedy', 'Drama']
4       ['Adventure', 'Comedy', 'Drama', 'Thriller', '...
                              ...                        
3818           ['Action', 'Drama', 'Mystery', 'Thriller']
3819    ['Action', 'Drama', 'History', 'Musical', 'Rom...
3820         ['Horror', 'Mystery', 'Romance', 'Thriller']
3821                                ['Comedy', 'Romance']
3822                                 ['Drama', 'Romance']
Name: Genre, Length: 3808, dtype: object

#Exporting Final Text Analysis to Final_textAnalysis.csv file

In [93]:
df_final.to_csv('/content/drive/MyDrive/Contelligenz-assignment/Final_textAnalysis.csv')

In [94]:
df_final = pd.read_csv('/content/drive/MyDrive/Contelligenz-assignment/Final_textAnalysis.csv')

In [95]:
type(df_final['Genre'].loc[2769])

str

#Converting pandas series to list to perform Exploratory data analysis.

In [96]:
cast_list = df_clean['Cast'].to_numpy().tolist()
len(cast_list)

3823

Converting list of lists to a single list , to find duplicates in overall list

In [97]:
def flattenList(lists):
  flat_list = []
  for element in lists:
    if type(element) is list:
      for item in element:
              flat_list.append(item)
    else:
      flat_list.append(element)
  return flat_list



In [98]:
cast_flat_list = flattenList(cast_list)
(cast_flat_list)

["['Tusshar Kapoor', 'Riteish Deshmukh', 'Isha Koppikar', 'Neha Dhupia', 'Anupam Kher', 'Shoma Anand', 'Razak Khan', 'Avtar Gill', 'Anil Nagrath', 'Rajpal Yadav', 'Bobby Darling', 'Jay Sean', 'Rishi Rich', 'Victoria', 'Juggy D.', 'Raj Zutshi', 'Rana Jung Bahadur', 'Dinyar Tirandaz', 'Roshan Tirandaz', 'Vijay Patkar', 'Sophiya Chaudhary', 'Amit Divatia', 'Dinesh Hingoo', 'Neena Kulkarni', 'Johnny Lever', 'Sushmita Mukherjee', 'Shree Rammy Pandey', 'Rannvijay Singh']",
 "['Ajay Devgn', 'Shahid Kapoor', 'Prabhas', 'Yami Gautam', 'Sonakshi Sinha', 'Prabhu Deva', 'Byron Gibson', 'Rocky Verma', 'Razak Khan', 'Shaji Chaudhary', 'Kunaal Roy Kapur', 'Manasvi Mamgai', 'Puru Rajkumar', 'Jeetu Verma', 'Aaron Brumfield', 'Pradeep Kabra', 'Damian Mavis', 'Ganesh Yadav', 'Rajesh Khattar', 'Ketan Karande', 'Anandraj', 'Sulabha Arya', 'Nalneesh Neel', 'Aarti Puri', 'Nirmal Soni', 'Shawar Ali', 'Nalneesh', 'Pramod Sharma', 'Pramod Sharma']",
 "['Nawazuddin Siddiqui', 'Rasika Dugal', 'Tahir Raj Bhasin', 

Finding duplicate value counts in the list.

In [99]:
def find_duplicated(x):
  duplicate_dict={} 
  duplicate_dict = {i:x.count(i) for i in x}
  return duplicate_dict

In [100]:
cast_duplicate_dict = find_duplicated(cast_flat_list)
cast_duplicate_dict

{"['Tusshar Kapoor', 'Riteish Deshmukh', 'Isha Koppikar', 'Neha Dhupia', 'Anupam Kher', 'Shoma Anand', 'Razak Khan', 'Avtar Gill', 'Anil Nagrath', 'Rajpal Yadav', 'Bobby Darling', 'Jay Sean', 'Rishi Rich', 'Victoria', 'Juggy D.', 'Raj Zutshi', 'Rana Jung Bahadur', 'Dinyar Tirandaz', 'Roshan Tirandaz', 'Vijay Patkar', 'Sophiya Chaudhary', 'Amit Divatia', 'Dinesh Hingoo', 'Neena Kulkarni', 'Johnny Lever', 'Sushmita Mukherjee', 'Shree Rammy Pandey', 'Rannvijay Singh']": 1,
 "['Ajay Devgn', 'Shahid Kapoor', 'Prabhas', 'Yami Gautam', 'Sonakshi Sinha', 'Prabhu Deva', 'Byron Gibson', 'Rocky Verma', 'Razak Khan', 'Shaji Chaudhary', 'Kunaal Roy Kapur', 'Manasvi Mamgai', 'Puru Rajkumar', 'Jeetu Verma', 'Aaron Brumfield', 'Pradeep Kabra', 'Damian Mavis', 'Ganesh Yadav', 'Rajesh Khattar', 'Ketan Karande', 'Anandraj', 'Sulabha Arya', 'Nalneesh Neel', 'Aarti Puri', 'Nirmal Soni', 'Shawar Ali', 'Nalneesh', 'Pramod Sharma', 'Pramod Sharma']": 1,
 "['Nawazuddin Siddiqui', 'Rasika Dugal', 'Tahir Raj Bha

#Finding top 10 Actors

In [101]:
d = Counter(cast_duplicate_dict)
cast_top_10 = d.most_common(10)
df_cast_top_10 = pd.DataFrame(data =cast_top_10, columns= ['Actor', 'Number of movies'])

In [102]:
df_cast_top_10.to_csv('/content/drive/MyDrive/Contelligenz-assignment/top_10_actors.csv')

#Finding Genre distribution of titles

In [103]:
df_genre_dist = df_clean[['Title','Genre']]

In [104]:
type(df_genre_dist['Genre'].loc[2768])

str

In [105]:
df_genre_dist['Genre_list'] = df_genre_dist['Genre'].apply (lambda x: strToList(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [106]:
df_genre_dist['Genre_list'] = df_genre_dist['Genre_list'].apply (lambda x: list(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [107]:
type(df_genre_dist['Genre_list'].loc[2669])

list

In [108]:
genre_list = df_genre_dist['Genre_list'].to_numpy().tolist()

In [109]:
type(genre_list)

list

In [110]:
genre_flat_list = flattenList(genre_list)
len(genre_flat_list)

9734

In [111]:
genre_duplicate_dict = find_duplicated(genre_flat_list)
(genre_duplicate_dict)

{'Action': 1334,
 'Adventure': 175,
 'Animation': 14,
 'Biography': 58,
 'Comedy': 1055,
 'Crime': 682,
 'Documentary': 4,
 'Drama': 2763,
 'Family': 356,
 'Fantasy': 110,
 'History': 54,
 'Horror': 111,
 'Music': 37,
 'Musical': 410,
 'Mystery': 220,
 'Reality-TV': 1,
 'Romance': 1387,
 'Sci-Fi': 33,
 'Short': 4,
 'Sport': 48,
 'Thriller': 832,
 'War': 40,
 'Western': 6}

In [112]:
d = Counter(genre_duplicate_dict)
genre_top = d.most_common()
genre_top

[('Drama', 2763),
 ('Romance', 1387),
 ('Action', 1334),
 ('Comedy', 1055),
 ('Thriller', 832),
 ('Crime', 682),
 ('Musical', 410),
 ('Family', 356),
 ('Mystery', 220),
 ('Adventure', 175),
 ('Horror', 111),
 ('Fantasy', 110),
 ('Biography', 58),
 ('History', 54),
 ('Sport', 48),
 ('War', 40),
 ('Music', 37),
 ('Sci-Fi', 33),
 ('Animation', 14),
 ('Western', 6),
 ('Short', 4),
 ('Documentary', 4),
 ('Reality-TV', 1)]

In [113]:
df_final_genre = pd.DataFrame(data = genre_top, columns= ['Genres', 'Number of movies'])

In [114]:
df_final_genre.head()

Unnamed: 0,Genres,Number of movies
0,Drama,2763
1,Romance,1387
2,Action,1334
3,Comedy,1055
4,Thriller,832


In [171]:
total_genres_list = df_genre_dist['Genre_list'].to_numpy().tolist()
type(total_genres_list)

list

In [172]:
genres_no_duplicate = df_final_genre['Genres'].to_numpy().tolist()
genres_no_duplicate

['Drama',
 'Romance',
 'Action',
 'Comedy',
 'Thriller',
 'Crime',
 'Musical',
 'Family',
 'Mystery',
 'Adventure',
 'Horror',
 'Fantasy',
 'Biography',
 'History',
 'Sport',
 'War',
 'Music',
 'Sci-Fi',
 'Animation',
 'Western',
 'Short',
 'Documentary',
 'Reality-TV']

In [116]:
del df_genre_dist['Genre']

In [173]:
def detectGenre(genres_list, genre_search):
  #print(genres_list, genre_search)
  return genre_search in genres_list


In [203]:
total_list = []


In [204]:
for i in genres_no_duplicate:
  ind_list = []
  for j in range(len(total_genres_list)):
    if detectGenre(total_genres_list[j],i):
      ind_list.append(df_final['Title'].loc[j])
  total_list.append(ind_list)



In [206]:
len(total_list[0])

2763

In [207]:
df_final_genre['Genre_distribution_of_title'] = total_list

In [208]:
df_final_genre.head()

Unnamed: 0,Genres,Number of movies,Genre_distribution_of_title
0,Drama,2763,"[Manto, Pataakha, Kabul Express, Welcome to Sa..."
1,Romance,1387,"[Tum Bin...: Love Will Find a Way, Jhoom Barab..."
2,Action,1334,"[Action Jackson, Pataakha, Chakravyuh, Ten, Gh..."
3,Comedy,1055,"[Kyaa Kool Hai Hum, Pataakha, Kabul Express, W..."
4,Thriller,832,"[Action Jackson, Kabul Express, Chakravyuh, Te..."


In [210]:
df_final_genre.to_csv('/content/drive/MyDrive/Contelligenz-assignment/Genre_distribution_of_titles.csv')