In [77]:
import pandas as pd
import sqlite3 as ss
import matplotlib.pyplot as plt
import os

## Bollywood_Movies Dataset 

In [78]:
bollywood_path = os.path.join("Downloads/movies_data", "bollywood_movies.sqlite")
con = ss.connect(bollywood_path)
bollywood_df = pd.read_sql_query("SELECT * from Movies ", con)
bollywood_df.info()   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               55 non-null     int64  
 1   MovieID             55 non-null     object 
 2   Title               55 non-null     object 
 3   Director            55 non-null     object 
 4   Genre               54 non-null     object 
 5   ReleaseYear         55 non-null     int64  
 6   Budget (Crores)     43 non-null     float64
 7   BoxOffice (Crores)  46 non-null     float64
 8   Rating              55 non-null     float64
 9   Duration (minutes)  55 non-null     int64  
 10  LeadActor           55 non-null     object 
 11  LeadActress         55 non-null     object 
 12  Language            55 non-null     object 
 13  ProductionCompany   55 non-null     object 
dtypes: float64(3), int64(3), object(8)
memory usage: 6.1+ KB


In [79]:
bollywood_df.shape


(55, 14)

In [80]:
bollywood_df         


Unnamed: 0,index,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV001,Lagaan,Ashutosh Gowariker,,2001,,100.0,8.1,224,Aamir Khan,Gracy Singh,Hindi,Aamir Khan Productions
1,1,MOV002,Kabhi Khushi Kabhie Gham...,Karan Johar,Family Drama,2001,50.0,100.0,7.9,210,Shah Rukh Khan,Kajol,Hindi,Dharma Productions
2,2,MOV003,M.S. Dhoni: The Untold Story,Neeraj Pandey,Biographical Sports Drama,2016,,215.0,7.9,184,Sushant Singh Rajput,Kiara Advani,Hindi,Fox Star Studios
3,3,MOV004,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,2017,250.0,1810.0,8.2,171,Prabhas,Anushka Shetty,Telugu,Arka Media Works
4,4,MOV005,Chennai Express,Rohit Shetty,Action Comedy,2013,,423.0,6.4,141,Shah Rukh Khan,Deepika Padukone,Hindi,Red Chillies Entertainment
5,5,MOV006,Bharat,Ali Abbas Zafar,Drama,2019,100.0,325.0,6.8,156,Salman Khan,Katrina Kaif,Hindi,Reel Life Productions
6,6,MOV007,Dangal,Nitesh Tiwari,Biographical Sports Drama,2016,70.0,2140.0,8.1,161,Aamir Khan,Fatima Sana Shaikh,Hindi,Aamir Khan Productions
7,7,MOV008,Sarkar (Tamil),A. R. Murugadoss,Action Thriller,2005,,,7.5,173,Vijay,Keerthy Suresh,Tamil,Kalaignar TV
8,8,MOV009,PK,Rajkumar Hirani,Satirical Science Fiction Comedy,2014,85.0,792.0,8.1,153,Aamir Khan,Anushka Sharma,Hindi,Vinod Chopra Films
9,9,MOV010,2.0 (Tamil),S. Shankar,Science Fiction Action,2018,550.0,800.0,6.9,147,Rajinikanth,Amy Jackson,Tamil,Lyca Productions


In [81]:
bollywood_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               55 non-null     int64  
 1   MovieID             55 non-null     object 
 2   Title               55 non-null     object 
 3   Director            55 non-null     object 
 4   Genre               54 non-null     object 
 5   ReleaseYear         55 non-null     int64  
 6   Budget (Crores)     43 non-null     float64
 7   BoxOffice (Crores)  46 non-null     float64
 8   Rating              55 non-null     float64
 9   Duration (minutes)  55 non-null     int64  
 10  LeadActor           55 non-null     object 
 11  LeadActress         55 non-null     object 
 12  Language            55 non-null     object 
 13  ProductionCompany   55 non-null     object 
dtypes: float64(3), int64(3), object(8)
memory usage: 6.1+ KB


### Handling Missing Values
From the info we see that null values are present in the following columns:
- `Genre`
- `Budget (Crores)`
- `BoxOffice (Crores)`
#### Cleaning Strategy:
- For `Genre` since its object we can replace nulls with `'Unknown'`.
- For `Budget (Crores)` and `BoxOffice (Crores)` since it is integer,fill nulls with the mean of the respective columns.


In [82]:
'''
Filling the genre column null values with "Unknown 
'''

bollywood_df['Genre'] = bollywood_df['Genre'].fillna('Unknown')
bollywood_df.head(10)
bollywood_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               55 non-null     int64  
 1   MovieID             55 non-null     object 
 2   Title               55 non-null     object 
 3   Director            55 non-null     object 
 4   Genre               55 non-null     object 
 5   ReleaseYear         55 non-null     int64  
 6   Budget (Crores)     43 non-null     float64
 7   BoxOffice (Crores)  46 non-null     float64
 8   Rating              55 non-null     float64
 9   Duration (minutes)  55 non-null     int64  
 10  LeadActor           55 non-null     object 
 11  LeadActress         55 non-null     object 
 12  Language            55 non-null     object 
 13  ProductionCompany   55 non-null     object 
dtypes: float64(3), int64(3), object(8)
memory usage: 6.1+ KB


In [83]:
"""
Clean column name by removing leading/trailing whitespaes
"""
bollywood_df.columns = bollywood_df.columns.str.strip()
bollywood_df.head()

Unnamed: 0,index,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV001,Lagaan,Ashutosh Gowariker,Unknown,2001,,100.0,8.1,224,Aamir Khan,Gracy Singh,Hindi,Aamir Khan Productions
1,1,MOV002,Kabhi Khushi Kabhie Gham...,Karan Johar,Family Drama,2001,50.0,100.0,7.9,210,Shah Rukh Khan,Kajol,Hindi,Dharma Productions
2,2,MOV003,M.S. Dhoni: The Untold Story,Neeraj Pandey,Biographical Sports Drama,2016,,215.0,7.9,184,Sushant Singh Rajput,Kiara Advani,Hindi,Fox Star Studios
3,3,MOV004,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,2017,250.0,1810.0,8.2,171,Prabhas,Anushka Shetty,Telugu,Arka Media Works
4,4,MOV005,Chennai Express,Rohit Shetty,Action Comedy,2013,,423.0,6.4,141,Shah Rukh Khan,Deepika Padukone,Hindi,Red Chillies Entertainment


In [84]:
bollywood_df[bollywood_df['Budget (Crores)'].isnull()]

Unnamed: 0,index,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV001,Lagaan,Ashutosh Gowariker,Unknown,2001,,100.0,8.1,224,Aamir Khan,Gracy Singh,Hindi,Aamir Khan Productions
2,2,MOV003,M.S. Dhoni: The Untold Story,Neeraj Pandey,Biographical Sports Drama,2016,,215.0,7.9,184,Sushant Singh Rajput,Kiara Advani,Hindi,Fox Star Studios
4,4,MOV005,Chennai Express,Rohit Shetty,Action Comedy,2013,,423.0,6.4,141,Shah Rukh Khan,Deepika Padukone,Hindi,Red Chillies Entertainment
7,7,MOV008,Sarkar (Tamil),A. R. Murugadoss,Action Thriller,2005,,,7.5,173,Vijay,Keerthy Suresh,Tamil,Kalaignar TV
11,11,MOV012,Dilwale Dulhania Le Jayenge,Aditya Chopra,Romance,1995,,,7.7,189,Shah Rukh Khan,Kajol,Hindi,Yash Raj Films
23,23,MOV024,Avengers: Endgame (Dubbed),"Anthony Russo, Joseph Russo",Superhero,2019,,,8.4,181,Robert Downey Jr.,Scarlett Johansson,"English (Dubbed in Telugu, Kannada, Hindi)",Marvel Studios
32,32,MOV033,96 (Kannada Dubbed),C. Premkumar,Romantic Drama,2019,,,7.9,158,Vijay Sethupathi,Trisha,Kannada (Dubbed from Tamil),Madras Enterprises
34,34,MOV035,Dabangg 3 (Dubbed),Prabhu Deva,Action Comedy,2019,,,5.8,159,Salman Khan,Sonakshi Sinha,Telugu (Dubbed from Hindi),S. K. Film Enterprises
36,36,MOV037,Shylock (Kannada Dubbed),Ajai Vasudev,Action Thriller,2020,,,6.8,164,Mammootty,Rajkiran,Kannada (Dubbed from Malayalam),Vrindaavan Films
37,37,MOV038,Love Aaj Kal (Dubbed),Imtiaz Ali,Romantic Drama,2020,,,4.4,141,Kartik Aaryan,Sara Ali Khan,Telugu (Dubbed from Hindi),Reliance Entertainment


In [85]:
"""
Filling the NaN columns in the Budget Column with mean of the column
"""
mean_budget = bollywood_df['Budget (Crores)'].mean()
bollywood_df['Budget (Crores)'] = bollywood_df['Budget (Crores)'].fillna(mean_budget)
bollywood_df.head()

Unnamed: 0,index,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV001,Lagaan,Ashutosh Gowariker,Unknown,2001,91.511628,100.0,8.1,224,Aamir Khan,Gracy Singh,Hindi,Aamir Khan Productions
1,1,MOV002,Kabhi Khushi Kabhie Gham...,Karan Johar,Family Drama,2001,50.0,100.0,7.9,210,Shah Rukh Khan,Kajol,Hindi,Dharma Productions
2,2,MOV003,M.S. Dhoni: The Untold Story,Neeraj Pandey,Biographical Sports Drama,2016,91.511628,215.0,7.9,184,Sushant Singh Rajput,Kiara Advani,Hindi,Fox Star Studios
3,3,MOV004,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,2017,250.0,1810.0,8.2,171,Prabhas,Anushka Shetty,Telugu,Arka Media Works
4,4,MOV005,Chennai Express,Rohit Shetty,Action Comedy,2013,91.511628,423.0,6.4,141,Shah Rukh Khan,Deepika Padukone,Hindi,Red Chillies Entertainment


In [86]:
bollywood_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               55 non-null     int64  
 1   MovieID             55 non-null     object 
 2   Title               55 non-null     object 
 3   Director            55 non-null     object 
 4   Genre               55 non-null     object 
 5   ReleaseYear         55 non-null     int64  
 6   Budget (Crores)     55 non-null     float64
 7   BoxOffice (Crores)  46 non-null     float64
 8   Rating              55 non-null     float64
 9   Duration (minutes)  55 non-null     int64  
 10  LeadActor           55 non-null     object 
 11  LeadActress         55 non-null     object 
 12  Language            55 non-null     object 
 13  ProductionCompany   55 non-null     object 
dtypes: float64(3), int64(3), object(8)
memory usage: 6.1+ KB


In [87]:
"""
Filling the NaN columns in the BoxOffice Column with mean of the column
"""
mean_boxoffice = bollywood_df['BoxOffice (Crores)'].mean()
bollywood_df['BoxOffice (Crores)'] = bollywood_df['BoxOffice (Crores)'].fillna(mean_boxoffice)
bollywood_df.head()

Unnamed: 0,index,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV001,Lagaan,Ashutosh Gowariker,Unknown,2001,91.511628,100.0,8.1,224,Aamir Khan,Gracy Singh,Hindi,Aamir Khan Productions
1,1,MOV002,Kabhi Khushi Kabhie Gham...,Karan Johar,Family Drama,2001,50.0,100.0,7.9,210,Shah Rukh Khan,Kajol,Hindi,Dharma Productions
2,2,MOV003,M.S. Dhoni: The Untold Story,Neeraj Pandey,Biographical Sports Drama,2016,91.511628,215.0,7.9,184,Sushant Singh Rajput,Kiara Advani,Hindi,Fox Star Studios
3,3,MOV004,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,2017,250.0,1810.0,8.2,171,Prabhas,Anushka Shetty,Telugu,Arka Media Works
4,4,MOV005,Chennai Express,Rohit Shetty,Action Comedy,2013,91.511628,423.0,6.4,141,Shah Rukh Khan,Deepika Padukone,Hindi,Red Chillies Entertainment


In [88]:
bollywood_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               55 non-null     int64  
 1   MovieID             55 non-null     object 
 2   Title               55 non-null     object 
 3   Director            55 non-null     object 
 4   Genre               55 non-null     object 
 5   ReleaseYear         55 non-null     int64  
 6   Budget (Crores)     55 non-null     float64
 7   BoxOffice (Crores)  55 non-null     float64
 8   Rating              55 non-null     float64
 9   Duration (minutes)  55 non-null     int64  
 10  LeadActor           55 non-null     object 
 11  LeadActress         55 non-null     object 
 12  Language            55 non-null     object 
 13  ProductionCompany   55 non-null     object 
dtypes: float64(3), int64(3), object(8)
memory usage: 6.1+ KB


In [89]:
"""
Dropping duplicate rows
"""
bollywood_df = bollywood_df.drop_duplicates()
bollywood_df.shape

(55, 14)

## Tollywood_movies Dataset

In [100]:
pip install openpyxl

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [101]:
tollywood_df = pd.read_excel('Downloads/movies_data/tollywood_movies.xlsx')
tollywood_df

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV004,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,,250,1810,8.2,171.0,Prabhas,Anushka Shetty,Telugu,Arka Media Works
1,1,MOV021,Bahubali: The Beginning,S. S. Rajamouli,Epic Fantasy Action,2015.0,180,650,8.1,159.0,Prabhas,Tamannaah,Telugu,
2,2,MOV023,Sye Raa Narasimha Reddy,Surender Reddy,Historical Action,2019.0,200,265,7.1,167.0,Chiranjeevi,Nayanthara,Telugu,Konidela Production Company
3,3,MOV025,Jersey,Gowtam Tinnanuri,Sports Drama,2019.0,20,45,7.8,,Nani,Shraddha Srinath,,Sithara Entertainments
4,4,MOV027,Geetha Govindam,Parasuram,Romantic Comedy,2018.0,10,130,,148.0,Vijay Deverakonda,Rashmika Mandanna,,GA2 Pictures
5,5,MOV029,Dear Comrade,Bharat Kamma,Romantic Drama,2019.0,15,35,7.1,170.0,Vijay Deverakonda,Rashmika Mandanna,,Mythri Movie Makers
6,6,MOV034,Sarileru Neekevvaru,Anil Ravipudi,Action Comedy,,75,260,7.1,169.0,Mahesh Babu,Rashmika Mandanna,Telugu,AK Entertainments
7,7,MOV036,Bheeshma,Venky Kudumula,Romantic Comedy,2020.0,20,50,7.4,145.0,Nithiin,Rashmika Mandanna,Telugu,Sithara Entertainments
8,8,MOV044,Baahubali: The Beginning,S. S. Rajamouli,Epic Fantasy Action,2015.0,180,650,8.1,159.0,Prabhas,Tamannaah,Telugu,Arka Media Works
9,9,MOV052,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,2017.0,250,1810,8.2,171.0,Prabhas,Anushka Shetty,Telugu,Arka Media Works


In [102]:
tollywood_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          10 non-null     int64  
 1   MovieID             10 non-null     object 
 2   Title               10 non-null     object 
 3   Director            10 non-null     object 
 4   Genre               10 non-null     object 
 5   ReleaseYear         8 non-null      float64
 6   Budget (Crores)     10 non-null     int64  
 7   BoxOffice (Crores)  10 non-null     int64  
 8   Rating              9 non-null      float64
 9   Duration (minutes)  9 non-null      float64
 10  LeadActor           10 non-null     object 
 11  LeadActress         10 non-null     object 
 12  Language            7 non-null      object 
 13  ProductionCompany   9 non-null      object 
dtypes: float64(3), int64(3), object(8)
memory usage: 1.2+ KB


In [103]:
tollywood_df.shape

(10, 14)

### Handling Missing Values 
From the dataset info we can find null values are present in the following columns:
- `ReleaseYear`
- `Rating`
- `Duration (minutes)`
- `Language`
- `ProductionCompany`

#### Cleaning Strategy:
- For numeric columns `Rating`, `Duration (minutes)`, null values  can be replaced  with the mean of each respective column.
- For numeric columns `ReleaseYear` can be replaced with median value
- Since `Language` is an object column , we will fill null values with telegu since this is a tollywood dataset that deals with telegu movies only.
- `ProductionCompany`which is also a object colun can be filled with 'Unknown'


In [104]:
"""
Dropping duplicate rows
"""
tollywood_df = tollywood_df.drop_duplicates()
tollywood_df.shape

(10, 14)

In [105]:
# Strip column name spaces just in case
df.columns = df.columns.str.strip()

In [110]:
""" 
 Filling Median for the null values of ReleaseYear
"""
tollywood_df['ReleaseYear'] = tollywood_df['ReleaseYear'].fillna(tollywood_df['ReleaseYear'].median())  
tollywood_df.head()


Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV004,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,2018.5,250,1810,8.2,171.0,Prabhas,Anushka Shetty,Telugu,Arka Media Works
1,1,MOV021,Bahubali: The Beginning,S. S. Rajamouli,Epic Fantasy Action,2015.0,180,650,8.1,159.0,Prabhas,Tamannaah,Telugu,
2,2,MOV023,Sye Raa Narasimha Reddy,Surender Reddy,Historical Action,2019.0,200,265,7.1,167.0,Chiranjeevi,Nayanthara,Telugu,Konidela Production Company
3,3,MOV025,Jersey,Gowtam Tinnanuri,Sports Drama,2019.0,20,45,7.8,,Nani,Shraddha Srinath,,Sithara Entertainments
4,4,MOV027,Geetha Govindam,Parasuram,Romantic Comedy,2018.0,10,130,,148.0,Vijay Deverakonda,Rashmika Mandanna,,GA2 Pictures


In [112]:
"""
 Filling Mean for the null values of Rating.
"""
tollywood_df['Rating'] = tollywood_df['Rating'].fillna(tollywood_df['Rating'].mean()) 
tollywood_df.head()

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV004,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,2018.5,250,1810,8.2,171.0,Prabhas,Anushka Shetty,Telugu,Arka Media Works
1,1,MOV021,Bahubali: The Beginning,S. S. Rajamouli,Epic Fantasy Action,2015.0,180,650,8.1,159.0,Prabhas,Tamannaah,Telugu,
2,2,MOV023,Sye Raa Narasimha Reddy,Surender Reddy,Historical Action,2019.0,200,265,7.1,167.0,Chiranjeevi,Nayanthara,Telugu,Konidela Production Company
3,3,MOV025,Jersey,Gowtam Tinnanuri,Sports Drama,2019.0,20,45,7.8,,Nani,Shraddha Srinath,,Sithara Entertainments
4,4,MOV027,Geetha Govindam,Parasuram,Romantic Comedy,2018.0,10,130,7.677778,148.0,Vijay Deverakonda,Rashmika Mandanna,,GA2 Pictures


In [114]:
""" 
Filling Mean for the null values of Duration
"""
tollywood_df['Duration (minutes)'] = tollywood_df['Duration (minutes)'].fillna(tollywood_df['Duration (minutes)'].mean())
tollywood_df.head()

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV004,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,2018.5,250,1810,8.2,171.0,Prabhas,Anushka Shetty,Telugu,Arka Media Works
1,1,MOV021,Bahubali: The Beginning,S. S. Rajamouli,Epic Fantasy Action,2015.0,180,650,8.1,159.0,Prabhas,Tamannaah,Telugu,
2,2,MOV023,Sye Raa Narasimha Reddy,Surender Reddy,Historical Action,2019.0,200,265,7.1,167.0,Chiranjeevi,Nayanthara,Telugu,Konidela Production Company
3,3,MOV025,Jersey,Gowtam Tinnanuri,Sports Drama,2019.0,20,45,7.8,162.111111,Nani,Shraddha Srinath,,Sithara Entertainments
4,4,MOV027,Geetha Govindam,Parasuram,Romantic Comedy,2018.0,10,130,7.677778,148.0,Vijay Deverakonda,Rashmika Mandanna,,GA2 Pictures


In [116]:
"""
 Filling Language with 'Telugu' for the null values in Language column since the dataset is tollywood movies dataset
"""
tollywood_df['Language'] = tollywood_df['Language'].fillna('Telugu')  
tollywood_df.head()

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV004,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,2018.5,250,1810,8.2,171.0,Prabhas,Anushka Shetty,Telugu,Arka Media Works
1,1,MOV021,Bahubali: The Beginning,S. S. Rajamouli,Epic Fantasy Action,2015.0,180,650,8.1,159.0,Prabhas,Tamannaah,Telugu,
2,2,MOV023,Sye Raa Narasimha Reddy,Surender Reddy,Historical Action,2019.0,200,265,7.1,167.0,Chiranjeevi,Nayanthara,Telugu,Konidela Production Company
3,3,MOV025,Jersey,Gowtam Tinnanuri,Sports Drama,2019.0,20,45,7.8,162.111111,Nani,Shraddha Srinath,Telugu,Sithara Entertainments
4,4,MOV027,Geetha Govindam,Parasuram,Romantic Comedy,2018.0,10,130,7.677778,148.0,Vijay Deverakonda,Rashmika Mandanna,Telugu,GA2 Pictures


In [118]:
 """
 Fill ProductionCompany with 'Unknown'
 """
tollywood_df['ProductionCompany'] = tollywood_df['ProductionCompany'].fillna('Unknown') 
tollywood_df.head()

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,0,MOV004,Baahubali 2: The Conclusion,S. S. Rajamouli,Epic Fantasy Action,2018.5,250,1810,8.2,171.0,Prabhas,Anushka Shetty,Telugu,Arka Media Works
1,1,MOV021,Bahubali: The Beginning,S. S. Rajamouli,Epic Fantasy Action,2015.0,180,650,8.1,159.0,Prabhas,Tamannaah,Telugu,Unknown
2,2,MOV023,Sye Raa Narasimha Reddy,Surender Reddy,Historical Action,2019.0,200,265,7.1,167.0,Chiranjeevi,Nayanthara,Telugu,Konidela Production Company
3,3,MOV025,Jersey,Gowtam Tinnanuri,Sports Drama,2019.0,20,45,7.8,162.111111,Nani,Shraddha Srinath,Telugu,Sithara Entertainments
4,4,MOV027,Geetha Govindam,Parasuram,Romantic Comedy,2018.0,10,130,7.677778,148.0,Vijay Deverakonda,Rashmika Mandanna,Telugu,GA2 Pictures


In [119]:
tollywood_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          10 non-null     int64  
 1   MovieID             10 non-null     object 
 2   Title               10 non-null     object 
 3   Director            10 non-null     object 
 4   Genre               10 non-null     object 
 5   ReleaseYear         10 non-null     float64
 6   Budget (Crores)     10 non-null     int64  
 7   BoxOffice (Crores)  10 non-null     int64  
 8   Rating              10 non-null     float64
 9   Duration (minutes)  10 non-null     float64
 10  LeadActor           10 non-null     object 
 11  LeadActress         10 non-null     object 
 12  Language            10 non-null     object 
 13  ProductionCompany   10 non-null     object 
dtypes: float64(3), int64(3), object(8)
memory usage: 1.2+ KB


## Indian_movies Dataset


In [130]:
indian_df = pd.read_csv('Downloads/movies_data/indian_movies.csv')
indian_df

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,7,MOV008,Sarkar (Tamil),A. R. Murugadoss,Action Thriller,2005.0,,,7.5,173.0,Vijay,Keerthy Suresh,Tamil,Kalaignar TV
1,9,MOV010,2.0 (Tamil),S. Shankar,Science Fiction Action,2018.0,550.0,800.0,6.9,147.0,Rajinikanth,Amy Jackson,Tamil,Lyca Productions
2,21,MOV022,K.G.F: Chapter 1,Prashanth Neel,Action Drama,2018.0,50.0,250.0,7.9,156.0,Yash,Srinidhi Shetty,Kannada,Hombale Films
3,23,MOV024,Avengers: Endgame (Dubbed),"Anthony Russo, Joseph Russo",Superhero,2019.0,,,8.4,181.0,Robert Downey Jr.,Scarlett Johansson,"English (Dubbed in Telugu, Kannada, Hindi)",Marvel Studios
4,25,MOV026,Pailwaan,S. Krishna,Sports Action,2019.0,30.0,55.0,6.8,,Sudeep,Aakanksha Singh,Kannada,RRR Motion Pictures
5,27,MOV028,Yajamana,"V. Harikrishna, P. Kumar",Action Drama,2019.0,20.0,40.0,6.3,164.0,Darshan,Rashmika Mandanna,Kannada,Media House Studio
6,29,MOV030,Gentleman,Jadesh Kumar,Action Thriller,2019.0,10.0,20.0,6.1,145.0,Puneeth Rajkumar,Nishvika Naidu,Kannada,Sri Jagadguru Movies
7,30,MOV031,Saaho,Sujeeth,Action Thriller,2019.0,300.0,450.0,,170.0,Prabhas,Shraddha Kapoor,"Telugu (Dubbed in Hindi, Kannada)",UV Creations
8,31,MOV032,Kavaludaari,Hemanth M. Rao,Thriller,2019.0,5.0,10.0,7.4,118.0,Rishi,Anant Nag,Kannada,PNK Productions
9,32,MOV033,96 (Kannada Dubbed),C. Premkumar,Romantic Drama,2019.0,,,7.9,158.0,Vijay Sethupathi,Trisha,Kannada (Dubbed from Tamil),Madras Enterprises


In [131]:
indian_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          18 non-null     int64  
 1   MovieID             18 non-null     object 
 2   Title               18 non-null     object 
 3   Director            18 non-null     object 
 4   Genre               18 non-null     object 
 5   ReleaseYear         17 non-null     float64
 6   Budget (Crores)     10 non-null     float64
 7   BoxOffice (Crores)  10 non-null     float64
 8   Rating              17 non-null     float64
 9   Duration (minutes)  17 non-null     float64
 10  LeadActor           18 non-null     object 
 11  LeadActress         18 non-null     object 
 12  Language            18 non-null     object 
 13  ProductionCompany   18 non-null     object 
dtypes: float64(5), int64(1), object(8)
memory usage: 2.1+ KB


In [132]:
indian_df.shape

(18, 14)

### Handling Missing Values 
- `ReleaseYear`
- `Budget (Crores)`
- `BoxOffice (Crores)`
- `Rating`
- `Duration (minutes)`

#### Cleaning Strategy:
- For all numeric columns (`Budget (Crores)`, `BoxOffice (Crores)`, `Rating`, `Duration (minutes)`), we will fill missing values using the mean and for `ReleaseYear`, we will fill with median:


In [133]:
"""
Dropping duplicate rows
"""
indian_df = indian_df.drop_duplicates()
indian_df.shape

(18, 14)

In [126]:
# Strip column name spaces just in case
df.columns = df.columns.str.strip()

In [135]:
"""
Replain the null values of the ReleaseYear with median
"""
indian_df['ReleaseYear'] = indian_df['ReleaseYear'].fillna(indian_df['ReleaseYear'].median())
indian_df.head()

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,7,MOV008,Sarkar (Tamil),A. R. Murugadoss,Action Thriller,2005.0,,,7.5,173.0,Vijay,Keerthy Suresh,Tamil,Kalaignar TV
1,9,MOV010,2.0 (Tamil),S. Shankar,Science Fiction Action,2018.0,550.0,800.0,6.9,147.0,Rajinikanth,Amy Jackson,Tamil,Lyca Productions
2,21,MOV022,K.G.F: Chapter 1,Prashanth Neel,Action Drama,2018.0,50.0,250.0,7.9,156.0,Yash,Srinidhi Shetty,Kannada,Hombale Films
3,23,MOV024,Avengers: Endgame (Dubbed),"Anthony Russo, Joseph Russo",Superhero,2019.0,,,8.4,181.0,Robert Downey Jr.,Scarlett Johansson,"English (Dubbed in Telugu, Kannada, Hindi)",Marvel Studios
4,25,MOV026,Pailwaan,S. Krishna,Sports Action,2019.0,30.0,55.0,6.8,,Sudeep,Aakanksha Singh,Kannada,RRR Motion Pictures


In [136]:
"""
Filling the null values of Budget column with mean of the column 
"""
indian_df['Budget (Crores)'] = indian_df['Budget (Crores)'].fillna(indian_df['Budget (Crores)'].mean())
indian_df.head()

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,7,MOV008,Sarkar (Tamil),A. R. Murugadoss,Action Thriller,2005.0,102.1,,7.5,173.0,Vijay,Keerthy Suresh,Tamil,Kalaignar TV
1,9,MOV010,2.0 (Tamil),S. Shankar,Science Fiction Action,2018.0,550.0,800.0,6.9,147.0,Rajinikanth,Amy Jackson,Tamil,Lyca Productions
2,21,MOV022,K.G.F: Chapter 1,Prashanth Neel,Action Drama,2018.0,50.0,250.0,7.9,156.0,Yash,Srinidhi Shetty,Kannada,Hombale Films
3,23,MOV024,Avengers: Endgame (Dubbed),"Anthony Russo, Joseph Russo",Superhero,2019.0,102.1,,8.4,181.0,Robert Downey Jr.,Scarlett Johansson,"English (Dubbed in Telugu, Kannada, Hindi)",Marvel Studios
4,25,MOV026,Pailwaan,S. Krishna,Sports Action,2019.0,30.0,55.0,6.8,,Sudeep,Aakanksha Singh,Kannada,RRR Motion Pictures


In [137]:
"""
Filling the null values of Boxoffice column with mean of the column 
"""
indian_df['BoxOffice (Crores)'] = indian_df['BoxOffice (Crores)'].fillna(indian_df['BoxOffice (Crores)'].mean())
indian_df.head()

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,7,MOV008,Sarkar (Tamil),A. R. Murugadoss,Action Thriller,2005.0,102.1,190.0,7.5,173.0,Vijay,Keerthy Suresh,Tamil,Kalaignar TV
1,9,MOV010,2.0 (Tamil),S. Shankar,Science Fiction Action,2018.0,550.0,800.0,6.9,147.0,Rajinikanth,Amy Jackson,Tamil,Lyca Productions
2,21,MOV022,K.G.F: Chapter 1,Prashanth Neel,Action Drama,2018.0,50.0,250.0,7.9,156.0,Yash,Srinidhi Shetty,Kannada,Hombale Films
3,23,MOV024,Avengers: Endgame (Dubbed),"Anthony Russo, Joseph Russo",Superhero,2019.0,102.1,190.0,8.4,181.0,Robert Downey Jr.,Scarlett Johansson,"English (Dubbed in Telugu, Kannada, Hindi)",Marvel Studios
4,25,MOV026,Pailwaan,S. Krishna,Sports Action,2019.0,30.0,55.0,6.8,,Sudeep,Aakanksha Singh,Kannada,RRR Motion Pictures


In [139]:
"""
Filling the null values of the rating column with mean of the column 
"""
indian_df['Rating'] = indian_df['Rating'].fillna(indian_df['Rating'].mean())
indian_df.head()

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,7,MOV008,Sarkar (Tamil),A. R. Murugadoss,Action Thriller,2005.0,102.1,190.0,7.5,173.0,Vijay,Keerthy Suresh,Tamil,Kalaignar TV
1,9,MOV010,2.0 (Tamil),S. Shankar,Science Fiction Action,2018.0,550.0,800.0,6.9,147.0,Rajinikanth,Amy Jackson,Tamil,Lyca Productions
2,21,MOV022,K.G.F: Chapter 1,Prashanth Neel,Action Drama,2018.0,50.0,250.0,7.9,156.0,Yash,Srinidhi Shetty,Kannada,Hombale Films
3,23,MOV024,Avengers: Endgame (Dubbed),"Anthony Russo, Joseph Russo",Superhero,2019.0,102.1,190.0,8.4,181.0,Robert Downey Jr.,Scarlett Johansson,"English (Dubbed in Telugu, Kannada, Hindi)",Marvel Studios
4,25,MOV026,Pailwaan,S. Krishna,Sports Action,2019.0,30.0,55.0,6.8,,Sudeep,Aakanksha Singh,Kannada,RRR Motion Pictures


In [140]:
"""
Filling the null values of the duration column with average of durations 
"""
indian_df['Duration (minutes)'] = indian_df['Duration (minutes)'].fillna(indian_df['Duration (minutes)'].mean())
indian_df.head()

Unnamed: 0.1,Unnamed: 0,MovieID,Title,Director,Genre,ReleaseYear,Budget (Crores),BoxOffice (Crores),Rating,Duration (minutes),LeadActor,LeadActress,Language,ProductionCompany
0,7,MOV008,Sarkar (Tamil),A. R. Murugadoss,Action Thriller,2005.0,102.1,190.0,7.5,173.0,Vijay,Keerthy Suresh,Tamil,Kalaignar TV
1,9,MOV010,2.0 (Tamil),S. Shankar,Science Fiction Action,2018.0,550.0,800.0,6.9,147.0,Rajinikanth,Amy Jackson,Tamil,Lyca Productions
2,21,MOV022,K.G.F: Chapter 1,Prashanth Neel,Action Drama,2018.0,50.0,250.0,7.9,156.0,Yash,Srinidhi Shetty,Kannada,Hombale Films
3,23,MOV024,Avengers: Endgame (Dubbed),"Anthony Russo, Joseph Russo",Superhero,2019.0,102.1,190.0,8.4,181.0,Robert Downey Jr.,Scarlett Johansson,"English (Dubbed in Telugu, Kannada, Hindi)",Marvel Studios
4,25,MOV026,Pailwaan,S. Krishna,Sports Action,2019.0,30.0,55.0,6.8,152.647059,Sudeep,Aakanksha Singh,Kannada,RRR Motion Pictures


In [141]:
indian_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          18 non-null     int64  
 1   MovieID             18 non-null     object 
 2   Title               18 non-null     object 
 3   Director            18 non-null     object 
 4   Genre               18 non-null     object 
 5   ReleaseYear         18 non-null     float64
 6   Budget (Crores)     18 non-null     float64
 7   BoxOffice (Crores)  18 non-null     float64
 8   Rating              18 non-null     float64
 9   Duration (minutes)  18 non-null     float64
 10  LeadActor           18 non-null     object 
 11  LeadActress         18 non-null     object 
 12  Language            18 non-null     object 
 13  ProductionCompany   18 non-null     object 
dtypes: float64(5), int64(1), object(8)
memory usage: 2.1+ KB


### Downloading the cleaned Excel Files

In [144]:
bollywood_df.to_csv("cleaned_bollywood.csv", index=False)
tollywood_df.to_csv("cleaned_tollywood.csv", index=False)
indian_df.to_csv("cleaned_indian.csv", index=False)