In [9]:
# Read the IMDb India Top 250 CSV export into a DataFrame named `df`.
import pandas as pd

# Location of the CSV file that the rest of the notebook analyses.
file_path = '/content/IMdB_India_Top250.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# 1. Data Inspection
a. Load the dataset into a pandas DataFrame and display the first 10 rows.

b. Use the info() function to get an overview of the dataset, including data types and non-null counts.

c. Display the summary statistics for numerical columns using describe().

In [10]:
# a. Preview: the first ten rows of the raw frame
first_ten_rows = df.head(10)
print(first_ten_rows)

# b. Structure: column dtypes and non-null counts (info() prints directly)
df.info()

# c. Descriptive statistics; describe() covers the numeric columns by default
numeric_summary = df.describe()
print(numeric_summary)

          Movie name  Year of release        Watch  hour   Rating Ratedby  \
0          12th Fail             2023  2 hours 27 minutes     8.9    126K   
1           Gol Maal             1979            2 hours      8.5     20K   
2           Maharaja             2024  2 hours 30 minutes     8.6     37K   
3            Nayakan             1987  2 hours 25 minutes     8.7     25K   
4   The World of Apu             1959   1 hour 45 minutes     8.4     17K   
5         Anbe Sivam             2003  2 hours 40 minutes     8.6     26K   
6  Pariyerum Perumal             2018  2 hours 34 minutes     8.7     19K   
7           3 Idiots             2009  2 hours 50 minutes     8.4    441K   
8              #Home             2021  2 hours 38 minutes     8.8     16K   
9       Black Friday             2004  2 hours 23 minutes     8.4     22K   

           Film Industry                   Genre              Director  \
0      Bollywood (Hindi)        Drama, Biography    Vidhu Vinod Chopra   
1   

# 2. Data Cleaning
a. Identify and handle any missing values in the dataset. Drop rows or fill missing values as appropriate.

b. Remove any duplicate rows from the dataset. Ensure that the columns related to rankings, years, or ratings are in the correct numerical format.



In [11]:
# a. Identify and handle missing values: count the nulls in every column.
print(df.isnull().sum())

# The counts printed for this dataset are all zero, so nothing is dropped or filled.

# b. Remove duplicate rows and ensure numeric columns.
# Take an explicit copy so the column assignments below (and in later cells)
# write to an independent frame; assigning onto the bare result of
# drop_duplicates() can trigger SettingWithCopyWarning on pandas versions
# without copy-on-write.
df_cleaned = df.drop_duplicates().copy()

# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising.
for numeric_column in ('Rating', 'Year of release'):
    df_cleaned[numeric_column] = pd.to_numeric(df_cleaned[numeric_column], errors='coerce')

Movie name               0
Year of release          0
Watch  hour              0
Rating                   0
Ratedby                  0
Film Industry            0
Genre                    0
Director                 0
Box office collection    0
User reviews             0
Awards                   0
Description              0
Streaming platform       0
dtype: int64


# 3. Filtering and Sorting
a. Filter the dataset to display only movies released after the year 2000.

b. Sort the DataFrame by IMDB Rating in descending order.

c. Create a new DataFrame that contains only movies with an IMDB rating greater than 8.5.

In [12]:
# a. Keep only the rows whose release year is strictly later than 2000
released_after_2000 = df_cleaned['Year of release'] > 2000
movies_after_2000 = df_cleaned[released_after_2000]
print(movies_after_2000.head())

# b. Order the whole frame from the highest rating to the lowest
sorted_by_rating = df_cleaned.sort_values('Rating', ascending=False)
print(sorted_by_rating.head())

# c. Separate frame holding only the movies rated strictly above 8.5
rated_above_8_5 = df_cleaned['Rating'] > 8.5
high_rated_movies = df_cleaned[rated_above_8_5]
print(high_rated_movies.head())

          Movie name  Year of release        Watch  hour   Rating Ratedby  \
0          12th Fail             2023  2 hours 27 minutes     8.9    126K   
2           Maharaja             2024  2 hours 30 minutes     8.6     37K   
5         Anbe Sivam             2003  2 hours 40 minutes     8.6     26K   
6  Pariyerum Perumal             2018  2 hours 34 minutes     8.7     19K   
7           3 Idiots             2009  2 hours 50 minutes     8.4    441K   

       Film Industry             Genre             Director  \
0  Bollywood (Hindi)  Drama, Biography   Vidhu Vinod Chopra   
2  Kollywood (Tamil)      Crime, Drama  Nithilan Saminathan   
5  Kollywood (Tamil)     Drama, Comedy            Sundar C.   
6  Kollywood (Tamil)     Drama, Social        Mari Selvaraj   
7  Bollywood (Hindi)     Comedy, Drama      Rajkumar Hirani   

  Box office collection  User reviews                    Awards  \
0           $138,288.00           945  23 wins & 32 nominations   
2           $975,543.00 

# 4. Group By Operations
a. Group the movies by the Year column and calculate the average IMDB rating for each year.

b. Group the data by Director and count the number of movies each director has in the top 250.

c. Find the highest-rated movie for each year by grouping the data by Year and selecting the movie with the highest rating in each group.

In [13]:
# Ratings grouped by release year; reused for the mean (a) and the per-year maximum (c)
ratings_by_year = df_cleaned.groupby('Year of release')['Rating']

# a. Mean rating for every release year
avg_rating_per_year = ratings_by_year.mean()
print(avg_rating_per_year)

# b. Number of rows (movies) attributed to each director
movies_per_director = df_cleaned.groupby('Director').size()
print(movies_per_director)

# c. idxmax() yields, per year, the row label of the top-rated movie;
#    .loc then pulls those full rows out of the cleaned frame
top_row_labels = ratings_by_year.idxmax()
highest_rated_per_year = df_cleaned.loc[top_row_labels]
print(highest_rated_per_year)


Year of release
1955    8.200000
1956    8.200000
1957    8.700000
1958    7.900000
1959    8.400000
1960    8.100000
1964    8.100000
1965    8.300000
1968    8.100000
1970    7.900000
1971    8.100000
1975    8.133333
1979    8.500000
1982    8.300000
1983    8.300000
1987    8.400000
1988    8.250000
1989    8.800000
1991    8.750000
1992    8.300000
1993    8.700000
1994    8.000000
1995    8.250000
1996    8.100000
1997    8.150000
1998    8.300000
1999    8.175000
2000    8.133333
2001    8.200000
2002    8.133333
2003    8.066667
2004    8.071429
2005    8.175000
2006    8.077778
2007    8.020000
2008    8.150000
2009    8.075000
2010    8.142857
2011    8.083333
2012    8.061538
2013    8.063636
2014    8.185714
2015    8.161538
2016    8.207143
2017    8.050000
2018    8.314286
2019    8.193750
2020    8.200000
2021    8.323077
2022    8.257143
2023    8.250000
2024    8.225000
Name: Rating, dtype: float64
Director
A.L. Vijay              1
A.R. Murugadoss         1
Aamir Khan

# 5. Column Creation and Manipulation
a. Create a new column named Rating Category that categorizes movies as "Excellent" if the IMDB rating is 9.0 or above, "Good" if between 8.0 and 9.0, and "Average" if below 8.0.

b. Extract the Year from the movie title (assuming the year is part of the title) and create a new column Extracted Year.

c. Create a new column that combines the Title and Director into a single string, separated by a hyphen.

In [14]:
# a. Create a Rating Category column:
#      rating <  8.0        -> 'Average'
#      8.0 <= rating < 9.0  -> 'Good'
#      rating >= 9.0        -> 'Excellent'
# right=False makes every bin closed on the left, so a rating of exactly 8.0 is
# 'Good' and exactly 9.0 is 'Excellent'. With pd.cut's default right-closed bins
# those two boundary values would fall into the lower category. The outer edges
# are infinite so that a rating of 10.0 still receives a label; missing ratings
# stay missing.
df_cleaned['Rating Category'] = pd.cut(
    df_cleaned['Rating'],
    bins=[float('-inf'), 8.0, 9.0, float('inf')],
    labels=['Average', 'Good', 'Excellent'],
    right=False,
)
print(df_cleaned[['Movie name', 'Rating', 'Rating Category']].head())

# b. Extract Year from the title (Skipping since year is already in the dataset)

# c. Create a column combining 'Title' and 'Director', separated by " - "
df_cleaned['Title-Director'] = df_cleaned['Movie name'] + " - " + df_cleaned['Director']
print(df_cleaned[['Movie name', 'Director', 'Title-Director']].head())

         Movie name  Rating Rating Category
0         12th Fail     8.9            Good
1          Gol Maal     8.5            Good
2          Maharaja     8.6            Good
3           Nayakan     8.7            Good
4  The World of Apu     8.4            Good
         Movie name              Director                   Title-Director
0         12th Fail    Vidhu Vinod Chopra   12th Fail - Vidhu Vinod Chopra
1          Gol Maal  Hrishikesh Mukherjee  Gol Maal - Hrishikesh Mukherjee
2          Maharaja   Nithilan Saminathan   Maharaja - Nithilan Saminathan
3           Nayakan           Mani Ratnam            Nayakan - Mani Ratnam
4  The World of Apu          Satyajit Ray  The World of Apu - Satyajit Ray
