# <div style="color:purple;display:inline-block;border-radius:5px;background-color:#E6F7FF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:purple;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b> </b> Import Libraries</p></div>


In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import math

# Theme overrides passed to seaborn: light-blue figure/axes background,
# black axes text and ticks, a faint grey grid, and a serif font.
rc = {
    **dict.fromkeys(("axes.facecolor", "figure.facecolor"), "#E6F7FF"),
    **dict.fromkeys(
        ("axes.edgecolor", "axes.labelcolor", "xtick.color", "ytick.color"),
        "#000000",
    ),
    "grid.color": "#EBEBE7",
    "grid.alpha": 0.4,
    "font.family": "serif",
}

# Apply the overrides to the seaborn theme.
sns.set(rc=rc)

from colorama import Style, Fore

# Bright-coloured text prefixes (red, blue, magenta, yellow) and the reset code.
red, blu, mgt, gld = (
    Style.BRIGHT + shade
    for shade in (Fore.RED, Fore.BLUE, Fore.MAGENTA, Fore.YELLOW)
)
res = Style.RESET_ALL

# <div style="color:purple;display:inline-block;border-radius:5px;background-color:#E6F7FF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:purple;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b> </b>Data Understanding and Exploration</p></div>


<html>
<body> 
    <ol style="list-style-type: disc;">
        <li style="color: maroon; font-size: 18px;">Load and inspect the datasets (movies.csv and ratings.csv)</li>
        <li style="color: maroon; font-size: 18px;">Check for missing values, data types, and basic statistics</li>
        <li style="color: maroon; font-size: 18px;">Explore the distribution of ratings</li>
        <li style="color: maroon; font-size: 18px;">Explore the genres and their distribution</li>
    </ol>
</body>
</html>
 


In [None]:
# Load the movies table, then preview its first rows as a blue-styled table.
movies_df = pd.read_csv(
    '/kaggle/input/bengali-movie-dataset/Bengali Movie Dataset/movies.csv'
)
movies_df.head().style.set_properties(
    **{
        'background-color': 'royalblue',
        'color': 'white',
        'border-color': '#8b8c8c',
    }
)

In [None]:
# Load the ratings table, then preview its first rows as an orange-styled table.
ratings_df = pd.read_csv(
    '/kaggle/input/bengali-movie-dataset/Bengali Movie Dataset/ratings.csv'
)
ratings_df.head().style.set_properties(
    **{
        'background-color': 'orange',
        'color': 'white',
        'border-color': '#8b8c8c',
    }
)

In [None]:
# Check for missing values in both tables.
# A notebook cell only renders its final expression, so two bare expressions
# would show the ratings counts and silently drop the movies counts. Stacking
# both per-column null counts into one Series (outer index level = table name)
# displays them together.
pd.concat(
    [movies_df.isnull().sum(), ratings_df.isnull().sum()],
    keys=['movies_df', 'ratings_df'],
)

In [None]:
# Histogram of the rating column (30 bins) with a KDE curve on top.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(ratings_df['rating'], bins=30, kde=True, color='red', ax=ax)

# Shared text styling for the title and the axis labels.
heading_font = dict(fontsize=16, fontweight='bold', color='darkgreen')
axis_font = dict(fontsize=12, fontweight='bold', color='darkblue')
ax.set_title('Distribution of Ratings', **heading_font)
ax.set_xlabel('Rating', **axis_font)
ax.set_ylabel('Count', **axis_font)

# Write a PNG copy to the working directory, then render the figure.
fig.savefig('Distribution of Ratings.png')
plt.show()

In [None]:
# Per-column null counts for each table.
movies_missing = movies_df.isnull().sum()
ratings_missing = ratings_df.isnull().sum()

# The ten most frequent values of the `genres` column.
genres_count = movies_df['genres'].value_counts().head(10)

# Print each heading on its own line followed by its table; the leading "\n"
# in the later headings leaves a blank line between sections.
print("Missing Values in movies_df:", movies_missing, sep="\n")
print("\nMissing Values in ratings_df:", ratings_missing, sep="\n")
print("\nTop 10 Movie Genres:", genres_count, sep="\n")

### Observation:

<html>
<body> 
    <ol style="list-style-type: disc;">
        <li style="color: Fuchsia; font-size: 18px;">There are missing values in the movies dataset and no missing values in the ratings dataset.</li>
        <li style="color: Fuchsia; font-size: 18px;">The distribution of ratings is left-skewed (negatively skewed): most ratings cluster around 4.0, with a tail toward lower values.</li>
        <li style="color: Fuchsia; font-size: 18px;">The top 10 movie genres and their respective counts are printed.</li>
    </ol>
</body>
</html>



# <div style="color:purple;display:inline-block;border-radius:5px;background-color:#E6F7FF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:purple;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b> </b>Data Preprocessing</p></div>

<html>
<body> 
    <ol style="list-style-type: disc;">
        <li style="color: maroon; font-size: 18px;">Merge datasets on 'movieId' to create a unified dataset</li>
        <li style="color: maroon; font-size: 18px;">Handle any missing or erroneous values (if any)</li>
        <li style="color: maroon; font-size: 18px;">Perform data type conversions if necessary</li>
        <li style="color: maroon; font-size: 18px;">Encode categorical variables (e.g., platform_Name, genres)</li>
    </ol>
</body>
</html>


In [None]:
# Join each rating to its movie row on 'movieId' (pandas' default inner join),
# then expand 'platform_Name' and 'genres' into indicator columns whose names
# start with 'platform_' and 'genre_'.
merged_df = ratings_df.merge(movies_df, on='movieId').pipe(
    pd.get_dummies,
    columns=['platform_Name', 'genres'],
    prefix=['platform', 'genre'],
)

# Preview the first rows of the combined table.
print("Merged Dataset:")
merged_df.head()


### Observation:
<html>
<body> 
    <ol style="list-style-type: disc;">
        <li style="color: Fuchsia; font-size: 18px;">We've merged the ratings and movies datasets on 'movieId' to create a unified dataset.</li>
        <li style="color: Fuchsia; font-size: 18px;">We've encoded categorical variables ('platform_Name' and 'genres') into binary columns.</li>
    </ol>
</body>
</html>
 

# <div style="color:purple;display:inline-block;border-radius:5px;background-color:#E6F7FF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:purple;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b> </b>Exploratory Data Analysis (EDA)</p></div>

<html>
<body> 
    <ol style="list-style-type: disc;">
        <li style="color: maroon; font-size: 18px;">Visualize the distribution of ratings</li>
        <li style="color: maroon; font-size: 18px;">Analyze the popularity of different genres</li>
        <li style="color: maroon; font-size: 18px;">Explore the distribution of movies across different platforms</li>
        <li style="color: maroon; font-size: 18px;">Identify the most popular directors and starring actors/actresses</li>
    </ol>
</body>
</html>

In [None]:
# Visualize the distribution of ratings, split by the 'platform_Chorki'
# indicator column (30 bins, KDE overlay per hue level).
plt.figure(figsize=(12, 6))
ax = sns.histplot(data=merged_df, x='rating', hue='platform_Chorki', bins=30, kde=True, palette='Set1')
plt.title('Distribution of Ratings by Platform (Chorki)', fontsize=16, fontweight = 'bold', color = 'darkgreen')
plt.xlabel('Rating', fontsize=12, fontweight = 'bold', color = 'darkblue')
plt.ylabel('Count', fontsize=12, fontweight = 'bold', color = 'darkblue')

# Keep the legend seaborn builds for the hue levels and only retitle it.
# Rebuilding it with plt.legend(labels=['Chorki']) would leave a single entry
# called 'Chorki', so the two hue levels could no longer be told apart.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('On Chorki')

plt.savefig('Distribution of Ratings by Platform (Chorki).png')
plt.show()

In [None]:
# Horizontal bar chart of the ten most frequent values in the `genres` column.
genre_counts = movies_df['genres'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=genre_counts.values, y=genre_counts.index, palette='viridis', ax=ax)

# Shared text styling for the two axis labels.
axis_font = dict(fontsize=12, fontweight='bold', color='darkblue')
ax.set_title('Popularity of Movie Genres', fontsize=16, fontweight='bold', color='darkgreen')
ax.set_xlabel('Count', **axis_font)
ax.set_ylabel('Genre', **axis_font)

# Write a PNG copy to the working directory, then render the figure.
fig.savefig('Popularity of Movie Genres.png')
plt.show()

In [None]:
# Explore the distribution of movies across the Chorki indicator.
# merged_df holds one row per rating, so counting its rows directly would
# weight each movie by how many ratings it received. Keeping one row per
# 'movieId' first makes the bars count movies, as the title states.
# Only movies present in merged_df (i.e. with at least one rating) are counted.
platform_counts = (
    merged_df.drop_duplicates(subset='movieId')['platform_Chorki'].value_counts()
)

plt.figure(figsize=(8, 4))
sns.barplot(x=platform_counts.index, y=platform_counts.values, palette='pastel')
plt.title('Distribution of Movies on Chorki Platform', fontsize=14, fontweight = 'bold', color = 'darkgreen')
plt.xlabel('Platform', fontsize=12, fontweight = 'bold', color = 'darkblue')
plt.ylabel('Count', fontsize=12, fontweight = 'bold', color = 'darkblue')
plt.savefig('Distribution of Movies on Chorki Platform.png')
plt.show()


# <div style="color:purple;display:inline-block;border-radius:5px;background-color:#E6F7FF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:purple;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b> </b>Descriptive Statistics</p></div>

<html>
<body> 
    <ol style="list-style-type: disc;">
        <li style="color: maroon; font-size: 18px;">Calculate summary statistics for ratings (mean, median, standard deviation, etc.)</li>
        <li style="color: maroon; font-size: 18px;">Identify the highest and lowest rated movies</li>
        <li style="color: maroon; font-size: 18px;">Analyze the frequency of ratings</li>
    </ol>
</body>
</html> 


In [None]:
# Descriptive statistics of the rating column.
rating_stats = merged_df['rating'].describe()

# Mean rating per title, computed once and reused for both rankings.
mean_rating_by_title = merged_df.groupby('title')['rating'].mean()
top_rated_movies = mean_rating_by_title.nlargest(5)
lowest_rated_movies = mean_rating_by_title.nsmallest(5)

# Put the three Series side by side. Their indexes differ (statistic names
# vs. movie titles), so pandas aligns them on the union of labels and leaves
# NaN wherever a row does not apply to a column.
summary_columns = {
    'Summary Statistics for Ratings': rating_stats,
    'Top 5 Highest Rated Movies': top_rated_movies,
    'Top 5 Lowest Rated Movies': lowest_rated_movies,
}
summary_df = pd.DataFrame(summary_columns)

# Define a style function to apply color to the table
def highlight_max(s, color='lightgreen'):
    """Return per-cell CSS that highlights the maximum value(s) of ``s``.

    Written for ``Styler.apply``, which by default passes one column at a time.

    Parameters
    ----------
    s : pandas.Series
        The values of one column.
    color : str, default 'lightgreen'
        CSS colour used as the background of the highlighted cell(s).

    Returns
    -------
    list of str
        ``'background-color: <color>'`` for every entry equal to ``s.max()``
        and ``''`` for every other entry (NaN entries never match).
    """
    is_max = s == s.max()
    return [f'background-color: {color}' if v else '' for v in is_max]

# Highlight each column's maximum, then style header cells (grey background,
# bold) and body cells (12px font).
header_css = {'selector': 'th', 'props': 'background-color: #f2f2f2; font-weight: bold;'}
cell_css = {'selector': 'td', 'props': 'font-size: 12px;'}
styled_summary = (
    summary_df.style
    .apply(highlight_max)
    .set_table_styles([header_css, cell_css])
)

# Final expression of the cell: renders the styled table.
styled_summary


# <div style="color:purple;display:inline-block;border-radius:5px;background-color:#E6F7FF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:purple;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b> </b>User Behavior Analysis</p></div>

<html>
<body> 
    <ol style="list-style-type: disc;">
        <li style="color: maroon; font-size: 18px;">Analyze user behavior (e.g., most active users, average number of ratings per user)</li>
        <li style="color: maroon; font-size: 18px;">Identify user preferences based on genres, directors, or actors/actresses</li>
    </ol>
</body>
</html>


In [None]:
# Number of ratings recorded for each user, and the mean of those counts.
ratings_per_user = merged_df.groupby('userId')['rating'].count()
average_ratings_per_user = ratings_per_user.mean()

# The five userIds that occur most often in the merged table.
active_users = merged_df['userId'].value_counts().head(5)

# One-row table: the average next to a comma-separated list of the top user ids.
top_user_ids = ', '.join(active_users.index.astype(str))
user_stats_df = pd.DataFrame({
    'Average Ratings per User': [average_ratings_per_user],
    'Top 5 Most Active Users': [top_user_ids],
})

# Style helper for the user-statistics table.
def highlight_max(s):
    """Return per-cell CSS marking the maximum value(s) of ``s`` in yellow.

    Note: a function with the same name is defined in the Descriptive
    Statistics cell; from this cell onward this definition replaces it.

    Parameters
    ----------
    s : pandas.Series
        The values of one column (as passed by ``Styler.apply``).

    Returns
    -------
    list of str
        ``'background-color: yellow'`` where the entry equals ``s.max()``,
        ``''`` everywhere else.
    """
    peak = s.max()
    return ['background-color: yellow' if value == peak else '' for value in s]

# Highlight each column's maximum, then style header cells (grey background,
# bold) and body cells (12px font).
header_css = {'selector': 'th', 'props': 'background-color: #f2f2f2; font-weight: bold;'}
cell_css = {'selector': 'td', 'props': 'font-size: 12px;'}
styled_user_stats = (
    user_stats_df.style
    .apply(highlight_max)
    .set_table_styles([header_css, cell_css])
)

# Final expression of the cell: renders the styled table.
styled_user_stats


In [None]:
# Heatmap of the pairwise correlations between the rating and two genre
# indicator columns ('genre_DRAMA' and 'genre_ROMANTIC THRILLER').
corr_columns = ['rating', 'genre_DRAMA', 'genre_ROMANTIC THRILLER']
ratings_genres_corr = merged_df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(ratings_genres_corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation between Ratings and Genres', fontsize=16, fontweight='bold', color='darkgreen')

# Write a PNG copy to the working directory, then render the figure.
fig.savefig('Correlation between Ratings and Genres.png')
plt.show()

<div class="alert alert-block alert-info"> 📌 "Take some time to explore and create a notebook based on your insights. Your contributions offer valuable perspectives. If you find the dataset interesting, an upvote would be greatly appreciated. Your support encourages collaboration and knowledge sharing. Thank you!"😊 </div>