<div style="background-color: #8B0000; border-radius: 2px; border: #000000 solid; padding: 15px; font-size: 100%; text-align: center;">
    <h2 align="center" style="color: #FFFFFF;"><b>Movie Recommendation System Using Netflix Data</b></h2>
</div>

<div style="background-color: #000; border-radius: 2px; border: 1px solid #000; padding: 15px; font-size: 100%; text-align: center;">
    <img src="https://media.tenor.com/NerN41mjgV0AAAAC/netflix-intro.gif" alt="Netflix Intro Animation" style="max-width: 100%; height: auto;">
</div>


In [37]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from wordcloud import WordCloud
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.sparse import save_npz
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd

# Read the Netflix catalogue from the project-relative CSV into a DataFrame
netflix_data = pd.read_csv("dataset/netflix_data.csv")

# Preview the first rows to sanity-check that the file parsed as expected
netflix_data.head()


<div style="background-color: #8B0000; border-radius: 2px; border: #000000 solid; padding: 15px; font-size: 100%; text-align: center;">
    <h2 align="center" style="color: #FFFFFF;"><b>Comprehensive Data Pre-Processing and Exploratory Data Analysis</b></h2>
</div>


In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes
netflix_data.info()

In [None]:
# Tally the null entries found in every column
missing_values = netflix_data.isna().sum()

# Print the per-column totals
print(missing_values)


In [41]:
# Replace every missing value, in every column, with an empty string.
# The frame is modified in place, so all later cells see '' instead of NaN
# (including when they count or plot column values).
netflix_data.fillna('', inplace=True)


In [None]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each original column becomes one row of the summary
netflix_data.describe(include='all').T

In [None]:
import plotly.graph_objects as go

# Number of titles per release year, ordered chronologically
movie_counts = netflix_data['release_year'].value_counts().sort_index()

# One bar per year; the count label, colours and hover text are all set
# directly on the trace
year_bars = go.Bar(
    x=movie_counts.index,
    y=movie_counts.values,
    text=movie_counts.values,
    textposition='auto',
    marker={
        'color': 'rgba(255, 0, 0, 0.6)',
        'line': {'color': 'rgba(255, 0, 0, 0.9)', 'width': 1.5},
    },
    hovertemplate='Year: %{x}<br>Number of Movies: %{y}<extra></extra>',
)
fig = go.Figure(data=year_bars)

# Dark theme, axis titles/fonts and margins
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',
    paper_bgcolor='rgb(17, 17, 17)',
    font={'color': 'white'},
    title='Number of Movies Released Each Year',
    xaxis={
        'title': 'Year',
        'title_font': {'size': 14, 'family': 'Arial, sans-serif'},
        'tickangle': -45,  # slanted year labels stay readable
        'tickfont': {'size': 12, 'family': 'Arial, sans-serif'},
    },
    yaxis={
        'title': 'Number of Movies',
        'title_font': {'size': 14, 'family': 'Arial, sans-serif'},
        'tickfont': {'size': 12, 'family': 'Arial, sans-serif'},
    },
    margin={'l': 40, 'r': 40, 't': 60, 'b': 40},
)

fig.show()


In [None]:
import plotly.graph_objects as go

# How many titles fall under each content type
movie_type_counts = netflix_data['type'].value_counts()

# Donut-style pie; slice styling and hover text are set directly on the trace
type_pie = go.Pie(
    labels=movie_type_counts.index,
    values=movie_type_counts.values,
    textinfo='label+percent',
    insidetextorientation='radial',
    marker={
        'colors': ['#FF6F61', '#6B5B95', '#88B04B', '#F7CAC9', '#92A8D1'],
        'line': {'color': 'rgb(17, 17, 17)', 'width': 2},
    },
    hole=0.3,  # hollow centre turns the pie into a donut
    hovertemplate='Type: %{label}<br>Count: %{value}<br>Percentage: %{percent:.2%}<extra></extra>',
)
fig = go.Figure(data=type_pie)

# Dark theme with a horizontal legend anchored above the plot area
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',
    paper_bgcolor='rgb(17, 17, 17)',
    font={'color': 'white'},
    title='Distribution of Content Types',
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1.1,
        'xanchor': 'right',
        'x': 1,
    },
    margin={'l': 40, 'r': 40, 't': 60, 'b': 40},
)

fig.show()


In [None]:
import plotly.express as px

# Count titles per *individual* country.
# The raw 'country' cell can list several countries separated by commas, and
# missing values were replaced by '' earlier in the notebook, so counting the
# raw strings would rank combined strings (and possibly a blank entry)
# instead of countries. Split, trim and drop the blanks before counting.
country_counts = (
    netflix_data['country']
    .str.split(',')
    .explode()
    .str.strip()
    .loc[lambda names: names.ne('')]
    .value_counts()
)

# Keep the ten countries with the most titles
top_countries = country_counts.head(10)

# Create a treemap with a colorful color scale (one flat level, no parents)
fig = px.treemap(
    names=top_countries.index,
    parents=["" for _ in top_countries.index],
    values=top_countries.values,
    color=top_countries.values,
    color_continuous_scale='Viridis',  # Colorful color scale
    labels={'names': 'Country', 'values': 'Number of Movies'}
)

# Update layout with additional customizations
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background for the entire plot
    font=dict(color='white'),  # Font color for text
    title='Top 10 Countries with Highest Number of Movies',  # Title of the plot
    margin=dict(l=20, r=20, t=40, b=20)  # Adjust margins for better spacing
)

# Add hover text for each section
fig.update_traces(
    hovertemplate='Country: %{label}<br>Number of Movies: %{value}<extra></extra>'
)

# Show the plot
fig.show()


In [None]:
import plotly.express as px
import pandas as pd

# Count titles per *individual* country.
# A raw 'country' cell can hold several comma-separated countries (and '' where
# the value was missing). Such strings cannot be matched by
# locationmode='country names', so co-productions would never reach the map.
# Split, trim and drop the blanks so every listed country is credited.
country_movie_counts = (
    netflix_data['country']
    .str.split(',')
    .explode()
    .str.strip()
    .loc[lambda names: names.ne('')]
    .value_counts()
)

# Create DataFrame for plotting
data = pd.DataFrame({'Country': country_movie_counts.index, 'Movie Count': country_movie_counts.values})

# Upper end of the colour scale, computed once and reused for the colour bar ticks
max_count = data['Movie Count'].max()

# Create a choropleth map with a colorful color scale
fig = px.choropleth(
    data_frame=data,
    locations='Country',
    locationmode='country names',
    color='Movie Count',
    title='Number of Movies Released By Country',
    color_continuous_scale='Rainbow',  # Colorful color scale
    range_color=[0, max_count],
    labels={'Movie Count': 'Number of Movies'}
)

# Update layout with additional customizations
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background for the entire plot
    font=dict(color='white'),  # Font color for text
    coloraxis_colorbar=dict(
        title='Number of Movies',  # Title of the color bar
        tickvals=[0, max_count / 2, max_count],  # ticks at min, midpoint and max
        ticktext=[
            '0',
            f"{max_count / 2:.0f}",
            f"{max_count:.0f}"
        ]
    )  # Color bar customization
)

# Add hover text for each country
fig.update_traces(
    hovertemplate='Country: %{location}<br>Number of Movies: %{z}<extra></extra>'
)

# Show the plot
fig.show()


In [None]:
import plotly.graph_objects as go

# Frequency of each rating label, split into parallel label/count lists
rating_distribution = netflix_data['rating'].value_counts()
ratings = list(rating_distribution.index)
rating_counts = list(rating_distribution.values)

# One bar per rating with its count printed above the bar
rating_bars = go.Bar(
    x=ratings,
    y=rating_counts,
    marker_color='rgb(255, 69, 58)',
    text=rating_counts,
    textposition='outside',
)
fig = go.Figure(data=[rating_bars])

# Both axes share the same faint grid styling
grid_style = {'showgrid': True, 'gridcolor': 'rgba(255, 255, 255, 0.2)'}

fig.update_layout(
    title='Movie Ratings Distribution',
    xaxis_title='Rating',
    yaxis_title='Count',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0.7)',
    font={'color': 'white'},
    xaxis={'tickangle': -45, **grid_style},  # slanted labels for readability
    yaxis=dict(grid_style),
)

fig.show()


In [None]:
# Frequency of each duration label, computed once and split into parallel
# label/count lists for plotting
duration_distribution = netflix_data['duration'].value_counts()
durations = list(duration_distribution.index)
duration_counts = list(duration_distribution.values)

fig = go.Figure(data=[go.Bar(
    x=durations,
    y=duration_counts,
    marker_color='#E50914'
)])

fig.update_layout(
    title='Movie Durations Distribution',
    xaxis_title='Duration',  # the x-axis carries duration labels, not ratings
    yaxis_title='Count',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0.7)',
    font=dict(
        color='white'
    )
)

fig.show()

In [None]:
# Merge every title into a single space-separated string for the word cloud
titles = netflix_data['title'].values
text = ' '.join(titles)

cloud_builder = WordCloud(background_color='black', colormap='Reds')
wordcloud = cloud_builder.generate(text)

# Render the cloud as an image without axes
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 6))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Most Common Words in Netflix Titles', color='white')
plt.show()

In [None]:
# Merge every description into a single space-separated string for the word cloud
titles = netflix_data['description'].values
text = ' '.join(titles)

cloud_builder = WordCloud(background_color='black', colormap='Reds')
wordcloud = cloud_builder.generate(text)

# Render the cloud as an image without axes
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 6))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Most Common Words in Netflix Descriptions', color='white')
plt.show()

In [None]:
# Merge every 'listed_in' entry into a single space-separated string
genres = netflix_data['listed_in'].values

text = ' '.join(genres)

wordcloud = WordCloud(background_color='black', colormap='Reds').generate(text)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
# The words come from the 'listed_in' column, so the heading must not say "Descriptions"
plt.title('Most Common Words in Netflix Genres', color='white')
plt.show()

In [None]:
# Display the frame in its current state (after the missing-value fill)
netflix_data

<div style="background-color: #8B0000; border-radius: 2px; border: #000000 solid; padding: 15px; font-size: 100%; text-align: center;">
    <h2 align="center" style="color: #FFFFFF;"><b>Advanced Feature Engineering Techniques</b></h2>
</div>


In [53]:
# Text columns that feed the recommender, keyed by title.
# Chaining set_index() yields an independent frame, so the column
# reassignments in later cells never write into a selection of netflix_data.
new_data = (
    netflix_data[['title', 'type', 'director', 'cast', 'rating', 'listed_in', 'description']]
    .set_index('title')
)

In [None]:
# Preview the selected feature columns, now indexed by title
new_data.head()

In [55]:
class TextCleaner:
    """Small collection of string normalisers used to build text features."""

    def separate_text(self, texts):
        """Split a comma-separated string into unique lower-cased entries.

        Each entry is stripped of surrounding whitespace, empty entries are
        discarded, and duplicates are removed while keeping first-seen order,
        so the result is identical on every run.

        Parameters
        ----------
        texts : str
            Comma-separated values, e.g. ``"Drama, Comedy, drama"``.

        Returns
        -------
        str
            The unique entries joined by single spaces, e.g. ``"drama comedy"``.
        """
        pieces = (piece.strip().lower() for piece in texts.split(','))
        # dict.fromkeys de-duplicates like a set but preserves insertion order
        return ' '.join(dict.fromkeys(piece for piece in pieces if piece))

    def remove_space(self, texts):
        """Return ``texts`` lower-cased with every space character removed."""
        return texts.replace(' ', '').lower()

    def remove_punc(self, texts):
        """Lower-case ``texts``, delete ASCII punctuation and collapse whitespace.

        Punctuation is removed (not replaced by a space), so ``"it's"``
        becomes ``"its"``. Runs of whitespace become a single space and
        leading/trailing whitespace is dropped.
        """
        texts = texts.lower()
        texts = texts.translate(str.maketrans('', '', string.punctuation))
        return ' '.join(texts.split())

    def clean_text(self, texts):
        """Apply ``separate_text``, ``remove_space`` and ``remove_punc`` in turn.

        Because ``remove_space`` deletes the spaces that ``separate_text``
        inserted between entries, the result is one unbroken token, e.g.
        ``"Sci-Fi, Drama"`` -> ``"scifidrama"``.
        """
        texts = self.separate_text(texts)
        texts = self.remove_space(texts)
        texts = self.remove_punc(texts)
        return texts

In [56]:
# Single shared cleaner instance; its bound methods are passed to Series.apply below
cleaner = TextCleaner()

In [57]:
# Which normaliser to run on which column:
#   remove_space  -> the value becomes one lower-cased token
#   separate_text -> comma-separated lists become unique lower-cased entries
#   remove_punc   -> free text is lower-cased and stripped of punctuation
column_cleaners = {
    'type': cleaner.remove_space,
    'director': cleaner.separate_text,
    'cast': cleaner.separate_text,
    'rating': cleaner.remove_space,
    'listed_in': cleaner.separate_text,
    'description': cleaner.remove_punc,
}

for column, clean in column_cleaners.items():
    new_data[column] = new_data[column].apply(clean)

In [None]:
# Preview the columns after text normalisation
new_data.head()

In [59]:
# Concatenate every column of a row into one space-separated text blob
# ('BoW'), then keep that blob as the only column of the frame.
new_data = new_data.assign(
    BoW=lambda frame: frame.apply(
        lambda row: ' '.join(row.dropna().values),
        axis=1,
    )
)[['BoW']]

In [None]:
# Preview the combined 'BoW' text, the only column that remains
new_data.head()

In [61]:
# Learn a TF-IDF vocabulary from the combined text column and encode every
# title as one row of the resulting document-term matrix
tfid = TfidfVectorizer()
tfid_matrix = tfid.fit_transform(new_data['BoW'])

In [None]:
# Pairwise cosine similarity between the TF-IDF rows of all titles.
# With a single argument, scikit-learn compares the matrix against itself.
cosine_sim = cosine_similarity(tfid_matrix)
cosine_sim

In [None]:
# Display the similarity matrix again (same value the previous cell produced)
cosine_sim

In [64]:
# Persist the TF-IDF matrix in SciPy's native sparse format. scikit-learn
# documents the result of fit_transform as a sparse matrix, and np.save cannot
# store one as a plain array.
save_npz('tfidf_matrix.npz', tfid_matrix)

# Legacy artefact, kept so existing consumers of 'tfidf_matrix.npy' keep working.
# NOTE(review): np.save has to pickle a non-array object, so this file can only
# be read back with np.load(..., allow_pickle=True); prefer the .npz file above.
np.save('tfidf_matrix.npy', tfid_matrix)

# Dense pairwise similarity matrix
np.save('cosine_sim_matrix.npy', cosine_sim)

In [65]:
# Serialise the fitted vectorizer to disk so it can be reloaded later
with open('tfidf_vectorizer.pkl', mode='wb') as vectorizer_file:
    pickle.dump(tfid, vectorizer_file)

In [66]:
# Title/type lookup table for the recommender. An explicit copy keeps it
# independent of netflix_data, so code that later adds columns to it neither
# touches the source frame nor works on an ambiguous selection.
final_data = netflix_data[['title', 'type']].copy()

In [None]:
# Preview the title/type lookup table
final_data.head()

In [68]:
# Export the lookup table to CSV; index=False leaves the row index out of the file
final_data.to_csv('movie_data.csv',index=False)

<div style="background-color: #8B0000; border-radius: 2px; border: #000000 solid; padding: 15px; font-size: 100%; text-align: center;">
    <h2 align="center" style="color: #FFFFFF;"><b>Movie Recommendation System (FLIX-HUB)</b></h2>
</div>


In [70]:
import re


class FlixHub:
    """Content-based recommender over a title table and a similarity matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a ``title`` and a ``type`` column. Rows are
        addressed by position: row ``i`` is scored with ``cosine_sim[i]``.
    cosine_sim : indexable
        ``cosine_sim[i]`` must yield one similarity score per row of ``df``.
    """

    def __init__(self, df, cosine_sim):
        self.df = df
        self.cosine_sim = cosine_sim

    def recommendation(self, title, total_result=5, threshold=0.5):
        """Return the titles most similar to ``title``, split by content type.

        Parameters
        ----------
        title : str
            Title to look up; resolved with :meth:`find_id`.
        total_result : int, default 5
            Number of most-similar rows to consider. The two returned lists
            together hold at most this many entries.
        threshold : float, default 0.5
            Accepted for backward compatibility; it is not applied to the
            results.

        Returns
        -------
        tuple[list[str], list[str]]
            ``(similar_movies, similar_tv_shows)``. Each entry is formatted as
            ``"<rank>. <title>"`` with ranks restarting at 1 in each list.
            Both lists are empty when ``title`` cannot be found.
        """
        idx = self.find_id(title)
        if idx < 0:
            # Indexing cosine_sim with -1 would silently rank against the last row.
            return [], []

        # assign() builds a new frame, so the caller's DataFrame is never modified.
        scored = self.df.assign(similarity=self.cosine_sim[idx])

        # Remove the query row by position instead of assuming it sorts first:
        # another row can tie with it (e.g. a duplicate entry).
        other_rows = [pos for pos in range(len(scored)) if pos != idx]
        top = (
            scored.iloc[other_rows]
            .sort_values(by='similarity', ascending=False, kind='stable')
            .iloc[:max(total_result, 0)]
        )

        movies = top.loc[top['type'] == 'Movie', 'title']
        tv_shows = top.loc[top['type'] == 'TV Show', 'title']
        return self._numbered(movies), self._numbered(tv_shows)

    @staticmethod
    def _numbered(titles):
        """Format titles as ``"1. Title"``, ``"2. Title"``, ... in the given order."""
        return ['{}. {}'.format(rank, name) for rank, name in enumerate(titles, start=1)]

    def find_id(self, name):
        """Return the row position of the title best matching ``name``.

        ``name`` is always treated as literal text, never as a regular
        expression. Matching is attempted from strictest to loosest, and the
        first hit of the first successful strategy wins:

        1. exact match,
        2. exact match ignoring case,
        3. case-sensitive substring,
        4. substring ignoring case.

        Returns ``-1`` when ``name`` is empty or nothing matches.
        """
        if not name:
            return -1

        titles = [str(title) for title in self.df['title']]
        # re.escape keeps characters such as '(', '+' or '?' from acting as regex syntax
        literal = re.compile(re.escape(name), flags=re.IGNORECASE)

        strategies = (
            lambda title: title == name,
            lambda title: literal.fullmatch(title) is not None,
            lambda title: name in title,
            lambda title: literal.search(title) is not None,
        )
        for is_match in strategies:
            for position, title in enumerate(titles):
                if is_match(title):
                    return position
        return -1

In [None]:
# Build the recommender and ask for the ten closest titles to the query
flix_hub = FlixHub(final_data, cosine_sim)
movies, tv_shows = flix_hub.recommendation(
    'My Little Pony: A New Generation',
    total_result=10,
    threshold=0.5,
)

# Print each group under its own heading, one entry per line
for heading, entries in (
    ('Similar Movie(s) list:', movies),
    ('\nSimilar TV_show(s) list:', tv_shows),
):
    print(heading)
    for entry in entries:
        print(entry)

In [None]:
# Build the recommender and ask for the ten closest titles to the query
flix_hub = FlixHub(final_data, cosine_sim)
movies, tv_shows = flix_hub.recommendation(
    'Vendetta: Truth, Lies and The Mafia',
    total_result=10,
    threshold=0.5,
)

# Print each group under its own heading, one entry per line
for heading, entries in (
    ('Similar Movie(s) list:', movies),
    ('\nSimilar TV_show(s) list:', tv_shows),
):
    print(heading)
    for entry in entries:
        print(entry)