# Recommendation System

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Read the dataset from a CSV file located relative to the current working directory.
df = pd.read_csv('netflix_data (1).csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Preview the last rows as well (catches truncated or trailing junk records).
df.tail()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column labels available in the dataset.
df.columns

1. **`show_id`** – A unique identifier for each movie or TV show.  
2. **`type`** – Specifies whether the content is a *Movie* or a *TV Show*.  
3. **`title`** – The name of the movie or TV show.  
4. **`director`** – Name of the director(s) who created the movie or show.  
5. **`cast`** – A list of main actors/actresses in the movie or show.  
6. **`country`** – The country where the movie or TV show was produced.  
7. **`date_added`** – The date the content was added to the platform.  
8. **`release_year`** – The year when the movie or TV show was originally released.  
9. **`rating`** – The age classification (e.g., PG, R, TV-MA) assigned to the content.  
10. **`duration`** – The length of the content (e.g., *90 min* for movies, *3 Seasons* for TV shows).  
11. **`listed_in`** – Genres/categories the content belongs to (e.g., *Drama, Comedy*).  
12. **`description`** – A short summary or synopsis of the movie or TV show.  

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Missing-value count per column, before any cleaning.
df.isnull().sum()

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Replace missing director / cast / country values with an explicit placeholder.
# The filled columns are assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary selection
# and, under pandas Copy-on-Write, leaves `df` itself unchanged — which would
# make the later `dropna()` discard every row with a missing credit.
placeholder_columns = ['director', 'cast', 'country']
df[placeholder_columns] = df[placeholder_columns].fillna('Unknown')

In [None]:
# Impute missing ratings with the most frequent rating.
# Assigned back (not `inplace=True` on the column selection) so the fill is
# guaranteed to land in `df` regardless of pandas' Copy-on-Write mode.
df['rating'] = df['rating'].fillna(df['rating'].mode()[0])

In [None]:
# Drop rows that still contain missing values, then renumber the index 0..n-1.
# Without the reset, index labels have gaps and no longer equal row positions,
# which breaks any later code that mixes label lookups with positional arrays
# (e.g. rows of a similarity matrix).
df = df.dropna().reset_index(drop=True)

In [None]:
# Re-check missing values per column after the cleaning steps.
df.isnull().sum()

In [None]:
# Split column labels by storage dtype: object columns vs. 64-bit numeric columns.
object_columns = df.select_dtypes(include='object').columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Each heading is followed by its column Index on the next line.
print("Object type columns:", object_columns, sep="\n")
print("\nNumerical type columns:", numerical_columns, sep="\n")

In [None]:
def classify_features(df, max_categories=40, max_discrete=10):
    """Group the columns of ``df`` by dtype family and cardinality.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    max_categories : int, default 40
        A text column with fewer distinct values than this is "categorical";
        otherwise it is "non-categorical" (free text / identifiers).
    max_discrete : int, default 10
        A numeric column with fewer distinct values than this is "discrete";
        otherwise it is "continuous".

    Returns
    -------
    tuple of four lists of column labels
        ``(categorical, non_categorical, discrete, continuous)``, each in the
        frame's column order. Columns that are neither text nor numeric
        (e.g. datetimes, booleans, categoricals) appear in none of the lists.
    """
    categorical_features = []
    non_categorical_features = []
    discrete_features = []
    continuous_features = []

    dtype_checks = pd.api.types

    for column in df.columns:
        series = df[column]
        dtype = series.dtype

        # Text-like: classic object columns as well as dedicated string dtypes.
        if dtype_checks.is_object_dtype(dtype) or dtype_checks.is_string_dtype(dtype):
            if series.nunique() < max_categories:
                categorical_features.append(column)
            else:
                non_categorical_features.append(column)
        # Any numeric width (int32, float32, nullable Int64, ...), not only
        # int64/float64. Booleans count as numeric in pandas, so exclude them
        # explicitly to keep them unclassified.
        elif dtype_checks.is_numeric_dtype(dtype) and not dtype_checks.is_bool_dtype(dtype):
            if series.nunique() < max_discrete:
                discrete_features.append(column)
            else:
                continuous_features.append(column)

    return categorical_features, non_categorical_features, discrete_features, continuous_features

In [None]:
# Split the cleaned frame's columns into the four feature groups used by the plots below.
categorical, non_categorical, discrete, continuous = classify_features(df)

In [None]:
# Report each feature group under its heading, one line per group.
feature_groups = {
    "Categorical Features": categorical,
    "Non-Categorical Features": non_categorical,
    "Discrete Features": discrete,
    "Continuous Features": continuous,
}
for group_label, group_columns in feature_groups.items():
    print(f"{group_label}:", group_columns)

In [None]:
# For every categorical column: its name, its distinct values, then a blank line.
for col in categorical:
    print(col, df[col].unique(), "", sep="\n")

In [None]:
# For every categorical column: its name, its frequency table, then a blank line.
for col in categorical:
    frequency_table = df[col].value_counts()
    print(col, frequency_table, "", sep="\n")

In [None]:
# One count plot per categorical column, with the count written above each bar.
for i in categorical:
    plt.figure(figsize=(20, 8))
    ax = sns.countplot(x=i, data=df, palette='hls')

    for p in ax.patches:
        height = p.get_height()
        # Bars for empty categories can report NaN; there is nothing to label.
        if np.isnan(height):
            continue
        # Heights can be floats; format as a whole number so labels read
        # "123" rather than "123.0".
        ax.annotate(f'{height:.0f}',
                    xy=(p.get_x() + p.get_width() / 2., height),
                    xytext=(0, 10),
                    textcoords='offset points',
                    ha='center', va='center')

    plt.xticks(rotation=90)
    plt.show()

In [None]:
import plotly.express as px

# One pie chart per categorical column showing the share of each value.
for col in categorical:
    value_share = df[col].value_counts()
    fig = px.pie(
        value_share,
        values=value_share.values,
        names=value_share.index,
        title=f'Distribution of {col}',
    )
    fig.show()

In [None]:
import plotly.graph_objects as go

# Titles per release year, in chronological order.
movie_counts = df['release_year'].value_counts().sort_index()

# Red bars on a dark canvas; the trace colour is set directly on the bar trace.
fig = go.Figure(
    go.Bar(x=movie_counts.index, y=movie_counts.values, marker_color='red')
)
fig.update_layout(
    title='Number of Movies Released Each Year',
    xaxis_title='Year',
    yaxis_title='Number of Movies',
    font_color='white',
    plot_bgcolor='rgb(17, 17, 17)',
    paper_bgcolor='rgb(17, 17, 17)',
)
fig.show()

In [None]:
import plotly.graph_objects as go

# Share of each content type.
movie_type_counts = df['type'].value_counts()

# Slice colours are supplied on the trace itself; layout only handles theming.
fig = go.Figure(
    go.Pie(
        labels=movie_type_counts.index,
        values=movie_type_counts.values,
        marker=dict(colors=['red', 'blue']),
    )
)
fig.update_layout(
    title='Distribution of Content Types',
    font_color='white',
    plot_bgcolor='rgb(17, 17, 17)',
    paper_bgcolor='rgb(17, 17, 17)',
)

fig.show()

In [None]:
import plotly.express as px

# Rank countries by number of titles. The 'Unknown' placeholder that stands in
# for missing country values is removed first so it cannot show up as a
# "country" in the top 10 (errors='ignore' keeps this safe when it is absent).
# NOTE(review): if `country` can hold several comma-separated countries, each
# combination is counted as its own entry here — confirm against the data.
country_tally = df['country'].value_counts()
top_countries = country_tally.drop(labels='Unknown', errors='ignore').head(10)

fig = px.treemap(
    names=top_countries.index,
    parents=["" for _ in top_countries.index],  # flat treemap: every tile hangs off the root
    values=top_countries.values
)

fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',
    paper_bgcolor='rgb(17, 17, 17)',
    font_color='white',
    title='Top Countries with Highest Number of Movies',
)

fig.show()

In [None]:
# Two-column table (Country, Movie Count) built straight from the frequency table.
country_movie_counts = (
    df['country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Movie Count')
)

# Colour scale runs from zero up to the largest per-country count.
highest_count = country_movie_counts['Movie Count'].max()

fig = px.choropleth(
    country_movie_counts,
    locations='Country',
    locationmode='country names',
    color='Movie Count',
    color_continuous_scale='Reds',
    range_color=(0, highest_count),
    labels={'Movie Count': 'Number of Movies'},
    title='Number of Movies Released By Country',
)

fig.update_layout(
    font_color='white',
    plot_bgcolor='rgb(17, 17, 17)',
    paper_bgcolor='rgb(17, 17, 17)',
)

fig.show()

In [None]:
# Frequency table of ratings, computed once and unpacked into labels and counts.
rating_tally = df['rating'].value_counts()
ratings = list(rating_tally.index)
rating_counts = list(rating_tally.values)

fig = go.Figure(go.Bar(x=ratings, y=rating_counts, marker_color='#E50914'))

fig.update_layout(
    title='Movie Ratings Distribution',
    xaxis_title='Rating',
    yaxis_title='Count',
    font_color='white',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0.7)',
)

fig.show()

In [None]:
# Frequency table of duration values, computed once and unpacked into labels and counts.
duration_tally = df['duration'].value_counts()
durations = list(duration_tally.index)
duration_counts = list(duration_tally.values)

fig = go.Figure(go.Bar(x=durations, y=duration_counts, marker_color='#E50914'))

fig.update_layout(
    title='Movie Durations Distribution',
    xaxis_title='Duration',
    yaxis_title='Count',
    font_color='white',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0.7)',
)

fig.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate every non-missing title into one text blob for the word cloud.
titles = df['title'].dropna().values
text = ' '.join(titles)

cloud_builder = WordCloud(background_color='black', colormap='Reds')
wordcloud = cloud_builder.generate(text)

# Render the cloud as an image on an explicit figure/axes pair, with no axes shown.
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 6))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Most Common Words in Netflix Titles', color='black')
plt.show()

In [None]:
# Concatenate every non-missing description into one text blob for the word cloud.
descriptions = df['description'].dropna().values

text = ' '.join(descriptions)

wordcloud = WordCloud(background_color='black', colormap='Reds').generate(text)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
# Title drawn in black, consistent with the titles word cloud: a white title
# is invisible on matplotlib's default white figure background.
plt.title('Most Common Words in Netflix Descriptions', color='black')
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Build one text field per row (title + genres + description, space-separated)
# to serve as the document that gets vectorised below.
df['combined_features'] = df['title'] + " " + df['listed_in'] + " " + df['description']

In [None]:
# Learn a TF-IDF vocabulary from the combined text (English stop words removed)
# and turn every row into a TF-IDF vector; row k of the matrix corresponds to
# the k-th row of `df` by position.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

In [None]:
# Pairwise cosine similarity of every row against every row: entry [i, j]
# compares the i-th and j-th rows of `tfidf_matrix` (positions, not index labels).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
def recommend_movies(title, df, cosine_sim, top_n=10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``.
    df : pandas.DataFrame
        Catalogue with at least ``title``, ``listed_in`` and ``description``
        columns. Its rows must be in the same order as the rows of
        ``cosine_sim``; the index labels themselves may be arbitrary.
    cosine_sim : 2-D array-like
        Square similarity matrix where ``cosine_sim[i][j]`` compares the
        i-th and j-th rows of ``df`` (by position).
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``title``, ``listed_in`` and ``description`` columns of the most
        similar rows, best match first; or an explanatory message string when
        ``title`` is not present in ``df``.
    """
    # Work with row POSITIONS, not index labels: the similarity matrix is
    # positional, and labels stop matching positions as soon as rows have been
    # dropped or filtered from the frame.
    positions = np.flatnonzero((df['title'] == title).to_numpy())

    if positions.size == 0:
        return "Title not found in dataset. Please try another movie/show."

    # If the title occurs more than once, the first occurrence is used.
    query_pos = int(positions[0])
    scores = np.asarray(cosine_sim[query_pos]).ravel()

    # Stable descending sort keeps tied scores in their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly instead of assuming it is ranked first
    # (an identical duplicate row can tie with it at the top).
    ranked = ranked[ranked != query_pos][:top_n]

    return df.iloc[ranked][['title', 'listed_in', 'description']]

In [None]:
# Example query: show the recommendations for "Kota Factory"
# (prints the "not found" message instead if that title is absent from `df`).
print(recommend_movies("Kota Factory", df, cosine_sim))