#### Initial Data Load and Preview

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame; the cells below read from `netflix_data`.
# NOTE(review): the '...' prefix looks like a placeholder directory - confirm the
# real data location before running.
file_path = '.../netflix_movies.csv'
netflix_data = pd.read_csv(file_path)

# Displaying the first few rows of the dataframe as the cell's output
netflix_data.head()

#### Data Cleaning - Checking Missing Values and Data Types

In [None]:
# Number of missing entries in each column (isna is the same check as isnull)
missing_values = netflix_data.isna().sum()

# dtype currently held by each column
data_types = netflix_data.dtypes

# Show both summaries together as the cell output
(missing_values, data_types)


#### Cleaning 'Year' and 'Votes' Fields

In [None]:
# Cleaning 'year': keep the first four-digit year found in each value.
# Casting to str first keeps this cell re-runnable: after one run the column is
# already float, and the .str accessor would raise on it. expand=False makes
# extract() return a Series rather than a one-column DataFrame.
netflix_data['year'] = (
    netflix_data['year']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(float)
)

# Cleaning 'votes': drop thousands separators and convert to float
# (float rather than int so that missing values can stay NaN).
# regex=False treats ',' as a literal character on every pandas version.
netflix_data['votes'] = (
    netflix_data['votes']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Checking the changes
netflix_data[['year', 'votes']].head(), netflix_data.dtypes


#### Dropping Rows with Missing 'votes' or 'rating'

In [None]:
# Dropping rows where 'votes' or 'rating' are missing.
# .copy() makes the cleaned frame an independent object, so later cells that add
# columns to it do not trigger pandas' SettingWithCopyWarning and never write
# back into netflix_data.
netflix_data_cleaned = netflix_data.dropna(subset=['votes', 'rating']).copy()

# Checking the changes: remaining missing values per column and the new shape
netflix_data_cleaned.isnull().sum(), netflix_data_cleaned.shape


### Data Visualization

#### Distribution of Ratings and Votes

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Setting the seaborn style and matplotlib style for plots
sns.set(style="darkgrid")
plt.style.use("dark_background")

# One histogram with a KDE overlay per numeric column.
# `color` is used instead of `palette`: seaborn ignores `palette` (and warns)
# when no `hue` variable is assigned, so the original argument had no effect.
for column, chart_title, axis_label in (
    ("rating", "Distribution of Ratings", "Rating"),
    ("votes", "Distribution of Votes", "Votes"),
):
    plt.figure(figsize=(10, 6))
    sns.histplot(netflix_data_cleaned[column], kde=True, color="lightblue")
    plt.title(chart_title)
    plt.xlabel(axis_label)
    plt.ylabel("Frequency")
    plt.show()


#### The distribution of votes looks off on a linear scale, so we inspect the column and re-plot it on a log scale.

In [None]:
# Show the 'votes' column as the cell output to inspect its current values
netflix_data_cleaned.loc[:, 'votes']


In [None]:
# Re-plotting the distribution of Votes with a logarithmic x-axis.
# Only strictly positive values can be placed on a log axis, so any
# non-positive vote counts are left out of this plot.
positive_votes = netflix_data_cleaned.loc[netflix_data_cleaned['votes'] > 0, 'votes']

plt.figure(figsize=(10, 6))
# `color` replaces `palette`, which seaborn ignores (with a warning) without `hue`.
sns.histplot(positive_votes, kde=True, color="lightblue", log_scale=True)
plt.title("Distribution of Votes (Log Scale)")
plt.xlabel("Votes (Log Scale)")
plt.ylabel("Frequency")
plt.show()


#### Genre Analysis

In [None]:
from collections import Counter
import numpy as np

# Splitting the comma-separated 'genre' strings into one entry per genre.
# explode() replaces .str.split(',').sum(), which concatenates Python lists
# pairwise and is quadratic in the number of rows. strip() removes the
# whitespace around each token so that "Drama" and " Drama" are counted as the
# same genre instead of two.
genre_tokens = (
    netflix_data_cleaned['genre']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
)
# Discard empty tokens left behind by stray commas
genre_list = genre_tokens[genre_tokens != ''].tolist()
genre_counts = Counter(genre_list)

# Converting to a DataFrame for visualization, most frequent genre first
genre_df = pd.DataFrame(list(genre_counts.items()), columns=['Genre', 'Count'])
genre_df = genre_df.sort_values(by='Count', ascending=False)

# Plotting the most common genres
plt.figure(figsize=(12, 8))
sns.barplot(x='Count', y='Genre', data=genre_df.head(10), palette="Blues_r")
plt.title("Top 10 Genres on Netflix")
plt.xlabel("Count")
plt.ylabel("Genre")
plt.show()


#### Yearly Trends Analysis

In [None]:
# Yearly trends: how many titles were released and how they were rated, per year

# One row per release year with the title count and the mean rating;
# 'year' is turned back into a regular column for plotting.
yearly_data = (
    netflix_data_cleaned
    .groupby('year')
    .agg(Count=('title', 'count'), Average_Rating=('rating', 'mean'))
    .reset_index()
)

# Both charts share the same layout and differ only in the plotted column,
# the labels and the line colour.
for y_column, chart_title, y_label, line_color in (
    ('Count', "Number of Releases per Year", "Number of Releases", "lightblue"),
    ('Average_Rating', "Average Ratings per Year", "Average Rating", "lightgreen"),
):
    plt.figure(figsize=(12, 6))
    sns.lineplot(x='year', y=y_column, data=yearly_data, marker='o', color=line_color)
    plt.title(chart_title)
    plt.xlabel("Year")
    plt.ylabel(y_label)
    # One tick per year between the first and the last year in the data
    plt.xticks(np.arange(yearly_data['year'].min(), yearly_data['year'].max()+1, 1.0), rotation=90)
    plt.grid(True, which='both', axis='both', linestyle='-', linewidth=0.5)
    plt.gca().xaxis.grid(False)  # keep only the horizontal grid lines
    plt.show()


#### Rating vs. Votes Correlation

In [None]:
# Relationship between a title's rating and the number of votes it received

# Scatterplot of votes against rating; points are coloured by rating and the
# legend is suppressed.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=netflix_data_cleaned,
    x='rating',
    y='votes',
    hue='rating',
    palette="Blues_r",
    legend=False,
    ax=ax,
)
ax.set_title("Rating vs. Votes")
ax.set_xlabel("Rating")
ax.set_ylabel("Votes")
plt.show()


#### Duration Analysis

In [None]:
# Extracting the first number found in each 'duration' string
duration_text = netflix_data_cleaned['duration']
duration_min = duration_text.str.extract(r'(\d+)', expand=False).astype(float)

# Entries whose text does not mention 'min' are multiplied by 60, following the
# notebook's simplifying assumption about series durations.
# na=True leaves missing durations untouched (their value is NaN either way),
# which matches the original `== False` mask without comparing against False.
is_minutes = duration_text.str.contains('min', regex=False, na=True)
duration_min = duration_min.where(is_minutes, duration_min * 60)

# assign() builds a new frame instead of writing into what may be flagged as a
# slice of netflix_data, so no SettingWithCopyWarning is raised.
netflix_data_cleaned = netflix_data_cleaned.assign(duration_min=duration_min)

# Plotting the distribution of Duration in Minutes
plt.figure(figsize=(12, 6))
# `color` replaces `palette`, which seaborn ignores (with a warning) without `hue`.
sns.histplot(netflix_data_cleaned['duration_min'], kde=True, color="lightblue")
plt.title("Distribution of Duration (in minutes)")
plt.xlabel("Duration (Minutes)")
plt.ylabel("Frequency")
plt.show()

#### Word Clouds for Descriptions and Genres

In [None]:
from wordcloud import WordCloud

# Combining all descriptions into a single string.
# Missing descriptions are skipped and values are cast to str, because
# str.join raises TypeError on a NaN (float) entry.
all_descriptions = ' '.join(netflix_data_cleaned['description'].dropna().astype(str))

# Generating a word cloud from the description text
wordcloud = WordCloud(width=800, height=800,
                      background_color='black',
                      colormap="Blues",
                      min_font_size=10).generate(all_descriptions)

# Plotting the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# Word cloud for genres, sized by how often each genre occurs.
# The frequencies are computed here from the cleaned frame so that the cell
# runs on a fresh kernel; it does not rely on a frame built in another cell.
unique_genre_counts = (
    netflix_data_cleaned['genre']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
)
# Discard empty tokens left behind by stray commas
unique_genre_counts = unique_genre_counts[unique_genre_counts.index != '']

# generate_from_frequencies expects a word -> frequency mapping
unique_genre_wordcloud = WordCloud(
    width=800, height=800, background_color='black', colormap="Blues"
).generate_from_frequencies(unique_genre_counts.to_dict())

# Plotting the WordCloud for genres
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(unique_genre_wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()


#### Text Analysis of Descriptions

In [None]:
from collections import Counter
import re

# Function to clean and split text into words
def clean_and_split(text):
    """Return the lowercase words contained in ``text``.

    Every run of non-word characters (anything the regex class ``\\W`` matches,
    e.g. punctuation and whitespace) is replaced by a single space; the result
    is lowercased and split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase word strings, empty when ``text`` has no word
        characters.
    """
    return re.sub(r'\W+', ' ', text).lower().split()

# Filler words that are excluded from the word counts
filler_words = {
    "the", "of", "and", "or", "a", "an", "in", "to", "for", "with",
    "on", "at", "by", "from", "as", "is", "that", "this", "it", "are",
    "be", "was", "were", "but", "if", "into", "no", "not", "such", "their",
    "then", "there", "these", "they", "which", "who", "will", "your", "his", "her",
    "he", "she", "them", "you", "i", "me", "my", "we", "us", "our",
}

# Tokenize every description and count each word that is not a filler word
word_counts = Counter(
    word
    for description in netflix_data_cleaned['description']
    for word in clean_and_split(description)
    if word not in filler_words
)

# Displaying the 20 most common words with their counts
word_counts.most_common(20)

#### Setting Up the Recommendation System

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The similarity matrix built below is addressed by row position, while the
# title lookup stores index labels. After dropna() the labels have gaps and no
# longer equal the positions, so renumber the rows 0..n-1 to make both agree.
netflix_data_cleaned = netflix_data_cleaned.reset_index(drop=True)

# Data Preparation: Combining genre and description into one text per title.
# Missing parts are replaced by '' first; str.cat would otherwise produce NaN
# for the whole row and the field that is present would be lost.
# assign() returns a new frame, which avoids SettingWithCopyWarning.
netflix_data_cleaned = netflix_data_cleaned.assign(
    combined_features=netflix_data_cleaned['genre'].fillna('').astype(str).str.cat(
        netflix_data_cleaned['description'].fillna('').astype(str), sep=" "
    )
)

# Processing text data: TF-IDF vectors with English stop words removed
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(netflix_data_cleaned['combined_features'])

# Computing the Cosine Similarity matrix (one row and one column per title row)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Lookup from title to row number. A title that occurs in several rows maps to
# several entries, so indices[title] is a scalar for a unique title and a
# Series otherwise. (The former .drop_duplicates() compared the row numbers,
# which are always unique, and therefore removed nothing.)
indices = pd.Series(netflix_data_cleaned.index, index=netflix_data_cleaned['title'])

# Displaying the shape of the TF-IDF matrix and a snippet of the cosine similarity matrix
tfidf_matrix.shape, cosine_sim[:5, :5]


#### Creating the recommendation function

In [None]:
# Recommendation function: Returns titles most similar to the given title

def recommend_titles(title, cosine_sim=cosine_sim, df=netflix_data_cleaned, indices=indices, top_n=10):
    """Return the titles most similar to ``title``.

    Args:
        title: Title to look up; must be present in ``indices``.
        cosine_sim: Square similarity matrix whose rows and columns follow the
            row order of ``df``.
        df: Frame with a 'title' column. Its index is assumed to be unique.
        indices: Series indexed by title whose values are index labels of
            ``df``.
        top_n: Number of recommendations to return (default 10).

    Returns:
        A Series with the 'title' of up to ``top_n`` rows, most similar first.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # A title that occurs several times yields a Series of labels instead of a
    # scalar; use its first occurrence.
    match = indices[title]
    label = match.iloc[0] if isinstance(match, pd.Series) else match

    # cosine_sim is positional, so translate the index label into a row position
    idx = df.index.get_loc(label)

    # Rank every row by its similarity to the query row, best first
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Remove the query row explicitly rather than assuming it is ranked first:
    # another row with identical text ties with it at the top.
    movie_indices = [pos for pos, _ in sim_scores if pos != idx][:top_n]

    # Return the most similar titles
    return df['title'].iloc[movie_indices]

# Trying the recommendation function on "Breaking Bad" and showing the result
recommended_titles = recommend_titles(title="Breaking Bad")
recommended_titles


#### Function encountered errors with values not found in the dataset or duplicate values

##### Confirming Breaking Bad is within the dataset

In [None]:
# Look up "Breaking Bad" in the title index; Series.get returns None when the
# title is absent instead of raising.
breaking_bad_index = indices.get("Breaking Bad")

breaking_bad_index


#### The function kept failing even though the title is present in the dataset

In [None]:
# Calling the recommendation function for "Breaking Bad" once more
recommended_titles_for_breaking_bad = recommend_titles(title="Breaking Bad")
recommended_titles_for_breaking_bad


#### Modified the function to take into account duplicate values.

In [None]:
def recommend_titles_modified(title, cosine_sim=cosine_sim, df=netflix_data_cleaned, indices=indices, top_n=10):
    """Return the titles most similar to ``title``, tolerating duplicate titles.

    Args:
        title: Title to look up; must be present in ``indices``.
        cosine_sim: Square similarity matrix whose rows and columns follow the
            row order of ``df``.
        df: Frame with a 'title' column. Its index is assumed to be unique.
        indices: Series indexed by title whose values are index labels of
            ``df``.
        top_n: Number of recommendations to return (default 10).

    Returns:
        A Series with the 'title' of up to ``top_n`` rows, most similar first.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # The lookup is a Series when the title occurs several times and a plain
    # scalar when it is unique; calling .iloc[0] unconditionally fails on the
    # scalar, so both shapes are handled here.
    match = indices[title]
    label = match.iloc[0] if isinstance(match, pd.Series) else match

    # cosine_sim is positional, so translate the index label into a row position
    idx = df.index.get_loc(label)

    # Rank every row by its similarity to the query row, best first
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Remove the query row explicitly rather than assuming it is ranked first
    movie_indices = [pos for pos, _ in sim_scores if pos != idx][:top_n]
    return df['title'].iloc[movie_indices]


#### Recommendation function results

In [None]:
# Trying the duplicate-aware recommendation function on "Breaking Bad"
recommended_titles_for_breaking_bad = recommend_titles_modified(title="Breaking Bad")
recommended_titles_for_breaking_bad

In [None]:
# Trying the duplicate-aware recommendation function on "Brooklyn Nine-Nine"
recommended_titles_for_brooklyn_nine_nine = recommend_titles_modified(title="Brooklyn Nine-Nine")
recommended_titles_for_brooklyn_nine_nine
