In [17]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

In [18]:
# Replace 'path_to_file' with the actual path to your file
file_path = r'C:\Users\Administrator\Desktop\Final Dataset 2.csv'

In [19]:

# Define a function to normalize text
def normalize_text(text):
    if isinstance(text, str):
        # Lowercase the text
        text = text.lower()
        # Remove special characters
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

# Define a function to tokenize text
def tokenize_text(text):
    if isinstance(text, str):
        # Split the text into words/tokens
        tokens = text.split()
        return tokens
    return text

# Define a function to remove stop words
def remove_stop_words(tokens):
    if isinstance(tokens, list):
        filtered_tokens = [word for word in tokens if word not in stop_words]
        return filtered_tokens
    return tokens

# Define a function to lemmatize tokens
def lemmatize_tokens(tokens):
    if isinstance(tokens, list):
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return lemmatized_tokens
    return tokens

# Combine the specified columns into a single text column
def combine_columns(row):
    combined_text = ' '.join([str(row[col]) for col in columns_to_normalize if isinstance(row[col], str)])
    return combined_text

# Function to recommend similar TV shows based on input title
def recommend_tv_shows(title, cosine_sim_matrix, df, top_n=5):
    # Find the index of the TV show title
    idx = df[df['Title'].str.lower() == title.lower()].index
    if len(idx) == 0:
        return "TV show not found in the dataset. Please try another title."

    # Get the cosine similarity scores for the given TV show
    sim_scores = list(enumerate(cosine_sim_matrix[idx][0]))

    # Sort the TV shows based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top similar TV shows
    sim_scores = sim_scores[1:top_n+1]  # Exclude the first (itself) and take top_n

    # Get the indices of the similar TV shows
    show_indices = [i[0] for i in sim_scores]

    # Return the top similar TV shows
    return df['Title'].iloc[show_indices]



# Read the CSV file into a DataFrame with a specified encoding
df = pd.read_csv(file_path, encoding='latin1')

# Normalize text in relevant columns
columns_to_normalize = ['Title', 'Type', 'Premiere', 'Genre', 'Language', 'Origin', 'Director', 'Cast', 'Description']
for column in columns_to_normalize:
    df[column] = df[column].apply(normalize_text)

# Apply text normalization to all columns
df = df.applymap(normalize_text)

# Apply tokenization to the 'Description' column
df['Description'] = df['Description'].apply(tokenize_text)

# Download the stop words list
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Apply stop word removal to the 'Description' column
df['Description'] = df['Description'].apply(remove_stop_words)

# Define a lemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Apply lemmatization to the 'Description' column
df['Description'] = df['Description'].apply(lemmatize_tokens)

# Combine columns into a single text column
df['Combined_Text'] = df.apply(combine_columns, axis=1)

# Perform TF-IDF on the combined text column
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['Combined_Text'])

# Convert the TF-IDF matrix to a DataFrame for better readability
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Convert the cosine similarity matrix to a DataFrame for better readability
cosine_sim_df = pd.DataFrame(cosine_sim, index=df['Title'], columns=df['Title'])


  df = df.applymap(normalize_text)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:


# Example usage:
input_title = input("Your Input TV Show Title Here")
recommendations = recommend_tv_shows(input_title, cosine_sim, df)
print(f"\nTop Recommendations for '{input_title}':")
print(recommendations)



Your Input TV Show Title HereBreaking BAd

Top Recommendations for 'Breaking BAd':
26          new amsterdam
5                     you
10           kaleidoscope
31    the good bad mother
39          the snow girl
Name: Title, dtype: object


In [17]:

import pickle

new_df = pd.read_csv(file_path, encoding='latin1')

# Serialize and save the DataFrame using pickle
pickle.dump(new_df.to_dict, open('movies_new.pkl', 'wb'))

# Save the cosine similarity matrix using pickle
pickle.dump(cosine_sim_df, open('similarity_new.pk1', 'wb'))

import os
print(os.getcwd())

C:\Users\Administrator\Desktop
