# **Setup**

In [1]:
# References:
# https://www.youtube.com/watch?v=7WfoYl-EPtI&list=LL&index=6
# https://www.kaggle.com/code/rushikeshdane20/build-recommendation-system-app-with-streamlit
# https://datagy.io/python-remove-punctuation-from-string/#:~:text=One%20of%20the%20easiest%20ways,maketrans()%20method.
# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://appdividend.com/2020/12/16/python-string-lower-method/#:~:text=Python%20lower()%20function%20is,the%20Python%20upper()%20method.
# https://www.analyticssteps.com/blogs/nltk-python-tutorial-beginners
# https://swatimeena989.medium.com/beginners-guide-for-preprocessing-text-data-f3156bec85ca
# https://www.analyticssteps.com/blogs/what-stemming-and-lemmatization-nlp

import pandas as pd
# One-time environment setup (run manually, not on every execution):
#   pip install nltk
#   import nltk
#   nltk.download()   # fetch the NLTK data used below (e.g. 'stopwords', 'punkt', 'wordnet')
# (`import re` is not needed by the cells visible in this notebook.)
from nltk.corpus import stopwords
# English stop words as a set, so membership tests during cleaning are O(1).
stop_words = set(stopwords.words('english'))
from string import punctuation
# NOTE: rebinds the imported `punctuation` string to a list of its single characters.
punctuation = list(punctuation)
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# One shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

Import data

In [2]:
# Read the combined catalogue; keep the raw frame untouched and do all
# preprocessing on an independent (deep) copy.
df_combined = pd.read_excel(
    "streaming_service_titles.xlsx",
    index_col=False,
)
df_prep = df_combined.copy(deep=True)

In [3]:
# Sanity check: show the row at position 1 to confirm the loaded columns.
df_prep[1:2]

Unnamed: 0,type,title,release_year,rating,genres,description,Streaming Service
1,TV Show,Blood & Water,2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Netflix


# **NLP Process Workflow**

In [None]:
# Tokenization: Splits text into pieces (tokens) and removes punctuation
# Stopword Removal: Removes commonly used words (such as 'the') that are not relevant to the analysis
# Stemming and Lemmatization: Reduce words to their base form so that variants are analyzed as a single item
# P.O.S. Tagging: Tags each word with its part of speech (such as verb or noun) based on its definition and context
# Information Retrieval: Extracts relevant information from the source

In [4]:
def _build_textual_info(rating, genres, description):
    """Build the cleaned text blob for one title.

    The rating, genres and description are joined, lower-cased and tokenized.
    Each token is lemmatized and kept only once, in order of first appearance,
    provided it is neither an English stop word nor a punctuation character.

    Missing values (NaN/None) are skipped; stringifying them would inject the
    literal token "nan" into the text and create spurious similarities between
    unrelated titles.

    Returns the surviving tokens joined by single spaces ("" if none remain).
    """
    present_fields = [str(value) for value in (rating, genres, description) if pd.notna(value)]
    text = " ".join(present_fields).lower()

    seen = set()  # O(1) duplicate check; the list below preserves order
    cleaned_tokens = []
    for token in word_tokenize(text):
        # NOTE(review): pos="a" lemmatizes tokens as adjectives only, so nouns
        # and verbs are left as-is (e.g. "documentaries" stays plural) --
        # confirm this is intended.
        token = lemmatizer.lemmatize(token, pos="a")
        if token in stop_words or token in punctuation or token in seen:
            continue
        seen.add(token)
        cleaned_tokens.append(token)
    return " ".join(cleaned_tokens)


# Build the whole column in one pass instead of reading and writing cells
# through .loc row by row; zipping the columns also works for an empty frame
# or a non-unique index.
df_prep["Textual Info"] = [
    _build_textual_info(rating, genres, description)
    for rating, genres, description in zip(
        df_prep["rating"], df_prep["genres"], df_prep["description"]
    )
]

In [5]:
# Preview the first rows, including the "Textual Info" column built above.
df_prep.head()

Unnamed: 0,type,title,release_year,rating,genres,description,Streaming Service,Textual Info
0,Movie,Dick Johnson Is Dead,2020,PG-13,Documentaries,"As her father nears the end of his life, filmm...",Netflix,pg-13 documentaries father nears end life film...
1,TV Show,Blood & Water,2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Netflix,tv-ma international tv shows dramas mysteries ...
2,TV Show,Ganglands,2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,Netflix,tv-ma crime tv shows international action adve...
3,TV Show,Jailbirds New Orleans,2021,TV-MA,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",Netflix,tv-ma docuseries reality tv feuds flirtations ...
4,TV Show,Kota Factory,2021,TV-MA,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,Netflix,tv-ma international tv shows romantic comedies...


# **Analysis**

In [6]:
# Vectorize the cleaned text with TF-IDF. scikit-learn's own English stop-word
# list is applied on top of the NLTK filtering already done during cleaning.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df_prep["Textual Info"])

In [7]:
# Pairwise similarity between every pair of titles. TfidfVectorizer L2-normalizes
# rows by default, so the plain dot product (linear kernel) equals cosine similarity.
# NOTE: this is a dense n x n array; memory grows quadratically with the catalogue.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Map each title to its row *position* in df_prep / cosine_sim (the lookup is
# used with positional indexing, so positions are stored explicitly).
indices = pd.Series(range(len(df_prep)), index=df_prep["title"])
# Keep only the first occurrence of each title. Series.drop_duplicates() would
# compare the *values* (already unique), leaving duplicate titles in the index
# and making indices[title] return a Series instead of a single position.
indices = indices[~indices.index.duplicated(keep="first")]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose rows/columns follow the row order of
        ``df_prep``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``description`` and ``Streaming Service`` columns of the
        ``top_n`` most similar rows, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # If the same title appears more than once the lookup yields a Series;
    # fall back to its first occurrence so a single row is queried.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = cosine_sim[idx]
    # Exclude the queried row explicitly rather than assuming it sorts first:
    # another row with an identical score (e.g. identical text) could otherwise
    # push the query itself into the results.
    candidates = [i for i in range(len(scores)) if i != idx]
    # Stable sort: ties keep their original row order.
    candidates.sort(key=lambda i: scores[i], reverse=True)
    movie_indices = candidates[:top_n]
    return df_prep[["title", "description", "Streaming Service"]].iloc[movie_indices]

In [8]:
# Example: the ten titles most similar to "Ganglands".
get_recommendations("Ganglands")

Unnamed: 0,title,description,Streaming Service
5305,Narcos,The true story of Colombia's infamously violen...,Netflix
3976,The Eagle of El-Se'eed,A police officer and a drug lord become embroi...,Netflix
9684,Emil and the Detectives,"When a young boy's money is stolen, the search...",Disney Plus
1905,Cold Harbour,When a war breaks out between competing gangs ...,Netflix
424,Chhota Bheem: The Rise of Kirmada,Bheem and young lord Krishna team up to protec...,Netflix
8968,Mission Force One,Five kids form an elite team to protect the un...,Disney Plus
3297,Paradise Beach,"Mehdi gets out of prison, planning to settle o...",Netflix
5113,Bright,"In an LA rife with interspecies tensions, a hu...",Netflix
2087,Santana,Two brothers — one a narcotics agent and the o...,Netflix
3552,The Good Bandit,A near-death experience spurs a feared drug lo...,Netflix
