In [7]:
import re

import numpy as np

import pandas as pd

import spacy

from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def clean_text(text):
    """
    Nettoyage du dataset: text en miniscule, suppression des chiffres et de tout caractere autre que les lettres
    """

    text = text.lower()

    text = re.sub(r'\d+', '', text)
    
    return " ".join(re.split('\W+', text))

#pour la lemmatisation et stopwords
#le dataset lemmatisé ou pas / stopwords ou pas ne change pas le résultat
nlp = spacy.load("en_core_web_sm") #en_core_web_trf

Pour le choix de la vectorizaction: [Sentence Transformer](https://www.sbert.net/)
Il s'agit de vectorizer le text avec la tecnique appellée Word Embedding
Une description du modèle choisi se trouve [ici](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

In [9]:
model = SentenceTransformer("all-MiniLM-L6-v2")

def embedded_text(df, text):

    #Créaction d'un array dans la colonne vectorizée
    embed_mat = np.array([x for x in df["embedded"]])

    #vectorisation de l'input - le film qui l'utilisateur tapera
    embedding = model.encode(text)
    
    #Créer l'embedding pour l'input et le répliquer le le nombre de fois la taille du dataset
    m = np.array([embedding]) * len(df)
    
    #calculer la distance entre les vecteurs de l'input et de chaque ligne du dataset
    sim_mat = cosine_similarity(m, embed_mat)

    #créer une collone pour afficher les similarités
    df['sim_score'] = sim_mat[0]

    #afficher les résultats par ordre décroissant
    similarity_results = df.sort_values('sim_score', ascending = False)

    return similarity_results

In [10]:
df = pd.read_csv("all_movies.csv")

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,cast,directors,fullplot,fullplot.1,genres,title,plot
0,0,"[""Charles Kayser"",""John Ott""]","[""William K.L. Dickson""]",A stationary camera looks at a large anvil wit...,A stationary camera looks at a large anvil wit...,"[""Short""]",Blacksmith Scene,Three men hammer on an anvil and pass a bottle...
1,1,"[""A.C. Abadie"",""Gilbert M. 'Broncho Billy' And...","[""Edwin S. Porter""]",Among the earliest existing films in American ...,Among the earliest existing films in American ...,"[""Short"",""Western""]",The Great Train Robbery,A group of bandits stage a brazen train hold-u...
2,2,"[""Martin Fuller"",""Mrs. William Bechtel"",""Walte...","[""Harold M. Shaw""]","Thanks to the Fresh Air Fund, a slum child esc...","Thanks to the Fresh Air Fund, a slum child esc...","[""Short"",""Drama"",""Fantasy""]",The Land Beyond the Sunset,"A young boy, opressed by his mother, goes on a..."
3,3,"[""Frank Powell"",""Grace Henderson"",""James Kirkw...","[""D.W. Griffith""]","A greedy tycoon decides, on a whim, to corner ...","A greedy tycoon decides, on a whim, to corner ...","[""Short"",""Drama""]",A Corner in Wheat,"A greedy tycoon decides, on a whim, to corner ..."
4,4,"[""Winsor McCay""]","[""Winsor McCay"",""J. Stuart Blackton""]",Cartoonist Winsor McCay agrees to create a lar...,Cartoonist Winsor McCay agrees to create a lar...,"[""Animation"",""Short"",""Comedy""]","Winsor McCay, the Famous Cartoonist of the N.Y...","Cartoon figures announce, via comic strip ball..."


In [12]:
df = df[["cast","directors","fullplot","genres","title","plot"]]

In [13]:
df.head()

Unnamed: 0,cast,directors,fullplot,genres,title,plot
0,"[""Charles Kayser"",""John Ott""]","[""William K.L. Dickson""]",A stationary camera looks at a large anvil wit...,"[""Short""]",Blacksmith Scene,Three men hammer on an anvil and pass a bottle...
1,"[""A.C. Abadie"",""Gilbert M. 'Broncho Billy' And...","[""Edwin S. Porter""]",Among the earliest existing films in American ...,"[""Short"",""Western""]",The Great Train Robbery,A group of bandits stage a brazen train hold-u...
2,"[""Martin Fuller"",""Mrs. William Bechtel"",""Walte...","[""Harold M. Shaw""]","Thanks to the Fresh Air Fund, a slum child esc...","[""Short"",""Drama"",""Fantasy""]",The Land Beyond the Sunset,"A young boy, opressed by his mother, goes on a..."
3,"[""Frank Powell"",""Grace Henderson"",""James Kirkw...","[""D.W. Griffith""]","A greedy tycoon decides, on a whim, to corner ...","[""Short"",""Drama""]",A Corner in Wheat,"A greedy tycoon decides, on a whim, to corner ..."
4,"[""Winsor McCay""]","[""Winsor McCay"",""J. Stuart Blackton""]",Cartoonist Winsor McCay agrees to create a lar...,"[""Animation"",""Short"",""Comedy""]","Winsor McCay, the Famous Cartoonist of the N.Y...","Cartoon figures announce, via comic strip ball..."


In [14]:
#remove Nan
df = df.dropna()

In [16]:
df["text"] = df["cast"] + df["directors"] + df["fullplot"] +df["genres"] + df["title"] + df["plot"]

In [17]:
#appliquer le nettoyage du texte
df["text_clean"] = df["text"].apply(lambda row: clean_text(row))

In [18]:
df.head()

Unnamed: 0,cast,directors,fullplot,genres,title,plot,text,text_clean
0,"[""Charles Kayser"",""John Ott""]","[""William K.L. Dickson""]",A stationary camera looks at a large anvil wit...,"[""Short""]",Blacksmith Scene,Three men hammer on an anvil and pass a bottle...,"[""Charles Kayser"",""John Ott""][""William K.L. Di...",charles kayser john ott william k l dickson a...
1,"[""A.C. Abadie"",""Gilbert M. 'Broncho Billy' And...","[""Edwin S. Porter""]",Among the earliest existing films in American ...,"[""Short"",""Western""]",The Great Train Robbery,A group of bandits stage a brazen train hold-u...,"[""A.C. Abadie"",""Gilbert M. 'Broncho Billy' And...",a c abadie gilbert m broncho billy anderson g...
2,"[""Martin Fuller"",""Mrs. William Bechtel"",""Walte...","[""Harold M. Shaw""]","Thanks to the Fresh Air Fund, a slum child esc...","[""Short"",""Drama"",""Fantasy""]",The Land Beyond the Sunset,"A young boy, opressed by his mother, goes on a...","[""Martin Fuller"",""Mrs. William Bechtel"",""Walte...",martin fuller mrs william bechtel walter edwi...
3,"[""Frank Powell"",""Grace Henderson"",""James Kirkw...","[""D.W. Griffith""]","A greedy tycoon decides, on a whim, to corner ...","[""Short"",""Drama""]",A Corner in Wheat,"A greedy tycoon decides, on a whim, to corner ...","[""Frank Powell"",""Grace Henderson"",""James Kirkw...",frank powell grace henderson james kirkwood l...
4,"[""Winsor McCay""]","[""Winsor McCay"",""J. Stuart Blackton""]",Cartoonist Winsor McCay agrees to create a lar...,"[""Animation"",""Short"",""Comedy""]","Winsor McCay, the Famous Cartoonist of the N.Y...","Cartoon figures announce, via comic strip ball...","[""Winsor McCay""][""Winsor McCay"",""J. Stuart Bla...",winsor mccay winsor mccay j stuart blackton c...


In [19]:
#appliquer le nettoyage du texte
df["text_clean"] = df["text"].apply(lambda row: clean_text(row))

In [20]:
#créer l'embedding 
df["embedded"] = df["text_clean"].apply(lambda row: model.encode(row))

In [21]:
#export le dataset embedded (car l'embedding prend assez de temps)
df.to_csv("movies_embedded.csv")

**Tests**

In [22]:
test = embedded_text(df, "The Godfather")

In [23]:
res = test[["title","plot","sim_score"]]

In [24]:
res.head(15)

Unnamed: 0,title,plot,sim_score
2828,The Godfather,The aging patriarch of an organized crime dyna...,0.667662
5771,The Godfather: Part III,In the midst of trying to legitimize his busin...,0.647936
3055,The Godfather: Part II,The early life and career of Vito Corleone in ...,0.614185
17826,I Knew It Was You: Rediscovering John Cazale,A portrait of the acting craft of John Cazale ...,0.605067
8310,Jane Austen's Mafia!,Takeoff on the Godfather with the son of a maf...,0.527654
2343,The Brotherhood,The son of a powerful Mafia don comes home fro...,0.519522
17333,Neon Flesh,"Hoping to earn his mother's respect, a young h...",0.517793
13267,C(r)ook,A killer for the Russian Mafia in Vienna wants...,0.509353
5762,The Freshman,Clark Kellogg is a young man starting his firs...,0.501665
3189,The Passenger,"A frustrated war correspondent, unable to find...",0.500335
