# Preprocesamiento

In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocesamiento

# Carga de Datos stemming eliminacion de stop words

In [None]:
# Descargar el conjunto de stopwords de nltk si no lo tienes
nltk.download('stopwords')

# Inicializar el Stemmer y las stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Función de preprocesamiento de texto
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar caracteres especiales y números
    text = re.sub(r"[^a-z\s]", "", text)
    # Tokenizar, eliminar stopwords y aplicar stemming
    tokens = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    # Unir los tokens procesados
    return " ".join(tokens)

# Cargar los datos
movies_path = "data/rotten_tomatoes_movies.csv"
reviews_path = "data/rotten_tomatoes_critic_reviews.csv"

movies_df = pd.read_csv(movies_path)
reviews_df = pd.read_csv(reviews_path)

# Preprocesar la columna de críticas
print("Valores nulos en 'review_content':", reviews_df['review_content'].isnull().sum())
reviews_df = reviews_df[reviews_df['review_content'].notnull()]
reviews_df['processed_review'] = reviews_df['review_content'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Valores nulos en 'review_content': 65806


# Vectorizar

## Matriz TF-IDF

In [5]:
# Crear la matriz TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(reviews_df['processed_review'])

# Vincular críticas con títulos de películas
reviews_with_titles = reviews_df.merge(movies_df[['rotten_tomatoes_link', 'movie_title']], 
                                       on='rotten_tomatoes_link', how='left')

# Mostrar las primeras filas para confirmar
print(reviews_with_titles[['movie_title', 'processed_review']].head())

                                         movie_title  \
0  Percy Jackson & the Olympians: The Lightning T...   
1  Percy Jackson & the Olympians: The Lightning T...   
2  Percy Jackson & the Olympians: The Lightning T...   
3  Percy Jackson & the Olympians: The Lightning T...   
4  Percy Jackson & the Olympians: The Lightning T...   

                                    processed_review  
0  fantasi adventur fuse greek mytholog contempor...  
1  uma thurman medusa gorgon coiffur writh snake ...  
2  topnotch cast dazzl special effect tide teen n...  
3  whether audienc get behind lightn thief hard p...  
4  what realli lack lightn thief genuin sens wond...  


# Motor de busqueda 

## Distancia Jaccard y tratamiento de la query (stemming)

In [None]:
# Función para calcular la similitud de Jaccard entre dos conjuntos de palabras
def jaccard_similarity(query, review):
    # Tokenizar las críticas y la consulta en conjuntos de palabras
    query_set = set(query.split())
    review_set = set(review.split())
    
    # Calcular la intersección y la unión de los conjuntos
    intersection = len(query_set.intersection(review_set))
    union = len(query_set.union(review_set))
    
    return intersection / union

# Solicitar la consulta del usuario
user_query = input("Ingresa tu consulta: ")

# Preprocesar la consulta
user_query = user_query.lower()

# Calcular la similitud de Jaccard para cada crítica
reviews_with_titles['jaccard_similarity'] = reviews_with_titles['processed_review'].apply(lambda review: jaccard_similarity(user_query, review))

# Ordenar las críticas por similitud de Jaccard, de mayor a menor
sorted_reviews = reviews_with_titles.sort_values(by='jaccard_similarity', ascending=False)

# Mostrar las primeras 5 críticas más similares
print(sorted_reviews[['movie_title', 'processed_review', 'jaccard_similarity']].head())


                     movie_title            processed_review  \
593655               Out of Time             shrewd thriller   
417573  In the Heat of the Night             superb thriller   
725100             Secret Window  solidbutimperfect thriller   
28065                Dream Lover            satisfi thriller   
25052                    Witness             strike thriller   

        jaccard_similarity  
593655                 0.5  
417573                 0.5  
725100                 0.5  
28065                  0.5  
25052                  0.5  
