In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [1]:
# Import Pandas
import pandas as pd

# Read the processed feature table from disk into a DataFrame
df = pd.read_csv(
    '../data/processed/features.csv',
    low_memory=False,
)

# Preview the top three rows
df.iloc[:3]

Unnamed: 0,description,genero_principal,subgeneros,tono,estilo_narrativo,temas_clave,influencias_cinematograficas,audiencia_objetivo
0,A young sorceress discovers her dormant powers...,Fantasía,"Aventura, Coming of Age",Mágico y Aventura,Lineal,"Descubrimiento personal, Poder y responsabilid...","Harry Potter, El viaje de Chihiro",Familiar
1,Three siblings discover a mysterious tome in t...,Aventura Fantástica,"Fantasía, Aventura",Mágico,Lineal,"Trabajo en equipo, Descubrimiento personal, Fa...","Las Crónicas de Narnia, El Laberinto del Fauno",Familiar
2,A reluctant apprentice who always shunned magi...,Aventura Fantástica,Fantasía épica,Inspirador,Lineal,"Aceptación del destino, Superación personal, P...","El Señor de los Anillos, Harry Potter",Familiar


In [5]:
def jaccard_similarity(str1, str2):
    """Return the Jaccard index of the word sets of two strings.

    Each string is lower-cased, its commas are turned into spaces, and the
    result is split on whitespace. The score is |A & B| / |A | B|, or 0 when
    both word sets are empty.
    """
    tokens_a, tokens_b = (
        set(text.lower().replace(',', ' ').split()) for text in (str1, str2)
    )
    combined = tokens_a | tokens_b
    if not combined:
        # Nothing to compare: avoid dividing by zero
        return 0
    return len(tokens_a & tokens_b) / len(combined)

def create_similarity_matrices(df):
    """Create similarity matrices for each field and an overall similarity matrix.

    Every column of ``df`` is treated as a text field: each cell is converted
    with ``str()``, lower-cased, has its commas replaced by spaces and is split
    on whitespace (the same tokenisation ``jaccard_similarity`` applies). The
    Jaccard index of the resulting word sets is computed for every pair of rows.

    Rows are addressed by position, so the frame may carry any index
    (filtered, shuffled, non-integer) without affecting the result.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per story, one column per field. Missing values are compared
        as whatever text ``str()`` gives them (e.g. NaN becomes ``'nan'``).

    Returns
    -------
    dict
        Maps each column name to an ``(n, n)`` numpy array of pairwise scores,
        plus the key ``'overall'`` holding the element-wise mean of all the
        per-field arrays. A column literally named ``'overall'`` would be
        overwritten by that mean.
    """
    n_stories = len(df)

    # Dictionary of per-field similarity matrices, keyed by column name
    similarity_matrices = {}

    for field in df.columns:
        # Tokenise each cell once instead of once per pair; .tolist() reads the
        # column positionally, so the index labels are irrelevant.
        token_sets = [
            set(str(value).lower().replace(',', ' ').split())
            for value in df[field].tolist()
        ]

        matrix = np.zeros((n_stories, n_stories))
        for i in range(n_stories):
            for j in range(i, n_stories):
                union = len(token_sets[i] | token_sets[j])
                score = len(token_sets[i] & token_sets[j]) / union if union else 0
                # Jaccard is symmetric: fill both halves from one computation
                matrix[i, j] = matrix[j, i] = score
        similarity_matrices[field] = matrix

    # Overall similarity is the unweighted average over all fields
    overall_matrix = np.mean(list(similarity_matrices.values()), axis=0)
    similarity_matrices['overall'] = overall_matrix

    return similarity_matrices

def get_formatted_matrices(df):
    """Wrap every similarity matrix for ``df`` in a labelled pandas DataFrame.

    Returns a dict with the same keys that ``create_similarity_matrices(df)``
    produces. Each value is a square DataFrame whose rows and columns are both
    labelled 'Story 0', 'Story 1', ... by row position in ``df``.
    """
    raw_matrices = create_similarity_matrices(df)
    labels = [f'Story {position}' for position in range(len(df))]

    return {
        name: pd.DataFrame(values, index=labels, columns=labels)
        for name, values in raw_matrices.items()
    }

# Example usage (kept as comments so the cell does not echo a string as output):
#
#   # Assuming you have your DataFrame 'df' already:
#   matrices = get_formatted_matrices(df)
#
#   # To get a specific matrix:
#   description_matrix = matrices['description']
#   overall_matrix = matrices['overall']
#
#   # To print any matrix with percentage formatting
#   # (pandas >= 2.1 deprecates applymap in favour of DataFrame.map):
#   print(matrices['overall'].applymap(lambda x: f"{x:.2%}"))

'\n# Assuming you have your DataFrame \'df\' already:\nmatrices = get_formatted_matrices(df)\n\n# To get a specific matrix:\ndescription_matrix = matrices[\'description\']\noverall_matrix = matrices[\'overall\']\n\n# To print any matrix with percentage formatting:\nprint(matrices[\'overall\'].applymap(lambda x: f"{x:.2%}"))\n'

In [6]:
# Build the labelled similarity tables: one per field plus the 'overall' average
matrices = get_formatted_matrices(df)
# Pull two of the tables out by key
description_matrix, overall_matrix = matrices['description'], matrices['overall']

In [11]:
# First 15 rows, transposed so those stories become the columns
overall_matrix.iloc[:15].T

Unnamed: 0,Story 0,Story 1,Story 2,Story 3,Story 4,Story 5,Story 6,Story 7,Story 8,Story 9,Story 10,Story 11,Story 12,Story 13,Story 14
Story 0,1.0,0.375431,0.344742,0.488003,0.341288,0.431596,0.473474,0.333991,0.30303,0.463145,0.330114,0.502201,0.465022,0.390814,0.304067
Story 1,0.375431,1.0,0.452763,0.329377,0.455473,0.344298,0.379165,0.391808,0.168056,0.254446,0.308176,0.347447,0.365327,0.259028,0.12931
Story 2,0.344742,0.452763,1.0,0.331731,0.473838,0.283004,0.48683,0.381079,0.139622,0.363058,0.350973,0.399966,0.334273,0.186004,0.167293
Story 3,0.488003,0.329377,0.331731,1.0,0.331208,0.458011,0.496624,0.323027,0.266785,0.414238,0.567708,0.479374,0.708025,0.352804,0.328704
Story 4,0.341288,0.455473,0.473838,0.331208,1.0,0.282209,0.34095,0.31703,0.078125,0.352904,0.284474,0.416721,0.300595,0.166165,0.14798
Story 5,0.431596,0.344298,0.283004,0.458011,0.282209,1.0,0.425954,0.294886,0.290959,0.388753,0.323947,0.455026,0.443866,0.354379,0.282626
Story 6,0.473474,0.379165,0.48683,0.496624,0.34095,0.425954,1.0,0.545833,0.267411,0.476815,0.357386,0.555496,0.49534,0.342187,0.288938
Story 7,0.333991,0.391808,0.381079,0.323027,0.31703,0.294886,0.545833,1.0,0.133259,0.282468,0.444196,0.325329,0.323864,0.19375,0.165341
Story 8,0.30303,0.168056,0.139622,0.266785,0.078125,0.290959,0.267411,0.133259,1.0,0.491204,0.159035,0.284167,0.269851,0.519838,0.384615
Story 9,0.463145,0.254446,0.363058,0.414238,0.352904,0.388753,0.476815,0.282468,0.491204,1.0,0.312405,0.625393,0.42405,0.377083,0.269052
