In [1]:
# Import necessary libraries
import numpy as np
import ast
import psycopg2
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Connect to PostgreSQL database
conn = psycopg2.connect(
    dbname="manga_db",
    user="user_master",
    password="PassMaster97",
    host="localhost",
    port="5432"
)
cur = conn.cursor()

In [3]:
# Fetch data from the database
df = pd.read_sql("""
    SELECT DISTINCT ON (mb.original_title)
    mb.id,
    mb.title,
    mb.is_editorial,
    mb.original_title,
    me.embeddings_synopsis_tags,
    me.embeddings_features
    FROM mangas_base mb
    JOIN mangas_embeddings me ON mb.id = me.manga_id
    WHERE me.embeddings_synopsis_tags IS NOT NULL
    AND me.embeddings_features IS NOT NULL
    ORDER BY mb.original_title, mb.is_editorial DESC;
""", conn)

  df = pd.read_sql("""


In [4]:
# Function to safely evaluate string representations of lists
def safe_eval(val):
    if pd.isna(val) or val is None:
        return np.array([]) 
    try:
        return np.array(ast.literal_eval(val))
    except Exception as e:
        print(f"❌ Error con valor: {val}\n{e}")
        return np.array([])

# Apply the safe_eval function to the embeddings columns
df["emb_syn_tags_np"] = df["embeddings_synopsis_tags"].apply(safe_eval)
df["emb_features_np"] = df["embeddings_features"].apply(safe_eval)

# Filter the DataFrame to only include rows with non-empty embeddings
df_filtered = df[
    (df["emb_syn_tags_np"].apply(lambda x: x.size > 0)) &
    (df["emb_features_np"].apply(lambda x: x.size > 0))
]


In [5]:
# 1. Separate the DataFrame into two based on the 'is_editorial' column
df_editorial = df_filtered[df_filtered["is_editorial"] == True].copy()
df_no_editorial = df_filtered[df_filtered["is_editorial"] == False].copy()

In [6]:

# 2. Convert the embeddings to numpy arrays
X_editorial_tags = np.vstack(df_editorial["emb_syn_tags_np"].values)
X_otros_tags = np.vstack(df_no_editorial["emb_syn_tags_np"].values)

In [7]:
# 3. Calculate the cosine similarity matrix
sim_matrix_tags = cosine_similarity(X_editorial_tags, X_otros_tags)

In [8]:
# 4. Get the indices of the most similar pairs
idx_editorial = df_editorial.index
idx_otros = df_no_editorial.index

In [13]:
# IDs de los mangas
id_editorial = 114176
id_no_editorial = 116477

# Paso 1: Obtener la posición del manga editorial (fila)
row_idx = df_editorial[df_editorial["id"] == id_editorial].index[0]
row_pos = list(idx_editorial).index(row_idx)

# Paso 2: Obtener la posición del manga no editorial (columna)
col_idx = df_no_editorial[df_no_editorial["id"] == id_no_editorial].index[0]
col_pos = list(idx_otros).index(col_idx)

# Paso 3: Obtener la fila de similitudes del manga editorial
sim_row = sim_matrix_tags[row_pos]

# Paso 4: Ordenar las similitudes en orden descendente y obtener posiciones
sorted_indices = np.argsort(sim_row)[::-1]

# Paso 5: Obtener la posición del manga no editorial
position = np.where(sorted_indices == col_pos)[0][0] + 1  # +1 para ranking 1-based

print(f"El manga con ID {id_no_editorial} está en la posición {position} respecto al manga editorial {id_editorial}.")

El manga con ID 116477 está en la posición 22 respecto al manga editorial 114176.


In [9]:
# 5. Create a DataFrame to store the results
top_n = 10
resultados_tags = []

for i, idx in enumerate(idx_editorial):
    nombre_editorial = df_filtered.loc[idx, "title"]
    similares_idx = idx_otros[sim_matrix_tags[i].argsort()[::-1][:top_n]]
    
    fila = {"ID": df_filtered.loc[idx, "id"], "Obra Editorial": nombre_editorial}
    for j, sim_idx in enumerate(similares_idx, 1):
        fila[f"Similar {j}"] = df_filtered.loc[sim_idx, "title"]
        fila[f"ID {j}"] = df_filtered.loc[sim_idx, "id"]
    
    resultados_tags.append(fila)

df_similares_tags = pd.DataFrame(resultados_tags)

In [10]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df_similares_tags

Unnamed: 0,ID,Obra Editorial,Similar 1,ID 1,Similar 2,ID 2,Similar 3,ID 3,Similar 4,ID 4,Similar 5,ID 5,Similar 6,ID 6,Similar 7,ID 7,Similar 8,ID 8,Similar 9,ID 9,Similar 10,ID 10
0,114084,#DRCL – Midnight Children,Yasha,139796,Rengoku Jima: Kyokuchi Renai,137167,DARK MOON: Darui Jedan,171172,Deathmon,140045,#DRCL midnight children,137678,Burning Hell: Kami no Kuni,131713,Mother Ghoul,141695,MoMo: the blood taker,135383,Ingoshima,135522,SHOCK UP!,140043
1,114152,Love at Fourteen,Love Train,175168,Gomen ne Hatsukoi,152513,Itoi-kun to Hiyama-kun no Koi,125285,Senpai wa Otokonoko,129780,Sugars,146723,Chime,150548,Soushi Souai,149105,Tomokoi,150995,Dounika Naru Hibi,116151,Ano Ko ni 1000%,143974
2,114045,7000 millones de agujas,70 Oku no Hari,131560,Dead Dead Demon's Dededededestruction,133506,Yoru no Shita Machiawase,150943,Ore no Gikei ga UFO Yobou to Shiterun Desu kedo,121802,Bara no Oniwa no Kemono-tachi,122228,Ware Ware wa,141959,Aoi-chan Panic!,145166,Follower ga Kanojo ni Narimashita,151331,Jiya,131844,Uchuu Ryuugakusei Moru,139397
3,114115,"Hotel, Historias cortas de Boichi",Chikyuu Hyoukai Jiki,135363,Sayonara Watashitachi,145776,Dogs,130902,"49-Sai, Hitou Hitoritabi",129863,Sweet Home,168158,2015 Uju Teukjib Danpyeon,167689,Tabi to Gohan to Shuumatsu Sekai,158984,Hitokui Mansion to Ooya no Maison,163760,Gigi Goegoe,168348,Byeollara Gongjunim,173287
4,114233,CANIS -Dear Hatter-,Canis: Dear Hatter,120725,Canis: Dear Mr. Rain,115127,Marcello Storia,151266,Tokyo Apparel Girl,153616,Kodihaedeurimnida,168166,Okyakusama ni Koishite mo Ii desu ka?,125441,My Real Dress,154694,Otomegokoro ni Koigokoro,118563,"Boukensha no Fuku, Tsukurimasu!: Isekai de Haj...",151389,Shouboushi-san to BL Kissa,124906
5,114235,CANIS -Dear Mr.Rain-,Canis: Dear Mr. Rain,115127,Lonely Ginger no Okiniiri,124158,Dachi no Imouto,166029,Yaneura no Neko to Nidome no Hatsukoi,125952,Ore to Renai Shite Mimasen ka,124742,Koi to Yobu ni wa Kimochi Warui,127381,Tatoeba Ame ga Futtanara,121222,Koisuru Gokudou Onii-san,125032,Tayutau Kohaku,124181,Kaze no Iro made Oboeteru,125931
6,114237,CANIS -The Speaker-,Canis: The Speaker,120726,Ore no Kimochi ga Wakaru ka,119786,10th,154513,Ieneko Nora wa Oogataken ga Nigate,121430,Ookami no Ketsuzoku,117374,Yaneura no Neko to Nidome no Hatsukoi,125952,Melodramatic Fiction,125846,Kami-sama no Ude no Naka,115332,Yamada to Shounen,119966,Beelzebub,155637
7,114113,Hideout,HIDEOUT,132623,Inganmaljong,175895,Muneojin Yeoche,173867,Ppaesneun Namja,175194,Jaesaeng,170506,Eorin Gajeongbu,173661,Shikou Shounen,126446,Kusakabe-san,136407,Geuligo Namja,169048,Umi de Hirotta Kimi ni Sasagu,120038
8,114261,MADK,Devilman,158689,Mawangnimui Ijungsaenghwal,169819,Devilman Tai Yami no Teiou: DEVILMAN vs HADES,134212,Yuujin ga Yuusha,117949,Mato Seihei no Slave,159209,Rakuin no Kemonobito,139603,Akuma wa Rosario ni Kiss wa Suru,140213,Kimetsu no Yaiba,157770,Gekka Kajin LUMEN LUNAE,114369,Dororo,158913
9,114139,La música de Marie,"Youkihi, Kirara",152892,36°C Rebellion,167170,Onryou Juusanya,140747,Ryuu no Gakkou wa Yama no Ue,127032,Pygmalio,149111,Shinigami Tsuki no Amamiya-san,150134,Tsubasa Aru Mono,145754,Akai Yuki,131601,Guu-sama ni Ashita ga Kuru You ni,153854,Honzuki no Gekokujou: Shisho ni Naru Tame ni w...,130652


In [11]:
# 1. Convert the editorial features to numpy arrays
X_editorial_feat = np.vstack(df_editorial["emb_features_np"].values)
X_otros_feat = np.vstack(df_no_editorial["emb_features_np"].values)

In [12]:
# 2. Calculate the cosine similarity matrix
sim_matrix_feat = cosine_similarity(X_editorial_feat, X_otros_feat)

In [13]:
# 3. Create a DataFrame to store the results
resultados_feat = []

for i, idx in enumerate(idx_editorial):
    nombre_editorial = df_filtered.loc[idx, "title"]
    similares_idx = idx_otros[sim_matrix_feat[i].argsort()[::-1][:top_n]]
    
    fila = {"Obra Editorial": nombre_editorial}
    for j, sim_idx in enumerate(similares_idx, 1):
        fila[f"Similar {j}"] = df_filtered.loc[sim_idx, "title"]
        fila[f"ID {j}"] = df_filtered.loc[sim_idx, "id"]
    
    resultados_feat.append(fila)

df_similares_feat = pd.DataFrame(resultados_feat)

In [14]:
df_similares_feat

Unnamed: 0,Obra Editorial,Similar 1,ID 1,Similar 2,ID 2,Similar 3,ID 3,Similar 4,ID 4,Similar 5,ID 5,Similar 6,ID 6,Similar 7,ID 7,Similar 8,ID 8,Similar 9,ID 9,Similar 10,ID 10
0,#DRCL – Midnight Children,Marie Alex,170034,Ireumeul Matchumyeo,170014,Houkago no Kuroi-san,162244,Kanojo to Boku no Endroll,162304,Samadhi,170396,Kimi to Picopico,162308,Dramatic-age Saranghaejwo,169439,Bamman Jal Sajuneun Isanghan Isanim,169459,Danjjagui Gyeonggye,169665,Red Hood,160068
1,Love at Fourteen,GUNNM: Kasei Senki,133612,Koiwazurai no Ellie,149059,Danchigai,133588,Ani no Yome to Kurashiteimasu.,134786,Arakawa Under the Bridge,131518,Persona 4,140458,Persona 3,130974,Futaribeya,141591,Omairi Desu yo,115051,Baraou no Souretsu,147886
2,7000 millones de agujas,Dear Diary,167397,TO-mA,139233,Fujishiro-san-kei.,148328,Sasanaki,155953,Mozart wa Komoriuta wo Utawanai,153410,Gyon-Woo & Jik-Nyu,167661,Tactics Ogre,155498,Ero Ero Sos!,167934,Pride (Knock),168065,Wandal Wandering!,155968
3,"Hotel, Historias cortas de Boichi",Shin Yami no Koe: Kaidan,127764,Starlike Words,117553,Yeol-Ahobseumulhana,167437,Junketsu Drop,115014,Kimi ni Shika Kikoenai,155722,Bus Hashiru,131100,"Kasa no Shita, Futari",117720,Otokogokoro,118232,Kimi Note,117203,Ginga Patrol Jako,157387
4,CANIS -Dear Hatter-,Canis: Dear Hatter,120725,Paradise View,118758,Shin Kirari,129230,3 Ban Sen no Campanella,120343,Inai Boku wa Hotarumachi ni Iru,128606,Shinobeba Koi,118575,Yukidoke no Koi,119436,Kimi to Kore kara,118558,Shin Getter Robo,155562,Dajeonghan Gyeoul,167575
5,CANIS -Dear Mr.Rain-,Canis: Dear Mr. Rain,115127,Hana no Miyako de,118083,O.B.,115021,Love My Life,125086,Shibito no Koiwazurai,129092,Kani ni Sasowarete,127360,"Kowagaranai de, Soba ni Ite",118574,Gekijouban Mahou Shoujo Madoka★Magica: Shinpen...,165460,Shin Getter Robo,155562,Doukyuusei: Gold Ring & Silver Ring,120188
6,CANIS -The Speaker-,Canis: The Speaker,120726,Anti Romance,119216,Subaru to Suu-san,137570,Sword Art Online: Phantom Bullet,133398,Castle Mango,114842,Hitoribocchi wa Samishikute,145356,Anata to Taberu Shiawase: Maki to Hanazawa,153956,Harenochi Shikibu,119593,Troll Trap,168167,Meguro to Akino wa Kizukanai,120534
7,Hideout,Yoake no Yami ni Nemuru Koi,118767,Ai wa Ichiya ni Shite,126447,Akapoli,139934,Change H purple,137928,Daikanojo,139421,Change H white,137929,Yume Yume Utsutsu,164541,Mitsurugi Haruka Kiki Ippatsu!,140783,Shounen Ai no Bigaku EX,118731,Oshioki Ouji,147753
8,MADK,Tteugeopge Anajwo,168536,Boku no Omawari-san,119532,Red Candy,168626,Bungou Stray Dogs: Dead Apple,136864,King's Maker: Triple Crown,168708,Define the Relationship,170880,Dear Door,168378,Bring the Love,168878,Ouji-sama Nante Iranai,164648,Mother's Spirit,118553
9,La música de Marie,GOGO Monster,135752,Double House,126231,J no Subete,116963,Haruka na Machi e,131008,Private Opinion: Banana Fish Another Story,145701,Glass no Kamen,143184,Higurashi no Naku Koro ni: Tatarigoroshi-hen,155189,Nekosogi Radical,124914,pink,126861,Hotarubi no Mori e,143406


In [None]:
emb_obra1 = df_filtered.loc[df_filtered["id"] == 114114, "emb_syn_tags_np"].values[0]
emb_obra2 = df_filtered.loc[df_filtered["title"] == "Aharen-san wa Hakarenai", "emb_syn_tags_np"].values[0]

cos_sim = cosine_similarity([emb_obra1], [emb_obra2])[0][0]
print(f"Similitud del coseno: {cos_sim:.4f}")

Similitud del coseno: 0.7224


In [None]:
# Paso 1: Crear la matriz de similitud (si no la tienes ya)
# sim_matrix_tags = cosine_similarity(X_editorial_tags, X_otros_tags)

# Paso 2: Calcular media y desviación estándar de toda la matriz
mean_sim = sim_matrix_tags.mean()
std_sim = sim_matrix_tags.std()

# Paso 3: Calcular la matriz de Z-scores
sim_matrix_tags_zscore = (sim_matrix_tags - mean_sim) / std_sim

# Paso 4: Seleccionar los IDs de los mangas
id_editorial = 114176
id_no_editorial = 143933

# Paso 5: Localizar las posiciones de esos IDs en los índices originales
row_idx = df_editorial[df_editorial["id"] == id_editorial].index[0]
row_pos = list(idx_editorial).index(row_idx)

col_idx = df_no_editorial[df_no_editorial["id"] == id_no_editorial].index[0]
col_pos = list(idx_otros).index(col_idx)

# Paso 6: Extraer el Z-score de similitud entre los dos mangas
z_score_similarity = sim_matrix_tags_zscore[row_pos, col_pos]

# Paso 7: Extraer también la posición en el ranking original (sin z-score, para referencia)
sim_row = sim_matrix_tags[row_pos]
sorted_indices = np.argsort(sim_row)[::-1]
ranking_position = np.where(sorted_indices == col_pos)[0][0] + 1

# Resultado final
print(f"📌 Similitud Z-score entre {id_editorial} y {id_no_editorial}: {z_score_similarity:.4f}")
print(f"🏅 Posición en ranking (sin normalizar): {ranking_position}")

🔄 Similaridad original (coseno) entre 114176 y 170481: 0.7102
📊 Similaridad normalizada [0-1]: 0.9919
📌 Similitud Z-score: 2.9880
🏅 Posición en ranking: 2 de 58945
📈 Percentil: 100.00%


- z ≈ 0: Similitud normal.

- z > 1: Notablemente más similar de lo normal.

- z > 2: Muy similar — destaca.

- z < 0: Menos similar de lo que se esperaría.