In [23]:
import os, re
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# EXPLORACIÓN DE DATOS

In [24]:
def extract_year(title: str):
    """Return the year from a trailing ``(YYYY)`` suffix of a title.

    Leading and trailing whitespace is ignored. Returns ``np.nan`` when
    ``title`` is not a string or does not end in a parenthesised
    four-digit number.
    """
    if isinstance(title, str):
        match = re.search(r"\((\d{4})\)$", title.strip())
        if match:
            return int(match.group(1))
    return np.nan

def split_genres(s: str):
    """Split a pipe-separated genre string into a list of genre names.

    A missing value or an empty string yields the single placeholder
    ``"(no genres listed)"``.
    """
    is_missing = pd.isna(s) or s == ""
    return ["(no genres listed)"] if is_missing else s.split("|")

def describe_df(name, df: pd.DataFrame, head_n=5):
    """Print a short textual summary of a DataFrame.

    The summary consists of a banner with ``name``, the shape, the list of
    column names, the first ``head_n`` rows and the null count per column.
    Nothing is returned.
    """
    column_names = list(df.columns)
    preview = df.head(head_n)
    null_counts = df.isna().sum()

    print(f"\n===== {name} =====")
    print("shape:", df.shape)
    print("columnas:", column_names)

    print(f"\n{head_n} primeras filas:")
    print(preview)

    print("\nNulos por columna:")
    print(null_counts)



## Cargamos la data

In [25]:
# Input CSV locations.
# NOTE(review): ML_MOVIES is an absolute path while the other two are
# relative — confirm it matches the environment the notebook runs in.
TRAIN = "movies_train.csv"
TEST  = "movies_test.csv"
ML_MOVIES = "/content/movies.csv"

# Fail fast, naming the first missing file, before anything is read.
for required_path in (TRAIN, TEST, ML_MOVIES):
    if not os.path.exists(required_path):
        raise FileNotFoundError(required_path)

train, test, ml_movies = (
    pd.read_csv(csv_path) for csv_path in (TRAIN, TEST, ML_MOVIES)
)

## Revisamos los datos

### Train

In [26]:
# Print shape, column names, first rows and per-column null counts of the training set.
describe_df("TRAIN", train)



===== TRAIN =====
shape: (6819, 23)
columnas: ['movieId', 'title', 'genres', '(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

5 primeras filas:
   movieId                             title          genres  \
0      619                         Ed (1996)          Comedy   
1    33826                Saint Ralph (2004)    Comedy|Drama   
2     1298       Pink Floyd: The Wall (1982)   Drama|Musical   
3   140289              Men & Chicken (2015)    Comedy|Drama   
4     3064  Poison Ivy: New Seduction (1997)  Drama|Thriller   

   (no genres listed)  Action  Adventure  Animation  Children  Comedy  Crime  \
0                   0       0          0          0         0       1      0   
1                   0       0          0          0         0       1      0   
2                   0       0          0          

### Test

In [27]:
# Print shape, column names, first rows and per-column null counts of the test set.
describe_df("TEST", test)



===== TEST =====
shape: (2923, 2)
columnas: ['movieId', 'title']

5 primeras filas:
   movieId                                   title
0    45635       Notorious Bettie Page, The (2005)
1     1373  Star Trek V: The Final Frontier (1989)
2     7325                  Starsky & Hutch (2004)
3      389              Colonel Chabert, Le (1994)
4     8920                Country Girl, The (1954)

Nulos por columna:
movieId    0
title      0
dtype: int64


### Movies

In [28]:
# Describe a seeded random sample of at most 5 rows of the MovieLens table.
# The printed shape and null counts therefore refer to the sample, not to the whole file.
describe_df("MOVIELENS movies.csv", ml_movies.sample(min(5, len(ml_movies)), random_state=42))


===== MOVIELENS movies.csv =====
shape: (5, 3)
columnas: ['movieId', 'title', 'genres']

5 primeras filas:
       movieId                             title  \
4884      4990  Jimmy Neutron: Boy Genius (2001)   
22971   116698              Dead Men Tell (1941)   
26257   125517                   The D.I. (1957)   
57524   196541         Makar - Pathfinder (1984)   
39134   156511              Feudin' Fools (1952)   

                                    genres  
4884   Adventure|Animation|Children|Comedy  
22971  Comedy|Crime|Drama|Mystery|Thriller  
26257                                Drama  
57524                   Adventure|Children  
39134                               Comedy  

Nulos por columna:
movieId    0
title      0
genres     0
dtype: int64


### Cruce de movieId entre Train/Test y MovieLens

In [29]:
# Every movieId referenced by either split, and the MovieLens rows that match them.
train_ids = train["movieId"].tolist()
test_ids = test["movieId"].tolist()
needed_ids = set(train_ids).union(test_ids)

is_needed = ml_movies["movieId"].isin(needed_ids)
movies_sub = ml_movies.loc[is_needed].copy()

print(f"\nTotal movieId requeridos: {len(needed_ids)}")
print("Encontrados en movies.csv:", len(movies_sub))


Total movieId requeridos: 9742
Encontrados en movies.csv: 9454


### Explorar géneros

In [30]:
# A missing genre string is replaced by the explicit "(no genres listed)" marker.
movies_sub["genres"] = movies_sub["genres"].fillna("(no genres listed)")

# Flatten every pipe-separated genre string into one list of genre labels.
all_genres = [
    genre
    for genre_string in movies_sub["genres"]
    for genre in split_genres(genre_string)
]

genre_counts = Counter(all_genres)
print("\nTop 15 g√©neros (conteo en train+test):")
for genre_name, n_movies in genre_counts.most_common(15):
    print(f"  {genre_name:15s} -> {n_movies}")

# Rows whose whole genre string is the "no genres" marker.
without_genre = movies_sub.loc[movies_sub["genres"] == "(no genres listed)"]
no_genres = len(without_genre)
print("\nPel√≠culas sin g√©nero:", no_genres)


Top 15 g√©neros (conteo en train+test):
  Drama           -> 4276
  Comedy          -> 3655
  Thriller        -> 1851
  Action          -> 1736
  Romance         -> 1569
  Adventure       -> 1196
  Crime           -> 1173
  Horror          -> 941
  Sci-Fi          -> 917
  Fantasy         -> 738
  Children        -> 626
  Mystery         -> 558
  Animation       -> 527
  Documentary     -> 421
  War             -> 378

Pel√≠culas sin g√©nero: 33


### Explorar años

In [31]:
# Parse the release year out of each title (NaN where no "(YYYY)" suffix is found).
movies_sub["year"] = movies_sub["title"].map(extract_year)
year_column = movies_sub["year"]

print("\nA√±o (resumen):")
print(year_column.describe())

# Number of titles for which no year could be parsed.
missing_year = year_column.isna().sum()
print("A√±os faltantes:", missing_year)


A√±o (resumen):
count    9442.000000
mean     1994.391231
std        18.683605
min      1902.000000
25%      1987.000000
50%      1999.000000
75%      2008.000000
max      2018.000000
Name: year, dtype: float64
A√±os faltantes: 12


### Datos duplicados

In [32]:
# Rows whose movieId occurs more than once (every occurrence is kept).
is_repeated = movies_sub["movieId"].duplicated(keep=False)
dups = movies_sub.loc[is_repeated]
print("\nDuplicados por movieId:", len(dups))


Duplicados por movieId: 0


## LIMPIEZA Y CONSTRUCCIÓN DE FEATURES

In [33]:
def build_features_from_movies(movies_df: pd.DataFrame):
    """Add one-hot genre indicators and a numeric year column to MovieLens rows.

    Parameters
    ----------
    movies_df : pd.DataFrame
        Must contain a ``genres`` column (pipe-separated genre names) and a
        ``title`` column. The input frame is not modified.

    Returns
    -------
    tuple
        ``(df, feat_cols)``: a copy of the input with one 0/1 integer column
        per genre plus a ``year`` column, and the list of feature column
        names (genre names in sorted order, followed by ``"year"``).
    """
    df = movies_df.copy()
    df["genres"] = df["genres"].fillna("(no genres listed)")

    # Split every genre string once; reused for the vocabulary and the indicators.
    tokens_per_row = df["genres"].map(lambda genre_string: genre_string.split("|"))
    genre_list = sorted({genre for tokens in tokens_per_row for genre in tokens})

    for g in genre_list:
        df[g] = tokens_per_row.map(lambda tokens, g=g: int(g in tokens)).astype(int)

    # Year parsed from the title; titles without a year get the median year.
    parsed_years = df["title"].apply(extract_year)
    df["year"] = parsed_years.fillna(parsed_years.median())

    return df, genre_list + ["year"]

def main():
    """Build, scale and save the genre/year feature matrices for train and test.

    Reads the module-level ``train``, ``test`` and ``ml_movies`` frames,
    builds one feature row per ``movieId`` of each split (in the split's row
    order), standardises the columns and writes
    ``outputs/X_train_scaled.csv`` and ``outputs/X_test_scaled.csv``. The
    written files contain only the feature columns (no ``movieId``), so they
    line up with ``train`` / ``test`` by position.
    """
    needed_ids = set(train["movieId"].tolist() + test["movieId"].tolist())
    movies_sub = ml_movies[ml_movies["movieId"].isin(needed_ids)].copy()
    movies_feats, feat_cols = build_features_from_movies(movies_sub)

    # Feature table keyed by movieId. reindex needs unique labels, so if an id
    # were repeated only its first row is kept.
    id_to_row = (
        movies_feats.drop_duplicates(subset="movieId")
        .set_index("movieId")[feat_cols]
    )

    # Values for ids that movies.csv does not contain: every genre flag 0 and
    # the median year. Computed once here instead of once per missing id, and
    # keyed by column name rather than relying on "year" being the last column.
    median_year = movies_feats["year"].median()
    fallback = {col: 0 for col in feat_cols if col != "year"}
    if not pd.isna(median_year):
        fallback["year"] = median_year

    def get_safe_features(df_ids):
        """Return one float feature row per id, in input order."""
        # A single vectorised lookup replaces the per-id .loc call; ids that
        # are absent come back as all-NaN rows and receive the fallback values.
        feats = id_to_row.reindex(df_ids).fillna(fallback)
        return feats.reset_index(drop=True).astype(float)

    X_train_raw = get_safe_features(train["movieId"].values)
    X_test_raw  = get_safe_features(test["movieId"].values)

    # Scaling.
    # NOTE(review): the scaler is fitted on train and test together, so test
    # statistics influence the training features — confirm this is intended.
    scaler = StandardScaler()
    X_all = pd.concat([X_train_raw, X_test_raw], axis=0).reset_index(drop=True)
    X_all_scaled = pd.DataFrame(scaler.fit_transform(X_all), columns=feat_cols)

    # Split the scaled matrix back into its train and test parts by position.
    X_train_scaled = X_all_scaled.iloc[:len(X_train_raw)].reset_index(drop=True)
    X_test_scaled  = X_all_scaled.iloc[len(X_train_raw):].reset_index(drop=True)

    os.makedirs("outputs", exist_ok=True)
    X_train_scaled.to_csv("outputs/X_train_scaled.csv", index=False)
    X_test_scaled.to_csv("outputs/X_test_scaled.csv", index=False)

    print("\nFeatures generados correctamente.")
    print(f"Train shape: {X_train_scaled.shape}")
    print(f"Test shape: {X_test_scaled.shape}")
    print(f"Columnas: {feat_cols}")

In [34]:
# Run the feature-building pipeline when this code executes as the main module.
if __name__ == "__main__":
    main()


Features generados correctamente.
Train shape: (6819, 21)
Test shape: (2923, 21)
Columnas: ['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'year']


## Obtención de los posters

In [2]:
# Input tables. This cell relies on `pd` and `needed_ids` created by earlier
# cells, so it fails with a NameError when run on a fresh kernel by itself.
movielens_links = pd.read_csv("links.csv")
moviegenre = pd.read_csv("MovieGenre.csv", encoding="latin1")
print("MovieLens links.csv ‚Üí", movielens_links.shape)
print("MovieGenre.csv ‚Üí", moviegenre.shape)

# Inner join on imdbId, restricted to the movies used in train/test.
merged = pd.merge(movielens_links, moviegenre, on="imdbId", how="inner")
# Keep a single row per movieId so that the coverage below counts movies
# rather than joined rows, and so that later steps handle each
# "<movieId>.jpg" once. Presumably MovieGenre.csv can repeat an imdbId —
# verify against the data; the de-duplication is a no-op otherwise.
merged_needed = (
    merged[merged["movieId"].isin(needed_ids)]
    .drop_duplicates(subset="movieId")
)

# Guard against an empty id set instead of dividing by zero.
coverage_pct = len(merged_needed) / len(needed_ids) * 100 if needed_ids else 0.0

print(f"üé¨ Pel√≠culas requeridas (train+test): {len(needed_ids)}")
print(f"üì∏ Pel√≠culas con p√≥ster disponible: {len(merged_needed)}")
print(f"‚úÖ Cobertura de p√≥sters: {coverage_pct:.2f}%\n")


NameError: name 'pd' is not defined

## Descarga de posters

In [None]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Directory where downloaded posters are stored (created if it does not exist).
POSTER_DIR = "posters"
os.makedirs(POSTER_DIR, exist_ok=True)

def download_poster(row):
    """Download one poster into ``POSTER_DIR`` as ``<movieId>.jpg``.

    Parameters
    ----------
    row : mapping
        Must provide ``row["movieId"]`` and ``row["Poster"]`` (the image URL).

    Returns
    -------
    str
        ``"skip"`` if the target file already exists, ``"invalid"`` if the URL
        is missing, not a string or does not start with ``"http"``, ``"ok"``
        after a successful download, ``"fail(<status>)"`` for a non-200
        response, and ``"error"`` if the request or the write raised.
    """
    # Function-scope import so the block does not depend on another cell.
    import threading

    movie_id = row["movieId"]
    url = row["Poster"]
    path = os.path.join(POSTER_DIR, f"{movie_id}.jpg")

    if os.path.exists(path):
        return "skip"

    if pd.isna(url) or not isinstance(url, str) or not url.startswith("http"):
        return "invalid"

    # The image is written to a per-thread temporary name and renamed into
    # place only once it is complete. Writing straight to `path` would leave a
    # truncated file behind on failure, and the existence check above would
    # then report it as "skip" on every later run.
    tmp_path = f"{path}.{threading.get_ident()}.part"
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return f"fail({response.status_code})"
        with open(tmp_path, "wb") as f:
            f.write(response.content)
        os.replace(tmp_path, path)
        return "ok"
    except Exception:
        # Deliberately best-effort: one bad poster must not abort the batch.
        return "error"
    finally:
        # Remove a leftover partial file; after a successful rename it is gone.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

subset = merged_needed    # Only the movies in train + test

max_threads = 20
status_counter = {"ok": 0, "skip": 0, "fail": 0, "error": 0, "invalid": 0}

print(f"Iniciando descarga de {len(subset)} p√≥sters con {max_threads} hilos...\n")

with ThreadPoolExecutor(max_workers=max_threads) as executor:
    # One task per row; results are tallied in completion order.
    futures = [executor.submit(download_poster, row) for _, row in subset.iterrows()]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Descargando"):
        result = future.result()
        # "fail(<status>)" results share one bucket; anything unrecognised counts as "ok".
        if result.startswith("fail"):
            bucket = "fail"
        elif result in ("error", "invalid", "skip"):
            bucket = result
        else:
            bucket = "ok"
        status_counter[bucket] += 1

print("\nDescarga completa.")
print(f"Nuevas descargas: {status_counter['ok']}")
print(f"Omitidas (ya exist√≠an): {status_counter['skip']}")
print(f"Fallidas: {status_counter['fail']}")
print(f"Errores: {status_counter['error']}")
print(f"URLs inv√°lidas: {status_counter['invalid']}")


## Extracción de Características Visuales

### Histogramas de color (RGB + HSV)

In [None]:
import cv2

# Directory that receives the generated feature CSV files (created if it does not exist).
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_color_histograms(image_path, bins=32):
    """Return jointly normalised RGB and HSV channel histograms of an image.

    Parameters
    ----------
    image_path : str
        Path of the image file.
    bins : int, optional
        Number of bins per channel histogram (default 32).

    Returns
    -------
    list of float
        ``6 * bins`` values ordered R, G, B, H, S, V. The concatenated vector
        is divided by its total, so it sums to 1. If the image cannot be
        read, a list of ``6 * bins`` NaNs is returned.
    """
    img = cv2.imread(image_path)
    if img is None:
        return [np.nan] * (6 * bins)
    img = cv2.resize(img, (128, 128))

    # --- RGB ---
    # cv2.imread stores channels in B, G, R order, so red is channel 2 and
    # blue is channel 0.
    hist_r = cv2.calcHist([img], [2], None, [bins], [0, 256])
    hist_g = cv2.calcHist([img], [1], None, [bins], [0, 256])
    hist_b = cv2.calcHist([img], [0], None, [bins], [0, 256])

    # --- HSV ---
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # Hue is binned over [0, 180); saturation and value over [0, 256).
    hist_h = cv2.calcHist([hsv], [0], None, [bins], [0, 180])
    hist_s = cv2.calcHist([hsv], [1], None, [bins], [0, 256])
    hist_v = cv2.calcHist([hsv], [2], None, [bins], [0, 256])

    # Concatenate and normalise by the total count.
    features = np.concatenate([hist_r, hist_g, hist_b, hist_h, hist_s, hist_v]).flatten()
    features = features / np.sum(features)
    return features.tolist()

# Ids of the movies to process, in the row order of `subset`.
movie_ids = subset["movieId"].values
color_features = []

for movie_id in tqdm(movie_ids, desc="Extrayendo RGB+HSV", total=len(movie_ids)):
    path = os.path.join(POSTER_DIR, f"{movie_id}.jpg")
    feats = extract_color_histograms(path)
    color_features.append([movie_id] + feats)

# Width of the feature vector. With no rows there is nothing to measure, so
# fall back to 6 histograms x 32 bins instead of raising IndexError on
# color_features[0].
n_color_features = len(color_features[0]) - 1 if color_features else 6 * 32
cols = ["movieId"] + [f"color_{i}" for i in range(n_color_features)]
df_color = pd.DataFrame(color_features, columns=cols)
df_color.to_csv(os.path.join(OUTPUT_DIR, "poster_color_features.csv"), index=False)

print("Histograms RGB+HSV guardados en poster_color_features.csv")

### HOG

In [None]:
from skimage import color, io, transform
from skimage.feature import hog

def extract_hog_features(image_path):
    """Return the HOG descriptor of an image resized to 128x128 grayscale.

    Parameters
    ----------
    image_path : str
        Path of the image file.

    Returns
    -------
    numpy.ndarray or None
        Flat HOG feature vector (9 orientations, 16x16-pixel cells, 2x2-cell
        blocks), or ``None`` when the image cannot be read or converted to
        grayscale.
    """
    try:
        img = io.imread(image_path)
        # A single-channel image is already grayscale. Passing it to rgb2gray
        # raises in scikit-image versions that require a 3-channel input,
        # which previously made valid grayscale posters look corrupt.
        img_gray = img if img.ndim == 2 else color.rgb2gray(img)
    except Exception:
        # Missing file, unreadable data or an unsupported channel layout.
        return None
    img_gray = transform.resize(img_gray, (128, 128))

    hog_feats = hog(
        img_gray,
        pixels_per_cell=(16, 16),
        cells_per_block=(2, 2),
        orientations=9,
        feature_vector=True
    )
    return hog_feats

hog_features = []
dim_example = None
hog_results = []  # (movie_id, feature vector or None), in input order

for movie_id in tqdm(movie_ids, desc="Extrayendo HOG", total=len(movie_ids)):
    path = os.path.join(POSTER_DIR, f"{movie_id}.jpg")
    feats = extract_hog_features(path)
    if feats is not None and dim_example is None:
        # Length of the first successfully extracted vector.
        dim_example = len(feats)
    hog_results.append((movie_id, feats))

# Build the rows only after the loop, once the vector length is known, so a
# failure that happens before the first success is also padded to full width
# with NaN instead of producing a one-element row.
for movie_id, feats in hog_results:
    values = feats.tolist() if feats is not None else [np.nan] * (dim_example or 0)
    hog_features.append([movie_id] + values)

if dim_example is not None:
    cols = ["movieId"] + [f"hog_{i}" for i in range(dim_example)]
    df_hog = pd.DataFrame(hog_features, columns=cols)
    df_hog.to_csv(os.path.join(OUTPUT_DIR, "poster_hog_features.csv"), index=False)
    print(f"HOG guardados en {OUTPUT_DIR}/poster_hog_features.csv")
    print(f"Dimensi√≥n del vector HOG: {dim_example}")
else:
    print("No se pudo extraer ning√∫n HOG v√°lido.")


### Zernike

In [None]:
import mahotas

def extract_zernike(image_path, radius=64, degree=8):
    """Return Zernike moments of the Otsu-binarised 128x128 grayscale image.

    Parameters
    ----------
    image_path : str
        Path of the image file.
    radius : int, optional
        Radius passed to ``mahotas.features.zernike_moments`` (default 64).
    degree : int, optional
        Degree passed to ``mahotas.features.zernike_moments`` (default 8).

    Returns
    -------
    The value returned by ``mahotas.features.zernike_moments``, or ``None``
    when the image cannot be read or converted to grayscale.
    """
    try:
        bgr = cv2.imread(image_path)
        gray = None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    except Exception:
        gray = None
    if gray is None:
        return None

    resized = cv2.resize(gray, (128, 128))
    # With THRESH_OTSU the threshold is chosen automatically; the 0 is ignored.
    _, binary = cv2.threshold(resized, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return mahotas.features.zernike_moments(binary, radius=radius, degree=degree)

zernike_features = []
dim_example = None
zernike_results = []  # (movie_id, feature vector or None), in input order

for movie_id in tqdm(movie_ids, desc="Extrayendo Zernike", total=len(movie_ids)):
    path = os.path.join(POSTER_DIR, f"{movie_id}.jpg")
    feats = extract_zernike(path)
    if feats is not None and dim_example is None:
        # Length of the first successfully extracted vector.
        dim_example = len(feats)
    zernike_results.append((movie_id, feats))

# Build the rows only after the loop, once the vector length is known, so a
# failure that happens before the first success is also padded to full width
# with NaN instead of producing a one-element row.
for movie_id, feats in zernike_results:
    values = feats.tolist() if feats is not None else [np.nan] * (dim_example or 0)
    zernike_features.append([movie_id] + values)

# === SAVE RESULTS ===
if dim_example is not None:
    cols = ["movieId"] + [f"zernike_{i}" for i in range(dim_example)]
    df_zernike = pd.DataFrame(zernike_features, columns=cols)
    df_zernike.to_csv(os.path.join(OUTPUT_DIR, "poster_zernike_features.csv"), index=False)
    print(f"Momentos Zernike guardados en {OUTPUT_DIR}/poster_zernike_features.csv")
    print(f"Dimensi√≥n del vector Zernike: {dim_example}")
else:
    print("No se pudo extraer ning√∫n descriptor Zernike v√°lido.")

### Combinar las features visuales con X_train_scaled.csv y X_test_scaled.csv

In [None]:
COLOR_PATH   = os.path.join(OUTPUT_DIR, "poster_color_features.csv")
HOG_PATH     = os.path.join(OUTPUT_DIR, "poster_hog_features.csv")
ZERNIKE_PATH = os.path.join(OUTPUT_DIR, "poster_zernike_features.csv")
X_TRAIN_PATH = os.path.join(OUTPUT_DIR, "X_train_scaled.csv")
X_TEST_PATH  = os.path.join(OUTPUT_DIR, "X_test_scaled.csv")

# One row per movieId in every visual table, so that none of the merges below
# can multiply rows.
df_color   = pd.read_csv(COLOR_PATH).drop_duplicates(subset="movieId")
df_hog     = pd.read_csv(HOG_PATH).drop_duplicates(subset="movieId")
df_zernike = pd.read_csv(ZERNIKE_PATH).drop_duplicates(subset="movieId")
x_train = pd.read_csv(X_TRAIN_PATH)
x_test  = pd.read_csv(X_TEST_PATH)

df_visual = df_color.merge(df_hog, on="movieId", how="outer")
df_visual = df_visual.merge(df_zernike, on="movieId", how="outer")
visual_cols = [col for col in df_visual.columns if col != "movieId"]

# Join on the id column only. Merging the whole train/test frames would carry
# their remaining columns (title, genres, the raw genre indicators) into the
# feature files, duplicating the genre column names already in x_train and
# giving train and test different column sets.
train_full = train[["movieId"]].merge(df_visual, on="movieId", how="left")
test_full  = test[["movieId"]].merge(df_visual, on="movieId", how="left")

# x_train / x_test have no movieId column and are combined by row position,
# so both sides must have exactly the same number of rows.
if len(train_full) != len(x_train) or len(test_full) != len(x_test):
    raise ValueError(
        "Row count mismatch between the scaled features and the visual features: "
        f"train {len(x_train)} vs {len(train_full)}, test {len(x_test)} vs {len(test_full)}"
    )

# The outputs overwrite the inputs, so on a re-run the files already contain
# the visual columns; drop them first to keep the cell idempotent.
x_train = x_train.drop(columns=visual_cols, errors="ignore")
x_test  = x_test.drop(columns=visual_cols, errors="ignore")

x_train_full = pd.concat([x_train, train_full.drop(columns=["movieId"])], axis=1)
x_test_full  = pd.concat([x_test,  test_full.drop(columns=["movieId"])], axis=1)

x_train_full.to_csv(X_TRAIN_PATH, index=False)
x_test_full.to_csv(X_TEST_PATH, index=False)