In [23]:
import os, re
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# EXPLORACIÓN DE DATOS

In [24]:
def extract_year(title: str):
    """Return the year from a trailing "(YYYY)" in ``title``.

    Leading and trailing whitespace is ignored. Returns ``np.nan`` when
    ``title`` is not a string or does not end with a parenthesised
    four-digit number.
    """
    if isinstance(title, str):
        match = re.search(r"\((\d{4})\)$", title.strip())
        if match:
            return int(match.group(1))
    return np.nan

def split_genres(s: str):
    """Split a pipe-delimited genre string into a list of genre names.

    Missing (NaN/None) or empty input yields ``["(no genres listed)"]``.
    """
    has_genres = not pd.isna(s) and s != ""
    return s.split("|") if has_genres else ["(no genres listed)"]

def describe_df(name, df: pd.DataFrame, head_n=5):
    """Print a quick textual overview of ``df``.

    The report contains, in order: a banner with ``name``, the frame's shape,
    its column names, the first ``head_n`` rows and the null count per column.
    Nothing is returned.
    """
    banner = f"\n===== {name} ====="
    print(banner)
    print(f"shape: {df.shape}")
    print(f"columnas: {list(df.columns)}")
    print(f"\n{head_n} primeras filas:", df.head(head_n), sep="\n")
    print("\nNulos por columna:", df.isna().sum(), sep="\n")



## Cargamos la data

In [25]:
# Input files: the two splits use relative paths, the MovieLens catalogue an
# absolute one.
TRAIN = "movies_train.csv"
TEST  = "movies_test.csv"
ML_MOVIES = "/content/movies.csv"

# Fail fast, naming the first missing file, before any CSV is parsed.
for required_path in (TRAIN, TEST, ML_MOVIES):
    if not os.path.exists(required_path):
        raise FileNotFoundError(required_path)

train, test, ml_movies = (pd.read_csv(path) for path in (TRAIN, TEST, ML_MOVIES))

## Revisamos los datos

### Train

In [26]:
# Overview of the training split: shape, columns, first rows and null counts.
describe_df(name="TRAIN", df=train)



===== TRAIN =====
shape: (6819, 23)
columnas: ['movieId', 'title', 'genres', '(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

5 primeras filas:
   movieId                             title          genres  \
0      619                         Ed (1996)          Comedy   
1    33826                Saint Ralph (2004)    Comedy|Drama   
2     1298       Pink Floyd: The Wall (1982)   Drama|Musical   
3   140289              Men & Chicken (2015)    Comedy|Drama   
4     3064  Poison Ivy: New Seduction (1997)  Drama|Thriller   

   (no genres listed)  Action  Adventure  Animation  Children  Comedy  Crime  \
0                   0       0          0          0         0       1      0   
1                   0       0          0          0         0       1      0   
2                   0       0          0          

### Test

In [27]:
# Overview of the test split: shape, columns, first rows and null counts.
describe_df(name="TEST", df=test)



===== TEST =====
shape: (2923, 2)
columnas: ['movieId', 'title']

5 primeras filas:
   movieId                                   title
0    45635       Notorious Bettie Page, The (2005)
1     1373  Star Trek V: The Final Frontier (1989)
2     7325                  Starsky & Hutch (2004)
3      389              Colonel Chabert, Le (1994)
4     8920                Country Girl, The (1954)

Nulos por columna:
movieId    0
title      0
dtype: int64


### Movies

In [28]:
# NOTE(review): only a seeded random sample of at most 5 rows is passed in, so
# the shape and null counts printed by describe_df describe that sample, not
# the full ml_movies frame.
describe_df(
    name="MOVIELENS movies.csv",
    df=ml_movies.sample(n=min(5, len(ml_movies)), random_state=42),
)


===== MOVIELENS movies.csv =====
shape: (5, 3)
columnas: ['movieId', 'title', 'genres']

5 primeras filas:
       movieId                             title  \
4884      4990  Jimmy Neutron: Boy Genius (2001)   
22971   116698              Dead Men Tell (1941)   
26257   125517                   The D.I. (1957)   
57524   196541         Makar - Pathfinder (1984)   
39134   156511              Feudin' Fools (1952)   

                                    genres  
4884   Adventure|Animation|Children|Comedy  
22971  Comedy|Crime|Drama|Mystery|Thriller  
26257                                Drama  
57524                   Adventure|Children  
39134                               Comedy  

Nulos por columna:
movieId    0
title      0
genres     0
dtype: int64


### Cruce de movieId entre Train/Test y MovieLens

In [29]:
# Every movieId referenced by either split, and the MovieLens rows that match them.
needed_ids = set(train["movieId"].tolist()) | set(test["movieId"].tolist())
movies_sub = ml_movies.loc[ml_movies["movieId"].isin(needed_ids)].copy()
print(f"\nTotal movieId requeridos: {len(needed_ids)}")
print(f"Encontrados en movies.csv: {len(movies_sub)}")


Total movieId requeridos: 9742
Encontrados en movies.csv: 9454


### Explorar géneros

In [30]:
# Missing genre strings are rewritten in movies_sub so they count as
# "(no genres listed)" in the tallies below.
movies_sub["genres"] = movies_sub["genres"].fillna("(no genres listed)")
all_genres = [genre for raw in movies_sub["genres"] for genre in split_genres(raw)]

genre_counts = Counter(all_genres)
print("\nTop 15 géneros (conteo en train+test):")
for genre, count in genre_counts.most_common(15):
    print(f"  {genre:15s} -> {count}")

# Rows whose whole genre string is the placeholder, i.e. movies with no genre at all.
no_genres = int(movies_sub["genres"].eq("(no genres listed)").sum())
print("\nPelículas sin género:", no_genres)


Top 15 géneros (conteo en train+test):
  Drama           -> 4276
  Comedy          -> 3655
  Thriller        -> 1851
  Action          -> 1736
  Romance         -> 1569
  Adventure       -> 1196
  Crime           -> 1173
  Horror          -> 941
  Sci-Fi          -> 917
  Fantasy         -> 738
  Children        -> 626
  Mystery         -> 558
  Animation       -> 527
  Documentary     -> 421
  War             -> 378

Películas sin género: 33


### Explorar años

In [31]:
# Parse the release year out of each title; titles without a trailing "(YYYY)" become NaN.
movies_sub["year"] = movies_sub["title"].map(extract_year)
print("\nAño (resumen):", movies_sub["year"].describe(), sep="\n")

missing_year = movies_sub["year"].isna().sum()
print(f"Años faltantes: {missing_year}")


Año (resumen):
count    9442.000000
mean     1994.391231
std        18.683605
min      1902.000000
25%      1987.000000
50%      1999.000000
75%      2008.000000
max      2018.000000
Name: year, dtype: float64
Años faltantes: 12


### Datos duplicados

In [32]:
# keep=False flags every row that shares its movieId with another row, not only the repeats.
dups = movies_sub.loc[movies_sub.duplicated(subset="movieId", keep=False)]
print(f"\nDuplicados por movieId: {len(dups)}")


Duplicados por movieId: 0


## LIMPIEZA Y CONSTRUCCIÓN DE FEATURES

In [33]:
def build_features_from_movies(movies_df: pd.DataFrame):
    """Add one-hot genre columns and a numeric ``year`` column to a movies frame.

    Parameters
    ----------
    movies_df : pd.DataFrame
        Must contain a pipe-delimited ``genres`` column and a ``title`` column.
        The input is not modified; the work is done on a copy.

    Returns
    -------
    tuple of (pd.DataFrame, list of str)
        The augmented copy, and the feature column names: the genre names in
        sorted order followed by ``"year"``.
    """
    no_genre = "(no genres listed)"
    df = movies_df.copy()
    # Missing and blank genre strings both mean "no genres", the same rule
    # split_genres applies. Previously a blank string produced a spurious ""
    # feature column.
    df["genres"] = df["genres"].fillna(no_genre).replace("", no_genre)

    # One-hot encode in a single call instead of re-splitting every row once
    # per genre. Empty tokens are never turned into a feature column.
    dummies = df["genres"].str.get_dummies(sep="|").astype(int)
    genre_list = sorted(col for col in dummies.columns if col != "")
    df = df.assign(**{genre: dummies[genre] for genre in genre_list})

    # Titles without a parsable year get the median of the years that parsed.
    df["year"] = df["title"].apply(extract_year)
    df["year"] = df["year"].fillna(df["year"].median())
    feat_cols = genre_list + ["year"]
    return df, feat_cols

def main():
    """Build, scale and save the feature matrices for the train and test splits.

    Reads the module-level ``train``, ``test`` and ``ml_movies`` frames, builds
    genre/year features for every referenced movieId, standardises them with a
    ``StandardScaler`` fitted on train and test together, writes
    ``outputs/X_train_scaled.csv`` and ``outputs/X_test_scaled.csv``, and
    prints a short summary.
    """
    needed_ids = set(train["movieId"].tolist() + test["movieId"].tolist())
    movies_sub = ml_movies[ml_movies["movieId"].isin(needed_ids)].copy()
    movies_feats, feat_cols = build_features_from_movies(movies_sub)

    # Feature table keyed by movieId. The reindex below needs unique movieIds.
    id_to_row = movies_feats.set_index("movieId")[feat_cols]

    # Values for ids absent from the catalogue: zero in every column but the
    # last, and the median year in the last one. Computed once here instead of
    # once per missing id.
    fallback = {col: 0 for col in feat_cols[:-1]}
    fallback[feat_cols[-1]] = movies_feats["year"].median()

    def get_safe_features(df_ids):
        """Return one feature row per id in ``df_ids``, in the same order.

        Ids missing from the catalogue get the ``fallback`` values.
        """
        # A single vectorised lookup replaces the per-id .loc loop; ids that
        # are not in the index come back as all-NaN rows and are then filled.
        feats = id_to_row.reindex(index=df_ids).fillna(fallback)
        return feats.reset_index(drop=True)

    X_train_raw = get_safe_features(train["movieId"].values)
    X_test_raw  = get_safe_features(test["movieId"].values)

    # Scale train and test together, then split back by row position.
    scaler = StandardScaler()
    X_all = pd.concat([X_train_raw, X_test_raw], axis=0).reset_index(drop=True)
    X_all_scaled = pd.DataFrame(scaler.fit_transform(X_all), columns=feat_cols)

    n_train = len(X_train_raw)
    X_train_scaled = X_all_scaled.iloc[:n_train].reset_index(drop=True)
    X_test_scaled  = X_all_scaled.iloc[n_train:].reset_index(drop=True)

    os.makedirs("outputs", exist_ok=True)
    X_train_scaled.to_csv("outputs/X_train_scaled.csv", index=False)
    X_test_scaled.to_csv("outputs/X_test_scaled.csv", index=False)

    print("\nFeatures generados correctamente.")
    print(f"Train shape: {X_train_scaled.shape}")
    print(f"Test shape: {X_test_scaled.shape}")
    print(f"Columnas: {feat_cols}")

In [34]:
# Run the feature pipeline only when __name__ is "__main__".
if __name__ == "__main__":
    main()


Features generados correctamente.
Train shape: (6819, 21)
Test shape: (2923, 21)
Columnas: ['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'year']
