In [None]:
# --- Librerías básicas ---
import pandas as pd
import numpy as np

# --- Librerías de Deep Learning ---
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# --- Librerías de Preprocesamiento ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# --- Librerías de Visualización ---
import matplotlib.pyplot as plt
import seaborn as sns

# Confirm that every import above succeeded (this line only runs if none raised).
print("‚úÖ Librer√≠as importadas correctamente.")



‚úÖ Librer√≠as importadas correctamente.


In [None]:
# --- Display options: remove the column-count and width limits ---
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

# --- Load the dataset from Google Drive ---
# Direct-download link to the CSV file.
DATASET_URL = "https://drive.google.com/uc?export=download&id=18glOMWxHUOPDQa6t2mwRG6vJN1GbpiIA"
df = pd.read_csv(DATASET_URL)

# --- Check the first rows of the DataFrame ---
print("üîé Primeras filas del DataFrame original:")
print(df.head())



üîé Primeras filas del DataFrame original:
                                      url              title   type  \
0  https://www.netflix.com/title/60000724       Forrest Gump  movie   
1   https://www.netflix.com/title/1154386  The Fifth Element  movie   
2  https://www.netflix.com/title/60031236  Kill Bill: Vol. 1  movie   
3  https://www.netflix.com/title/70021659            Jarhead  movie   
4   https://www.netflix.com/title/1080395         Unforgiven  movie   

                      genres  releaseYear     imdbId  imdbAverageRating  \
0             Drama, Romance       1994.0  tt0109830                8.8   
1  Action, Adventure, Sci-Fi       1997.0  tt0119116                7.6   
2    Action, Crime, Thriller       2003.0  tt0266697                8.2   
3      Biography, Drama, War       2005.0  tt0418763                7.0   
4             Drama, Western       1992.0  tt0105695                8.2   

   imdbNumVotes                                 availableCountries  
0     231

In [None]:
# --- Drop rows that contain any null value ---
# NOTE(review): nulls are checked across every column, including the three
# columns discarded a few lines below, so rows that are only missing one of
# those fields are removed as well - confirm that this is intended.
df = df.dropna().reset_index(drop=True)

# --- Cast year and vote count to pandas' nullable integer dtype ---
df['releaseYear'] = df['releaseYear'].astype('Int64')
df['imdbNumVotes'] = df['imdbNumVotes'].astype('Int64')

# --- Drop unnecessary columns ---
df = df.drop(['url', 'imdbId', 'availableCountries'], axis=1)

# --- Show a summary after preprocessing ---
print("‚úÖ Preprocesamiento b√°sico realizado.")
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()



‚úÖ Preprocesamiento b√°sico realizado.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6697 entries, 0 to 6696
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              6697 non-null   object 
 1   type               6697 non-null   object 
 2   genres             6697 non-null   object 
 3   releaseYear        6697 non-null   Int64  
 4   imdbAverageRating  6697 non-null   float64
 5   imdbNumVotes       6697 non-null   Int64  
dtypes: Int64(2), float64(1), object(3)
memory usage: 327.1+ KB
None


In [None]:
# --- Binary flag: 0 when `type` is 'movie', 1 for anything else ---
df['type_new'] = (df['type'] != 'movie').astype(int)

# --- Genre encoder: Drama / Comedy / Other ---
def encode_new(genres):
    """Encode a genre string as a three-slot indicator [drama, comedy, other].

    Membership is tested with the ``in`` operator on ``genres``.

    Returns:
        [1, 0, 0] when 'Drama' is present and 'Comedy' is not,
        [0, 1, 0] when 'Comedy' is present and 'Drama' is not,
        [0, 0, 1] when neither is present,
        [None, None, None] when ``genres`` is missing (``pd.isna``) or when
        both 'Drama' and 'Comedy' are present.
    """
    unencodable = [None, None, None]
    if pd.isna(genres):
        return unencodable

    has_drama = 'Drama' in genres
    has_comedy = 'Comedy' in genres

    # Titles tagged with both genres cannot be assigned to a single slot.
    if has_drama and has_comedy:
        return unencodable
    if has_drama:
        return [1, 0, 0]
    if has_comedy:
        return [0, 1, 0]
    return [0, 0, 1]

# --- Encode each genre string into its three indicator values ---
genre_flags = df['genres'].apply(encode_new).tolist()
df[['drama', 'comedy', 'other']] = pd.DataFrame(genre_flags, index=df.index)

# --- Discard rows that could not be encoded, then the original text columns ---
df = df.dropna(subset=['drama', 'comedy', 'other']).drop(columns=['genres', 'type'])

# --- Preview the new columns ---
print("‚úÖ Codificaci√≥n de g√©neros completada.")
print(df[['drama', 'comedy', 'other']].head())



‚úÖ Codificaci√≥n de g√©neros completada.
   drama  comedy  other
0    1.0     0.0    0.0
1    0.0     0.0    1.0
2    0.0     0.0    1.0
3    1.0     0.0    0.0
4    1.0     0.0    0.0


In [None]:
# --- 3-sigma outlier filter ---
def remove_outliers_3sigma(df, columns):
    """Keep only rows whose values lie within mean +/- 3 standard deviations.

    The columns are processed one after another: the mean and standard
    deviation of each column are computed on the rows that survived the
    filters applied for the preceding columns. Both bounds are inclusive.

    Args:
        df: DataFrame to filter; it is not modified.
        columns: iterable of column names to filter on.

    Returns:
        The filtered DataFrame (``df`` itself when ``columns`` is empty).
    """
    kept = df
    for name in columns:
        values = kept[name]
        center = values.mean()
        spread = values.std()
        low, high = center - 3 * spread, center + 3 * spread
        inside = (values >= low) & (values <= high)
        kept = kept[inside]
    return kept

# --- Remove outliers from the two numeric columns ---
outlier_columns = ['releaseYear', 'imdbNumVotes']
df = remove_outliers_3sigma(df, outlier_columns)

# --- Report the DataFrame shape after the filter ---
print(f"‚úÖ Outliers eliminados. Tama√±o del DataFrame: {df.shape}")



‚úÖ Outliers eliminados. Tama√±o del DataFrame: (5548, 8)


In [None]:
# --- IMDb rating -> quality label ---
def categorize_imdb_rating(rating):
    """Map a numeric rating to one of the quality labels.

    Returns:
        "Mal" for rating < 6, "Bien" for 6 <= rating < 7,
        "Muy bien" for 7 <= rating < 8, "Excelente" for 8 <= rating <= 10,
        and "No categorizado" for anything else (values above 10, or NaN,
        for which every comparison is false).
    """
    # Exclusive upper limits, checked in ascending order.
    upper_limits = ((6, "Mal"), (7, "Bien"), (8, "Muy bien"))
    for limit, label in upper_limits:
        if rating < limit:
            return label
    return "Excelente" if rating <= 10 else "No categorizado"

# --- Derive the target label from the average rating ---
imdb_ratings = df['imdbAverageRating']
df['rating_category'] = imdb_ratings.apply(categorize_imdb_rating)

# --- Class distribution ---
class_counts = df['rating_category'].value_counts()
print("‚úÖ Categor√≠as de rating creadas:")
print(class_counts)



‚úÖ Categor√≠as de rating creadas:
rating_category
Bien         1984
Muy bien     1787
Mal          1314
Excelente     463
Name: count, dtype: int64


In [None]:
# --- Standardize releaseYear and imdbNumVotes ---
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed in a later cell, so test rows contribute to the fitted
# mean/std - confirm this is acceptable.
columns_to_standardize = ['releaseYear', 'imdbNumVotes']
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df[columns_to_standardize])
df[columns_to_standardize] = standardized_values

# --- Show the standardized columns ---
print("‚úÖ Variables num√©ricas estandarizadas:")
print(df[columns_to_standardize])



‚úÖ Variables num√©ricas estandarizadas:
      releaseYear  imdbNumVotes
1       -1.658254      4.252866
3       -0.809996      1.418538
4       -2.188416      3.572972
6       -1.340158      1.837221
7       -0.809996      1.858180
...           ...           ...
6691     1.204617     -0.540043
6692     1.204617     -0.537759
6693     1.204617     -0.537090
6694     1.204617     -0.520272
6696     1.204617     -0.536654

[5548 rows x 2 columns]


In [None]:
# --- Randomly keep at most 5,000 rows ---
# min() keeps the cell from raising when fewer than 5,000 rows are available.
df = df.sample(min(5000, len(df)), random_state=40).reset_index(drop=True)

# --- Define X (features) and y (target) ---
X = df[['type_new', 'releaseYear', 'imdbNumVotes', 'drama', 'comedy', 'other']]
y = df['rating_category']

# --- Encode the target labels as integers ---
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# --- Split into training and test sets ---
# stratify keeps the class proportions the same in both parts; the rating
# categories are imbalanced, so an unstratified split can under-represent the
# smallest class in one of them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=40, stratify=y_encoded
)

# --- Convert the data to tensors ---
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# --- Build the training dataset and its mini-batch loader ---
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# --- Report the tensor sizes ---
print(f"‚úÖ Datos preparados. Tama√±o X_train: {X_train_tensor.shape}, Tama√±o y_train: {y_train_tensor.shape}")



‚úÖ Datos preparados. Tama√±o X_train: torch.Size([4000, 6]), Tama√±o y_train: torch.Size([4000])


In [None]:
# --- Neural network definition ---
class NeuralNetwork(nn.Module):
    """Fully connected classifier: Linear -> ReLU -> Linear -> ReLU -> Linear.

    The forward pass returns raw, unnormalized scores (no softmax is applied).

    Args:
        in_features: number of input features per sample (default 6).
        hidden1: width of the first hidden layer (default 64).
        hidden2: width of the second hidden layer (default 32).
        num_classes: number of output scores per sample (default 4).

    Calling ``NeuralNetwork()`` with no arguments builds the 6-64-32-4 network.
    """

    def __init__(self, in_features=6, hidden1=64, hidden2=32, num_classes=4):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Return the output scores for a batch ``x`` whose last dimension is ``in_features``."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# --- Small grid search over learning rate and momentum ---
# NOTE(review): each combination is scored on the test tensors, so the best
# accuracy reported here is an optimistic estimate - consider scoring on a
# validation split carved out of the training data instead.
best_accuracy = 0
best_params = {}

for lr in [0.001, 0.01, 0.1]:
    for momentum in [0.5, 0.9]:
        # Re-seed the global torch RNG so every combination starts from the
        # same initial weights and the search is repeatable between runs.
        torch.manual_seed(40)
        model = NeuralNetwork()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

        # Short training run (only 5 epochs) to compare combinations quickly
        for epoch in range(5):
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        # Evaluation: the predicted class is the index of the largest score
        with torch.no_grad():
            outputs = model(X_test_tensor)
            _, predicted = torch.max(outputs, 1)
            accuracy = (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)

        print(f"lr={lr}, momentum={momentum}, accuracy={accuracy:.4f}")

        # Strict '>' keeps the first combination when accuracies tie
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'lr': lr, 'momentum': momentum}

print(f"\nüöÄ Mejor combinaci√≥n encontrada: {best_params}, con accuracy: {best_accuracy:.4f}")




lr=0.001, momentum=0.5, accuracy=0.3190
lr=0.001, momentum=0.9, accuracy=0.3880
lr=0.01, momentum=0.5, accuracy=0.3810
lr=0.01, momentum=0.9, accuracy=0.4470
lr=0.1, momentum=0.5, accuracy=0.4520
lr=0.1, momentum=0.9, accuracy=0.4410

üöÄ Mejor combinaci√≥n encontrada: {'lr': 0.1, 'momentum': 0.5}, con accuracy: 0.4520


In [None]:
# --- Train the final network with the selected hyperparameters ---
num_epochs = 30

# Build a fresh model and optimizer from best_params. The `model` and
# `optimizer` left over from the search loop belong to whichever combination
# ran last, which is not necessarily the one that was selected.
if not best_params:
    raise RuntimeError("best_params is empty; run the hyperparameter search cell first.")
torch.manual_seed(40)  # repeatable weight initialisation and batch order
model = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=best_params['lr'], momentum=best_params['momentum'])

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report the mean batch loss every 5 epochs
    if (epoch + 1) % 5 == 0:
        print(f"üìà Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("‚úÖ Entrenamiento finalizado.")



üìà Epoch [5/30], Loss: 1.1093
üìà Epoch [10/30], Loss: 1.0995
üìà Epoch [15/30], Loss: 1.0994
üìà Epoch [20/30], Loss: 1.0924
üìà Epoch [25/30], Loss: 1.0850
üìà Epoch [30/30], Loss: 1.0839
‚úÖ Entrenamiento finalizado.


In [None]:
# --- Predict on the test set with the model's current weights ---
# Recomputed here so the comparison reflects the model as it is now, rather
# than a `predicted` tensor left over from an earlier cell.
with torch.no_grad():
    _, predicted = torch.max(model(X_test_tensor), 1)

# --- Map the integer classes back to their text labels ---
predicted_labels = label_encoder.inverse_transform(predicted.numpy())
real_labels = label_encoder.inverse_transform(y_test_tensor.numpy())

# --- Build the comparison DataFrame ---
results = pd.DataFrame({'Real': real_labels, 'Predicho': predicted_labels})

# --- Show 15 examples ---
print("üìã Muestra de predicciones reales vs predichas:")
print(results.sample(15, random_state=40))


üìã Muestra de predicciones reales vs predichas:
          Real  Predicho
204   Muy bien      Bien
71        Bien  Muy bien
594       Bien      Bien
672   Muy bien  Muy bien
14    Muy bien  Muy bien
64        Bien      Bien
340   Muy bien      Bien
135       Bien       Mal
350        Mal  Muy bien
976       Bien  Muy bien
181       Bien  Muy bien
538       Bien      Bien
246       Bien      Bien
982  Excelente  Muy bien
366        Mal      Bien


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix first, then per-class precision / recall / F1 using the
# label encoder's class names as row labels.
print(confusion_matrix(y_true=y_test_tensor, y_pred=predicted))
print(
    classification_report(
        y_true=y_test_tensor,
        y_pred=predicted,
        target_names=label_encoder.classes_,
    )
)


[[127   0  44 192]
 [  7   8   1  72]
 [105   0  64  62]
 [ 61   1  14 242]]
              precision    recall  f1-score   support

        Bien       0.42      0.35      0.38       363
   Excelente       0.89      0.09      0.16        88
         Mal       0.52      0.28      0.36       231
    Muy bien       0.43      0.76      0.55       318

    accuracy                           0.44      1000
   macro avg       0.56      0.37      0.36      1000
weighted avg       0.49      0.44      0.41      1000



In [None]:
# --- PREDICTION ON MADE-UP TITLES ---

# Example rows with raw (not yet standardized) values. The columns are listed
# in the same order as the feature matrix X used for training.
nuevas_pelis = pd.DataFrame({
    'type_new': [0, 1],                 # 0: movie, 1: series
    'releaseYear': [2015, 2020],        # release years
    'imdbNumVotes': [50000, 5000],      # number of votes
    'drama': [1, 0],                    # drama flag
    'comedy': [0, 1],                   # comedy flag
    'other': [0, 0]                     # other-genres flag
})

# Scale the two numeric columns with the scaler fitted earlier on the dataset
numeric_columns = ['releaseYear', 'imdbNumVotes']
nuevas_pelis[numeric_columns] = scaler.transform(nuevas_pelis[numeric_columns])

# Convert to a tensor
nuevas_tensor = torch.tensor(nuevas_pelis.values, dtype=torch.float32)

# Predict: the class is the index of the largest output score
with torch.no_grad():
    predicciones = model(nuevas_tensor)
    _, clases_predichas = torch.max(predicciones, 1)

# Decode the integer classes back to text labels
categorias_predichas = label_encoder.inverse_transform(clases_predichas.numpy())

# Show the results, numbering the titles from 1
for i, categoria in enumerate(categorias_predichas, start=1):
    print(f"Pel√≠cula {i} ‚Üí Predicci√≥n de calidad IMDb: {categoria}")


Pel√≠cula 1 ‚Üí Predicci√≥n de calidad IMDb: Bien
Pel√≠cula 2 ‚Üí Predicci√≥n de calidad IMDb: Bien
