In [27]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split

# Cargar y preparar el DataFrame
df = pd.read_csv('train.csv')
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by=['id', 'date'], inplace=True)
df.dropna(subset=['valence'], inplace=True)

# Función para crear lags
def create_lags(df, n_lags):
    df_lagged = df.copy()
    for lag in range(1, n_lags + 1):
        df_lagged[f'valence_lag_{lag}'] = df_lagged.groupby('id')['valence'].shift(lag)
    df_lagged.dropna(inplace=True)
    return df_lagged

# Aplicar la creación de lags
n_lags = 7
df_lagged = create_lags(df, n_lags)

# Excluir columnas no numéricas y la columna objetivo de las características
X = df_lagged.drop(['id', 'date', 'valence'] + [f'valence_lag_{i}' for i in range(1, n_lags)], axis=1)
y = df_lagged['valence'].values

# Escalar las características
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Reshape de X para LSTM
X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

# Dividir los datos
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Definir el modelo
model = Sequential([
    LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(1)
])
model.compile(optimizer='adam', loss='mean_squared_error')

# Entrenar el modelo
model.fit(X_train, y_train, epochs=120, batch_size=64, validation_data=(X_test, y_test), verbose=2)




Epoch 1/120
4/4 - 1s - loss: 1.9494 - val_loss: 1.3329
Epoch 2/120
4/4 - 0s - loss: 1.8336 - val_loss: 1.2554
Epoch 3/120
4/4 - 0s - loss: 1.7243 - val_loss: 1.1830
Epoch 4/120
4/4 - 0s - loss: 1.6231 - val_loss: 1.1150
Epoch 5/120
4/4 - 0s - loss: 1.5221 - val_loss: 1.0530
Epoch 6/120
4/4 - 0s - loss: 1.4311 - val_loss: 0.9970
Epoch 7/120
4/4 - 0s - loss: 1.3501 - val_loss: 0.9476
Epoch 8/120
4/4 - 0s - loss: 1.2698 - val_loss: 0.9047
Epoch 9/120
4/4 - 0s - loss: 1.1970 - val_loss: 0.8671
Epoch 10/120
4/4 - 0s - loss: 1.1297 - val_loss: 0.8351
Epoch 11/120
4/4 - 0s - loss: 1.0694 - val_loss: 0.8099
Epoch 12/120
4/4 - 0s - loss: 1.0136 - val_loss: 0.7910
Epoch 13/120
4/4 - 0s - loss: 0.9656 - val_loss: 0.7788
Epoch 14/120
4/4 - 0s - loss: 0.9249 - val_loss: 0.7731
Epoch 15/120
4/4 - 0s - loss: 0.8922 - val_loss: 0.7727
Epoch 16/120
4/4 - 0s - loss: 0.8684 - val_loss: 0.7762
Epoch 17/120
4/4 - 0s - loss: 0.8494 - val_loss: 0.7800
Epoch 18/120
4/4 - 0s - loss: 0.8369 - val_loss: 0.7850
E

<keras.callbacks.History at 0x17a911ad5b0>

In [32]:
# Cargar y preparar el conjunto de prueba
test_df = pd.read_csv('test.csv')
test_df['date'] = pd.to_datetime(test_df['date'])
test_df.sort_values(by=['id', 'date'], inplace=True)
test_df['valence'] = 0

# Asumiendo que el conjunto de prueba ya ha sido transformado adecuadamente y tiene lags aplicados
n_lags = 7
df_lagged = create_lags(test_df, n_lags)
# Aplicar el mismo procesamiento que al conjunto de entrenamiento
X_test = test_df.drop(['id', 'date'], axis=1)
X_test_scaled = scaler.transform(X_test)
X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Realizar predicciones
predictions = model.predict(X_test_scaled)

# Añadir las predicciones al DataFrame de prueba
test_df['predicted_valence'] = predictions

# Guardar o visualizar las predicciones
test_df.to_csv('test_with_predictions.csv', index=False)
print(test_df[['id', 'date', 'predicted_valence']].head())


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- valence
Feature names seen at fit time, yet now missing:
- valence_lag_7


In [None]:
# Paso 1: Renombrar las columnas
df_merged.rename(columns={'id_kaggle': 'Id', 'valence': 'Category'}, inplace=True)

# Paso 2: Convertir y ajustar la columna 'Category'
# Convertir 'Category' a entero para eliminar cualquier decimal, luego a string
df_merged['Category'] = df_merged['Category'].astype(int).astype(str)
# Añadir comillas simples alrededor de los valores de 'Category'
df_merged['Category'] = "'" + df_merged['Category'] + "'"

# Paso 3: Seleccionar solo las columnas de interés (Id y Category)
df_final = df_merged[['Id', 'Category']]

# Paso 4: Guardar el DataFrame en un archivo CSV
df_final.to_csv('output_with_quotes.csv', index=False)
