In [1]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('train.csv')

# Display the first 10 rows
df.head(10)


Unnamed: 0,id,date,is_weekend,steps_total,distance,sleep,app_usage,home_cluster,clusters_count,t_sport,practiced_sport,valence
0,5,2019-05-31,0,,,,-0.161625,,,,,
1,5,2019-06-01,1,,,,1.992985,,,,,
2,5,2019-06-02,1,,,,1.038469,,,,,
3,5,2019-06-03,0,,,,1.199408,,,,,
4,5,2019-06-04,0,,,,2.185056,,,,,
5,5,2019-06-05,0,,,,2.481083,,,,,
6,5,2019-06-06,0,-0.854461,,,1.157992,,,,,0.0
7,5,2019-06-07,0,-0.416468,,-1.21622,2.249959,,,-0.696035,0.0,1.0
8,5,2019-06-08,1,-0.153967,,-0.958696,1.951986,,,-0.729374,0.0,0.0
9,5,2019-06-09,1,-0.930487,,-0.755388,0.561072,,,,,


In [2]:
df_sorted = df.sort_values(by=['id', 'date'])
df_sorted


Unnamed: 0,id,date,is_weekend,steps_total,distance,sleep,app_usage,home_cluster,clusters_count,t_sport,practiced_sport,valence
0,5,2019-05-31,0,,,,-0.161625,,,,,
1,5,2019-06-01,1,,,,1.992985,,,,,
2,5,2019-06-02,1,,,,1.038469,,,,,
3,5,2019-06-03,0,,,,1.199408,,,,,
4,5,2019-06-04,0,,,,2.185056,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
110692,2689,2020-04-27,0,-0.985184,-0.377994,,,,,-0.796709,0.0,0.0
110693,2689,2020-04-28,0,-0.952450,-0.379999,,,,,-0.796709,0.0,1.0
110694,2689,2020-04-29,0,,,,,,,,,0.0
110695,2689,2020-04-30,0,-0.936189,-0.380235,,,,,-0.796709,0.0,2.0


In [3]:
num_rows_with_nan = df['valence'].isna().sum()
print(num_rows_with_nan)


100994


In [4]:
df_filtered = df.dropna(subset=['valence'])
df_filtered


Unnamed: 0,id,date,is_weekend,steps_total,distance,sleep,app_usage,home_cluster,clusters_count,t_sport,practiced_sport,valence
6,5,2019-06-06,0,-0.854461,,,1.157992,,,,,0.0
7,5,2019-06-07,0,-0.416468,,-1.216220,2.249959,,,-0.696035,0.0,1.0
8,5,2019-06-08,1,-0.153967,,-0.958696,1.951986,,,-0.729374,0.0,0.0
39,16,2019-07-15,0,-0.162837,,,0.040730,,,-0.724386,0.0,2.0
40,16,2019-07-16,0,0.248970,,-0.613253,0.366361,,,-0.476180,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
110691,2689,2020-04-26,1,,,,,,,,,2.0
110692,2689,2020-04-27,0,-0.985184,-0.377994,,,,,-0.796709,0.0,0.0
110693,2689,2020-04-28,0,-0.952450,-0.379999,,,,,-0.796709,0.0,1.0
110694,2689,2020-04-29,0,,,,,,,,,0.0


In [5]:
df_filtered['date'] = pd.to_datetime(df_filtered['date'])

df_filtered['date'].dtype == 'datetime64[ns]'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['date'] = pd.to_datetime(df_filtered['date'])


True

In [6]:

#df_filtered['month'] = df_filtered['date'].dt.month
#df_filtered['weekday'] = df_filtered['date'].dt.weekday
df = df_filtered


In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Asumimos que 'df' es tu DataFrame que ya contiene los datos como se muestra en tu imagen
# y que ya ha sido ordenado por 'id' y 'date'
sequence_length = 7  # Longitud de las secuencias de entrada
# Función para crear secuencias para cada 'id'
def create_sequences(features, valence, sequence_length=7):
    sequences = []
    output = []
    for i in range(len(features) - sequence_length + 1):
        sequences.append(features[i:(i + sequence_length)])
        output.append(valence[i + sequence_length - 1])
    return np.array(sequences), np.array(output)

# Lista para guardar las secuencias de entrada y salida
input_sequences = []
output_valence = []

# Inicialización del scaler para normalizar las características
scaler = StandardScaler()

# Loop a través de cada 'id' para crear y almacenar secuencias
for id, group in df.groupby('id'):
    group = group.sort_values('date')
    # Rellenar NaNs si es necesario
    group.fillna(method='ffill', inplace=True)
    # Asumimos que las características numéricas son todas excepto 'id', 'date', y 'valence'
    features = group.drop(['id', 'date', 'valence'], axis=1)
    # Normalizar las características
    features_scaled = scaler.fit_transform(features)
    # Recortar 'valence' para que coincida con el número de secuencias que se pueden formar
    valence = group['valence'].values
    if len(features_scaled) > sequence_length:
        feature_sequences, valence_sequences = create_sequences(features_scaled, valence, sequence_length)
        input_sequences.append(feature_sequences)
        output_valence.append(valence_sequences)

# Convertir las listas a arrays de NumPy
X = np.vstack(input_sequences)
y = np.concatenate(output_valence)
# Definir el modelo LSTM
model = Sequential()
model.add(LSTM(50, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Usamos sigmoid ya que 'valence' parece ser binario

# Compilar el modelo
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Dividir los datos para la validación cruzada con TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = []

# Validación cruzada para series temporales
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Entrenar el modelo
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

    # Evaluar el modelo
    y_pred = (model.predict(X_test) > 0.5).astype('int32')
    score = accuracy_score(y_test, y_pred)
    cv_scores.append(score)

# Calcular y mostrar la precisión promedio de la validación cruzada
mean_cv_score = np.mean(cv_scores)
print(f'Precisión promedio de la validación cruzada: {mean_cv_score}')


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Precisión promedio de la validación cruzada: 0.5280716029292107


In [7]:
df_filtered.fillna(df_filtered.mean(), inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.fillna(df_filtered.mean(), inplace=True)


In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
import joblib  # Suponiendo que usas joblib para cargar el StandardScaler



# Cargar el nuevo conjunto de datos de prueba
test_df = pd.read_csv('test.csv')
test_df['date'] = pd.to_datetime(test_df['date'])

# Rellenar valores NaN excluyendo las columnas de fecha
numeric_columns = test_df.select_dtypes(include=['number']).columns
# Asegúrate de tener df_filtered o calcular las medias directamente de test_df para las columnas numéricas
test_df[numeric_columns] = test_df[numeric_columns].fillna(test_df[numeric_columns].mean())

# Definir la longitud de la secuencia
sequence_length = 7

# Función para crear secuencias, rellenando con ceros si es necesario
def create_sequences(features, sequence_length=7):
    sequences = []
    for i in range(len(features) - sequence_length + 1):
        sequences.append(features[i:(i + sequence_length)])
    return np.array(sequences)

# Preparar las secuencias de prueba, rellenando con ceros cuando no haya suficientes datos previos
input_sequences_test = []

for _, group in test_df.groupby('id'):
    group.sort_values('date', inplace=True)
    features = group.drop(['id', 'date'], axis=1)
    features_scaled = scaler.transform(features)
    
    # Si no hay suficientes filas para formar una secuencia, rellena con ceros
    if len(features_scaled) < sequence_length:
        padded_features = np.zeros((sequence_length, features_scaled.shape[1]))
        padded_features[-len(features_scaled):] = features_scaled
        features_scaled = padded_features
    
    sequences = create_sequences(features_scaled, sequence_length)
    input_sequences_test.append(sequences)

# Convertir listas a arrays de NumPy para hacer predicciones
X_test = np.concatenate(input_sequences_test) if input_sequences_test else np.array([])

# Hacer predicciones con el modelo LSTM
predicted_valence = model.predict(X_test).squeeze() if X_test.size else []  # Asegurarse de que X_test no esté vacío
predicted_valence = (predicted_valence > 0.5).astype(int) if predicted_valence.size else []

# Añadir las predicciones al DataFrame inicial, inicializando con 0
initial_test_df = pd.read_csv('test.csv')
initial_test_df['valence'] = 0  # Inicializar todas las predicciones como 0 por defecto

# Asignar las predicciones a los índices calculados
if len(predicted_valence) > 0:
    # Asignar las predicciones al final de cada grupo 'id'
    for i, (id, group) in enumerate(test_df.groupby('id')):
        if len(group) >= sequence_length:
            # Encuentra el índice del último elemento en el grupo dentro del DataFrame original
            last_index = group.index[-1]
            # Asigna la predicción correspondiente
            initial_test_df.at[last_index, 'valence'] = predicted_valence[i]

# Guardar las predicciones si es necesario
initial_test_df.to_csv('test_with_predictions.csv', index=False)

# Mostrar el DataFrame actualizado
print(initial_test_df)


# Cargar el archivo con las predicciones hechas previamente si lo tienes
test_prediction_df = pd.read_csv('test_prediction.csv')

# Combinar los DataFrame
df_merged = pd.merge(test_prediction_df, initial_test_df[['valence', 'date', 'id']], on=['date', 'id'], how='left')

# Mostrar el DataFrame actualizado
print(df_merged)



         id        date  is_weekend  steps_total  distance     sleep  \
0        12  2019-06-11           0    -0.345088       NaN       NaN   
1        12  2019-06-12           0    -0.038872       NaN -1.292573   
2        12  2019-06-13           0    -0.149744       NaN -0.592199   
3        12  2019-06-14           0     0.276213       NaN -1.281911   
4        12  2019-06-15           1    -0.363038       NaN -0.376331   
...     ...         ...         ...          ...       ...       ...   
26511  2690  2020-04-27           0    -0.143830 -0.361784       NaN   
26512  2690  2020-04-28           0    -0.349312 -0.365361       NaN   
26513  2690  2020-04-29           0          NaN       NaN -0.633403   
26514  2690  2020-04-30           0    -0.021767 -0.355604  0.505122   
26515  2690  2020-05-01           0    -0.413089 -0.381315  0.667768   

       app_usage  home_cluster  clusters_count   t_sport  practiced_sport  \
0            NaN           NaN             NaN -0.661383  

In [21]:
# Paso 1: Renombrar las columnas
df_merged.rename(columns={'id_kaggle': 'Id', 'valence': 'Category'}, inplace=True)

# Paso 2: Convertir y ajustar la columna 'Category'
# Convertir 'Category' a entero para eliminar cualquier decimal, luego a string
df_merged['Category'] = df_merged['Category'].astype(int).astype(str)
# Añadir comillas simples alrededor de los valores de 'Category'
df_merged['Category'] = "'" + df_merged['Category'] + "'"

# Paso 3: Seleccionar solo las columnas de interés (Id y Category)
df_final = df_merged[['Id', 'Category']]

# Paso 4: Guardar el DataFrame en un archivo CSV
df_final.to_csv('output_with_quotes.csv', index=False)



