In [None]:
# Bibliotecas Gerais
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Load the training and test sets from the working directory
train_data, test_data = [pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')]

# Drop the 'id' column from the training frame only; the test frame keeps
# its 'id' column because the submission cell reads test_data['id'].
train_data = train_data.drop('id', axis=1)


In [None]:

# Split an underscore-delimited file name and extract coordinate / time fields
def extract_features_from_filename(filename):
    """Extract two coordinate ids and two acquisition timestamps from a file name.

    The name is split on '_'. Names with 8 to 11 parts are supported; in every
    supported layout the fields of interest sit at fixed positions counted
    from the END of the name (7th- and 6th-from-last for the coordinates,
    3rd- and 2nd-from-last for the timestamps), so negative indexing covers
    all four layouts at once.

    Parameters
    ----------
    filename : str
        Underscore-delimited file name. Non-string values (e.g. None or NaN
        for a missing name) are treated like an unparseable name.

    Returns
    -------
    pd.Series
        Indexed by 'coordinates_id1', 'coordinates_id2',
        'date_time_acquisition_start', 'date_time_acquisition_end'.
        Coordinates are floats when the hyphen-stripped text parses as a
        number, otherwise the hyphen-stripped string. Timestamps are strings
        with every 'T' removed. All four values are None when the name is
        missing or has an unsupported number of parts.
    """
    feature_names = ['coordinates_id1', 'coordinates_id2',
                     'date_time_acquisition_start', 'date_time_acquisition_end']

    def clean_coordinate(coordinate):
        """Remove hyphens and convert to float if possible."""
        # NOTE: stripping every '-' also removes a leading minus sign.
        clean_coord = coordinate.replace('-', '')
        try:
            return float(clean_coord)
        except ValueError:
            return clean_coord

    # A missing name has no .split(); return the same empty row that is
    # used for unsupported layouts instead of raising AttributeError.
    if not isinstance(filename, str):
        return pd.Series([None] * 4, index=feature_names)

    parts = filename.split('_')

    if not 8 <= len(parts) <= 11:
        return pd.Series([None] * 4, index=feature_names)

    return pd.Series(
        [
            clean_coordinate(parts[-7]),
            clean_coordinate(parts[-6]),
            parts[-3].replace('T', ''),
            parts[-2].replace('T', ''),
        ],
        index=feature_names,
    )

# Run the extractor over every file name of the training and test frames
file_features, file_features_test = [
    frame['file_name_l1'].apply(extract_features_from_filename)
    for frame in (train_data, test_data)
]

In [None]:


# Attach the extracted file-name features to the original frames.
# NOTE: the concatenation below is currently disabled, so file_features and
# file_features_test are computed but never joined to the frames.

#train_data = pd.concat([train_data, file_features], axis=1)
#test_data = pd.concat([test_data, file_features_test], axis=1)

# Drop the raw file-name column, which is not used by the model
train_data = train_data.drop('file_name_l1', axis=1)
test_data = test_data.drop('file_name_l1', axis=1)

# Inspect the resulting frame
#print(train_data.head)

In [None]:

# Descriptive statistics of the training frame
print("Estatísticas Descritivas do Dataset COMPLETO de Treino:", train_data.describe(), sep="\n")


print("\n")
# Count missing values per column
print("Verificar Missing Data:", train_data.isna().sum(), sep="\n")

In [None]:
# Plots and visualisations

# Facet layout shared by the histogram grid and the box-plot grid:
# one panel per variable, four panels per row, independent axes.
facet_options = dict(col="variable", col_wrap=4, sharex=False, sharey=False, height=4)

# Distribution of each feature: melt to long format (variable, value)
df_analise_dist = train_data.melt()

# FacetGrid with one histogram per column of the dataset
g = sns.FacetGrid(df_analise_dist, **facet_options)
g.map(sns.histplot, "value", kde=False, color='blue', bins=30)
plt.show()


# Outlier inspection: same long format, drawn as box plots
df_analise_box_plot = train_data.melt()

# FacetGrid with one box plot per column of the dataset
g = sns.FacetGrid(df_analise_box_plot, **facet_options)
g.map(sns.boxplot, "value")
plt.show()


# Pairwise relationships (adjust the visualisations to the kind of data)
sns.pairplot(train_data)
plt.show()

In [None]:
###################################

# Feature engineering (if needed)
# Example: create a new feature 'elevation_squared'

#train_data['elevation_squared'] = train_data['elevation'] ** 2
#test_data['elevation_squared'] = test_data['elevation'] ** 2

print(train_data.head())
print(test_data.head())
###########################################


# Separate the regression target ('value_550') from the feature columns
y = train_data['value_550']
X = train_data.drop('value_550', axis=1)

# Hold out 30% of the rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=10)

# Standardise the features: statistics come from the training split only
# and are then applied unchanged to the validation split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
# Model definition with regularization, batch normalization, and dropout
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split, KFold

# Standardise the features; the scaler is fit on every row of X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

def create_model():
    """Build and compile the regression network.

    Three hidden blocks (Dense + ReLU with L2 penalty, BatchNormalization,
    Dropout) followed by a single linear output unit. The input width is
    read from the notebook-level array X_train_scaled, which must already
    exist when this function is called.

    Returns
    -------
    A compiled Sequential model (Adam, learning rate 0.0001, MSE loss,
    MAE metric).
    """
    # (units, L2 strength, dropout rate) for each hidden block, in order
    hidden_specs = [
        (1280, 0.0001, 0.5),
        (640, 0.001, 0.5),
        (320, 0.01, 0.2),
    ]

    layer_stack = []
    for position, (units, l2_strength, drop_rate) in enumerate(hidden_specs):
        # Only the first Dense layer declares the input shape
        shape_kwargs = {'input_shape': (X_train_scaled.shape[1],)} if position == 0 else {}
        layer_stack.append(
            Dense(units, activation='relu', kernel_regularizer=l2(l2_strength), **shape_kwargs)
        )
        layer_stack.append(BatchNormalization())
        layer_stack.append(Dropout(drop_rate))

    # Output layer for regression
    layer_stack.append(Dense(1))

    model = Sequential(layer_stack)

    # Compile the model with a lower learning rate
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='mean_squared_error',
        metrics=['mae'],
    )

    return model


# Define the KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Store validation results (one MAE per fold)
val_mae_scores = []

for fold_number, (train_index, val_index) in enumerate(kf.split(X), start=1):
    # kf.split yields POSITIONAL row indices, so select with .iloc;
    # plain y[...] on a Series is label-based and only works by accident
    # when the index happens to be 0..n-1.
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit the scaler on the training rows of this fold only, so the
    # validation rows do not influence the scaling statistics.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X.iloc[train_index])
    X_val = fold_scaler.transform(X.iloc[val_index])

    # Create a new model instance so no weights carry over between folds
    model = create_model()

    # Early stopping callback
    # NOTE: early stopping monitors the same validation fold that is scored
    # below, so the reported MAE is selected on the data it is measured on.
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

    # Train the model
    history = model.fit(
        X_train, y_train,
        epochs=300,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        verbose=1,
    )

    # The architecture is identical in every fold; print the summary once
    if fold_number == 1:
        model.summary()

    # Evaluate the model on the validation set
    val_loss, val_mae = model.evaluate(X_val, y_val, verbose=0)
    val_mae_scores.append(val_mae)
    print(f"Fold Validation MAE: {val_mae}")

# Calculate the mean and standard deviation of the validation MAE scores
mean_val_mae = np.mean(val_mae_scores)
std_val_mae = np.std(val_mae_scores)

print(f"\nMean Validation MAE: {mean_val_mae}")
print(f"Standard Deviation of Validation MAE: {std_val_mae}")

In [None]:
# Optionally, you can plot the training history
import matplotlib.pyplot as plt

# Plot training & validation curves side by side: loss on the left, MAE on
# the right. `history` is whatever the most recent model.fit call returned.
fig, (loss_ax, mae_ax) = plt.subplots(1, 2, figsize=(12, 4))

panels = (
    (loss_ax, 'loss', 'Model loss', 'Loss'),
    (mae_ax, 'mae', 'Model MAE', 'MAE'),
)
for ax, metric_key, panel_title, y_label in panels:
    ax.plot(history.history[metric_key])
    ax.plot(history.history['val_' + metric_key])
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Validation'], loc='upper left')

plt.show()

In [None]:
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam, SGD
import numpy as np

# Standardise the features; the scaler is fit on every row of X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Model factory handed to KerasRegressor; every argument is tunable
def create_model(optimizer='adam', 
                 kernel_regularizer_1=0.0001, 
                 kernel_regularizer_2=0.001, 
                 kernel_regularizer_3=0.01, 
                 dropout_rate_1=0.5, 
                 dropout_rate_2=0.5, 
                 dropout_rate_3=0.2):
    """Build and compile the regression network with tunable hyperparameters.

    Parameters
    ----------
    optimizer : optimizer name or instance passed straight to model.compile.
    kernel_regularizer_1, kernel_regularizer_2, kernel_regularizer_3 :
        L2 penalty of the first, second and third hidden Dense layer.
    dropout_rate_1, dropout_rate_2, dropout_rate_3 :
        Dropout rate applied after the first, second and third hidden block.

    The input width is read from the notebook-level array X_scaled, which
    must already exist when this function is called.

    Returns
    -------
    A compiled Sequential model (MSE loss, MAE metric).
    """
    hidden_units = (1280, 640, 320)
    l2_strengths = (kernel_regularizer_1, kernel_regularizer_2, kernel_regularizer_3)
    dropout_rates = (dropout_rate_1, dropout_rate_2, dropout_rate_3)

    # Only the first Dense layer declares the input shape
    first_layer_kwargs = {'input_shape': (X_scaled.shape[1],)}

    layer_stack = []
    for units, strength, rate in zip(hidden_units, l2_strengths, dropout_rates):
        layer_stack += [
            Dense(units, activation='relu', kernel_regularizer=l2(strength), **first_layer_kwargs),
            BatchNormalization(),
            Dropout(rate),
        ]
        first_layer_kwargs = {}

    # Output layer for regression
    layer_stack.append(Dense(1))

    model = Sequential(layer_stack)
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])
    return model

# Wrap the Keras model factory so scikit-learn can drive it
model = KerasRegressor(model=create_model, epochs=300, batch_size=32, verbose=0)

# Grid of create_model arguments; the 'model__' prefix routes each value
# to the factory function.
# NOTE: 2 * 3**6 = 1458 combinations, each cross-validated on 5 folds
# (7290 fits of 300 epochs) -- expect a very long run.
param_grid = {'model__optimizer': ['adam', 'sgd']}
param_grid.update({
    f'model__kernel_regularizer_{layer}': [0.0001, 0.001, 0.01]
    for layer in (1, 2, 3)
})
param_grid.update({
    'model__dropout_rate_1': [0.3, 0.5, 0.7],
    'model__dropout_rate_2': [0.3, 0.5, 0.7],
    'model__dropout_rate_3': [0.2, 0.4, 0.6],
})

# Exhaustive search scored by negative MSE with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Run the search on the standardised features
grid_result = grid_search.fit(X_scaled, y)

# Report the best configuration, then every configuration that was tried
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = [
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
]
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


In [None]:

# Make predictions on a clean test set (feature columns only, no 'id')
test_data_cleaned = test_data.drop(columns=['id'])

# The network was trained on standardised features, so the test features
# must go through the same fitted scaler. Selecting X.columns puts the test
# columns in the order the scaler was fit on.
test_features_scaled = scaler.transform(test_data_cleaned[X.columns])

# `model` is only the unfitted KerasRegressor template handed to
# GridSearchCV (the search fits clones of it); the fitted model with the
# best parameters is exposed as grid_result.best_estimator_.
predictions = grid_result.best_estimator_.predict(test_features_scaled)

# Prepare submission DataFrame: one prediction per test row, keyed by 'id'
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'value_550': np.asarray(predictions).ravel()
})

# Save to CSV
submission_df.to_csv('submission.csv', index=False)

print("Submission file created successfully.")