In [3]:
import pandas as pd
import numpy as np
from scipy import stats
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

In [4]:
# Read the three competition files: training features, training targets
# and the test features that predictions are produced for.
x_train_df, y_train_df, x_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)

# The 'id' column is a row identifier, not a feature, so it is removed from
# both feature matrices; the target is the 'y' column of the label file.
x_train = x_train_df.drop('id', axis=1)
x_test = x_test_df.drop('id', axis=1)
y_train = y_train_df['y']

In [5]:
# Drop columns that carry no information: a single distinct non-missing value
# (constant) or no non-missing value at all (nunique == 0). The mask is
# computed on the training set only and applied to both sets so that their
# columns stay identical.
constant_columns = x_train.apply(lambda col: col.nunique(dropna=True) <= 1)
x_train = x_train.loc[:, ~constant_columns]
x_test = x_test.loc[:, ~constant_columns]

# Outlier detection
# Z-scores, computed per column while ignoring missing values. Every remaining
# column has at least two distinct values, so its standard deviation is
# non-zero and the division below is well defined.
mean = np.nanmean(x_train, axis=0)
std_dev = np.nanstd(x_train, axis=0)
z_scores = np.abs((x_train - mean) / std_dev)
threshold = 3.5
# A row is an outlier when any of its features exceeds the threshold; NaN
# z-scores compare as False and therefore never flag a row.
outliers = (z_scores > threshold).any(axis=1)
x_train_filtered = x_train[~outliers]
y_train_filtered = y_train[~outliers]

# Summing the boolean mask gives 0 (not None) when no row is flagged.
print("Number of rows removed: ", int(outliers.sum()))

Number of rows removed:  362


In [6]:
# Imputation of missing values
# The imputer is fitted on the (outlier-filtered) training rows only and then
# applied unchanged to the test set. Re-fitting on the test set would fill its
# gaps from a different neighbourhood than the one the model was trained on.
imputer = KNNImputer()
x_train_imputed = imputer.fit_transform(x_train_filtered)
x_test_imputed = imputer.transform(x_test)

# Scale the data
# Same rule for the scaler: mean and variance come from the training set, and
# the test set is only transformed, so both live in the same feature space.
scaler = StandardScaler()
x_train_scaled = pd.DataFrame(scaler.fit_transform(x_train_imputed))
x_test_scaled = pd.DataFrame(scaler.transform(x_test_imputed))

In [7]:
# Feature selection
# # Compute correlation matrix
# correlation_matrix = x_train_scaled.corrwith(y_train_filtered)
# # Select features with high correlation (e.g., absolute correlation > 0.1)
# selected_features = correlation_matrix[abs(correlation_matrix) > 0.1].index
# # Keep only the selected features in x_train
# x_train_selected = x_train_scaled[selected_features]
# x_test_selected = x_test_scaled[selected_features]

# print("Number of features selected: ", selected_features.size)

from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Select up to 100 features with the highest mutual information with the target.
# mutual_info_regression is randomised, so the seed is fixed to make the
# selected columns identical from one run to the next.
mi_score_func = partial(mutual_info_regression, random_state=42)
# Never ask for more features than are available (SelectKBest raises otherwise).
n_selected_features = min(100, x_train_scaled.shape[1])
selection = SelectKBest(mi_score_func, k=n_selected_features).fit(x_train_scaled, y_train_filtered)
x_train_selected = selection.transform(x_train_scaled)
x_test_selected = selection.transform(x_test_scaled)

In [8]:
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and the Keras backend so that weight initialisation,
# dropout masks and batch shuffling are repeatable across runs.
set_random_seed(42)

# Build the model: a small fully connected regressor with one dropout layer
# and a single linear output unit.
model = Sequential([
    Input(shape=(x_train_selected.shape[-1], )),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1),
])

# Sanity check: features and targets must have the same number of rows.
print(x_train_selected.shape, y_train_filtered.shape)

# Compile and train the model (MSE loss, MAE reported as an extra metric)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
history = model.fit(x_train_selected, y_train_filtered, epochs=30, batch_size=32)

(850, 100) (850,)
Epoch 1/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 939us/step - loss: 4678.4536 - mae: 67.7933
Epoch 2/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 789us/step - loss: 2903.1211 - mae: 52.4643
Epoch 3/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 739us/step - loss: 1370.2753 - mae: 32.2453
Epoch 4/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 697us/step - loss: 1063.5314 - mae: 27.0243
Epoch 5/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 676us/step - loss: 687.8041 - mae: 21.4129
Epoch 6/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 685us/step - loss: 484.4777 - mae: 17.4970
Epoch 7/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 742us/step - loss: 369.0187 - mae: 14.9450
Epoch 8/30
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 757us/step - loss: 324.0789 - mae: 13.8062
Epoch 9/30
[1m27/27[0m [32m━━━━

In [9]:
# Evaluate the model on the training data itself.
# NOTE: this is an in-sample score, so it is an optimistic estimate of how the
# model performs on unseen data.
y_train_pred = model.predict(x_train_selected)
r2_train = r2_score(y_train_filtered, y_train_pred)
# Print R^2 score for the training set
print(f"Training R² Score: {r2_train}")

# Use the trained model to predict the ages
age_predictions = model.predict(x_test_selected)
# Convert predictions to a pandas DataFrame.
# The ids are taken from the loaded test file rather than regenerated with
# range(), so each prediction stays attached to its real row identifier even
# if the file is not numbered 0..n-1. No test rows were dropped, so the order
# of x_test_df matches the order of the predictions.
predictions_df = pd.DataFrame({
    'id': x_test_df['id'].to_numpy(),
    'y': age_predictions.flatten()  # model.predict returns a 2D (n, 1) array
})
# Save the DataFrame to a CSV file
predictions_df.to_csv('predictions_MLP.csv', index=False)

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 898us/step
Training R² Score: 0.6842039436102283
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 397us/step


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Baseline: polynomial regression pipeline. With degree=1 the expansion only
# adds a bias column, so this is an ordinary linear regression; raise the
# degree to experiment with interaction terms.
poly_reg_model = make_pipeline(PolynomialFeatures(degree=1), LinearRegression())
poly_reg_model.fit(x_train_selected, y_train_filtered)
y_train_pred = poly_reg_model.predict(x_train_selected)

# In-sample R^2 (optimistic, no held-out data is used here)
r2_train = r2_score(y_train_filtered, y_train_pred)
print(f"Training R² Score: {r2_train}")

# Use the trained model to predict the ages
age_predictions = poly_reg_model.predict(x_test_selected)
# Convert predictions to a pandas DataFrame.
# The ids come from the loaded test file instead of range(), so predictions
# keep their real row identifiers; test rows were never dropped or reordered.
predictions_df = pd.DataFrame({
    'id': x_test_df['id'].to_numpy(),
    'y': age_predictions.flatten()  # Flatten if predictions are in a 2D array
})
# Save the DataFrame to a CSV file
predictions_df.to_csv('predictions_LR.csv', index=False)

Training R² Score: 0.5265455975403097
