In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ml-2024-f/train_final.csv
/kaggle/input/ml-2024-f/test_final.csv


In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data
train_data = pd.read_csv('/kaggle/input/ml-2024-f/train_final.csv')
test_data = pd.read_csv('/kaggle/input/ml-2024-f/test_final.csv')

# Replace '?' with NaN
train_data.replace('?', pd.NA, inplace=True)
test_data.replace('?', pd.NA, inplace=True)

# Separate target variable and drop it from the training data
X_train = train_data.drop(columns=['income>50K'])
y_train = train_data['income>50K']
X_test = test_data.drop(columns=['ID'])

# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(exclude=['object']).columns

# Define transformers for preprocessing
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Impute missing values in categorical columns using the mode (most frequent value)
for col in ['workclass', 'occupation', 'native.country']:
    X_train[col].fillna(X_train[col].mode()[0], inplace=True)
    X_test[col].fillna(X_test[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(X_train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(X_test[col].mode()[0], inplace=True)


In [3]:
# Define a pipeline with the preprocessor and a logistic regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression(max_iter=1000, random_state=42))])

# Split data for validation

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)



# Hyperparameter tuning

param_grid = {

    'classifier__C': [0.01, 0.1, 1, 10, 100],

    'classifier__penalty': ['l1', 'l2'],

    'classifier__solver': ['liblinear']  # 'liblinear' is suitable for small datasets

}



grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')

grid_search.fit(X_train_split, y_train_split)



# Evaluate on validation set

best_model = grid_search.best_estimator_

y_val_pred_proba = best_model.predict_proba(X_val_split)[:, 1]

val_auc = roc_auc_score(y_val_split, y_val_pred_proba)

print("Validation AUC:", val_auc)



# Final prediction on the test set

final_predictions = best_model.predict_proba(X_test)[:, 1]



# Prepare the submission file

submission = pd.DataFrame({

    'ID': test_data['ID'],

    'Prediction': final_predictions

})



# Save to CSV

submission.to_csv("submission.csv", index=False)

Validation AUC: 0.9052592480672861


In [4]:
submission

Unnamed: 0,ID,Prediction
0,1,0.146108
1,2,0.026350
2,3,0.102759
3,4,0.134195
4,5,0.032226
...,...,...
23837,23838,0.439187
23838,23839,0.046894
23839,23840,0.998730
23840,23841,0.121975


In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data
train_data = pd.read_csv('/kaggle/input/ml-2024-f/train_final.csv')
test_data = pd.read_csv('/kaggle/input/ml-2024-f/test_final.csv')

# Replace '?' with NaN
train_data.replace('?', pd.NA, inplace=True)
test_data.replace('?', pd.NA, inplace=True)

# Separate target variable and drop it from the training data
X_train = train_data.drop(columns=['income>50K'])
y_train = train_data['income>50K']
X_test = test_data.drop(columns=['ID'])

# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(exclude=['object']).columns

# Define transformers for preprocessing
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Impute missing values in categorical columns using the mode (most frequent value)
for col in ['workclass', 'occupation', 'native.country']:
    X_train[col].fillna(X_train[col].mode()[0], inplace=True)
    X_test[col].fillna(X_test[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(X_train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(X_test[col].mode()[0], inplace=True)


In [6]:
# Define and train the Random Forest Classifier
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', RandomForestClassifier(random_state=42))])

# Train the Random Forest model
rf_model.fit(X_train, y_train)

# Predict probabilities for the test data
rf_predictions = rf_model.predict_proba(X_test)[:, 1]

In [7]:
# Evaluate model performance on the training set using AUC
rf_train_predictions = rf_model.predict_proba(X_train)[:, 1]
rf_train_auc = roc_auc_score(y_train, rf_train_predictions)
print(f"Random Forest AUC on training set: {rf_train_auc:.4f}")

# Prepare the submission file
submission_rf = pd.DataFrame({
    'ID': test_data['ID'],
    'Prediction': rf_predictions
})

# Save to CSV
submission_rf.to_csv("submission_rf.csv", index=False)

# Now, let's implement Gradient Boosting Classifier
gb_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', GradientBoostingClassifier(random_state=42))])

# Train the Gradient Boosting model
gb_model.fit(X_train, y_train)

# Predict probabilities for the test data
gb_predictions = gb_model.predict_proba(X_test)[:, 1]

# Evaluate model performance on the training set using AUC
gb_train_predictions = gb_model.predict_proba(X_train)[:, 1]
gb_train_auc = roc_auc_score(y_train, gb_train_predictions)
print(f"Gradient Boosting AUC on training set: {gb_train_auc:.4f}")

# Prepare the submission file
submission_gb = pd.DataFrame({
    'ID': test_data['ID'],
    'Prediction': gb_predictions
})

# Save to CSV
submission_gb.to_csv("submission_gb.csv", index=False)


Random Forest AUC on training set: 1.0000
Gradient Boosting AUC on training set: 0.9257


In [8]:
submission_rf

Unnamed: 0,ID,Prediction
0,1,0.06
1,2,0.00
2,3,0.07
3,4,0.32
4,5,0.01
...,...,...
23837,23838,0.30
23838,23839,0.07
23839,23840,0.87
23840,23841,0.24


In [9]:
submission_gb

Unnamed: 0,ID,Prediction
0,1,0.170369
1,2,0.020414
2,3,0.152978
3,4,0.133510
4,5,0.051501
...,...,...
23837,23838,0.226249
23838,23839,0.024682
23839,23840,0.953567
23840,23841,0.088445


In [10]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled = scaler.transform(X_test[numerical_cols])

# Convert categorical features to numerical
X_train_encoded = pd.get_dummies(X_train[categorical_cols])
X_test_encoded = pd.get_dummies(X_test[categorical_cols])

# Ensure the same columns in train and test
X_train_final, X_test_final = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

# Combine scaled numerical features with encoded categorical features
X_train_final = pd.concat([pd.DataFrame(X_train_scaled), X_train_final.reset_index(drop=True)], axis=1)
X_test_final = pd.concat([pd.DataFrame(X_test_scaled), X_test_final.reset_index(drop=True)], axis=1)

# Build a simple neural network
model_nn = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_final.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification
])

model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

# Train the neural network
model_nn.fit(X_train_final, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Predict probabilities for the test data
nn_predictions = model_nn.predict(X_test_final).flatten()

# Prepare the submission file
submission_nn = pd.DataFrame({
    'ID': test_data['ID'],
    'Prediction': nn_predictions
})

# Save to CSV
submission_nn.to_csv("submission_nn.csv", index=False)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - AUC: 0.8340 - loss: 0.3998 - val_AUC: 0.9059 - val_loss: 0.3186
Epoch 2/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.9162 - loss: 0.3046 - val_AUC: 0.9081 - val_loss: 0.3163
Epoch 3/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.9186 - loss: 0.3011 - val_AUC: 0.9105 - val_loss: 0.3126
Epoch 4/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.9212 - loss: 0.2938 - val_AUC: 0.9092 - val_loss: 0.3148
Epoch 5/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.9207 - loss: 0.2920 - val_AUC: 0.9097 - val_loss: 0.3148
Epoch 6/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.9231 - loss: 0.2916 - val_AUC: 0.9106 - val_loss: 0.3141
Epoch 7/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.9271

In [11]:
submission_nn

Unnamed: 0,ID,Prediction
0,1,0.209900
1,2,0.012088
2,3,0.117564
3,4,0.359554
4,5,0.010333
...,...,...
23837,23838,0.591226
23838,23839,0.012805
23839,23840,1.000000
23840,23841,0.536981
