In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import load_model

# Load and preprocess training data
data = pd.read_csv('train.csv')

# Handle missing values
numerical_columns = ['a', 'per_y', 'ad', 'data_arc']
for col in numerical_columns:
    if col in data.columns:
        data[col].fillna(data[col].mean(), inplace=True)

# Handle categorical columns
categorical_columns = ['condition_code', 'neo', 'pha']
for col in categorical_columns:
    if col in data.columns:
        data[col].fillna(data[col].mode()[0], inplace=True)

# Encode categorical variables
if 'neo' in data.columns:
    data['neo'] = data['neo'].map({'Y': 1, 'N': 0})

if 'pha' in data.columns:
    data['pha'] = data['pha'].map({'Y': 1, 'N': 0})

# Drop unnecessary columns
columns_to_drop = ['name', 'condition_code', 'pha']
data.drop(columns=[col for col in columns_to_drop if col in data.columns], inplace=True)
data.dropna(inplace=True)

# Handle correlation matrix and feature selection
correlation_matrix = data.corr()
threshold = 0.9
corr_matrix = data.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
data_reduced = data.drop(columns=to_drop)

# Split data into features and target
X = data_reduced.drop(columns=['neo'])
y = data_reduced['neo']

# Split the data into training and validation sets
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_cv = scaler.transform(X_cv)

# Define and train the model
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_cv, y_cv, verbose=0)
print(f"Validation Accuracy: {accuracy:.4f}")
model.save('my_model.h5')

# Load and preprocess test data
test_data = pd.read_csv('test.csv')

# Handle missing values
for col in numerical_columns:
    if col in test_data.columns:
        test_data[col].fillna(test_data[col].mean(), inplace=True)

for col in categorical_columns:
    if col in test_data.columns:
        test_data[col].fillna(test_data[col].mode()[0], inplace=True)

# Drop unnecessary columns
test_data.drop(columns=[col for col in columns_to_drop if col in test_data.columns], inplace=True)

# Handle 'neo' column and alignment with training data
if 'neo' in test_data.columns:
    y_test = test_data['neo'].map({'Y': 1, 'N': 0})
    X_test = test_data.drop(columns=['neo'])

    # Ensure X_test has the same columns as X_train
    X_test = pd.DataFrame(X_test)  # Convert to DataFrame
    X_test = X_test.reindex(columns=X.columns, fill_value=0)  # Align with X_train columns

    # Debugging step: Print columns to verify alignment
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("X_train columns:", X.columns)
    print("X_test columns:", X_test.columns)

    # Scale features
    X_test = scaler.transform(X_test)

    # Load the trained model
    model = load_model('my_model.h5')

    # Predict and evaluate the model
    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {accuracy:.4f}")
else:
    print("The 'neo' column is not present in the test data.")


  data = pd.read_csv('train.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9157 - loss: 0.3322 - val_accuracy: 0.9796 - val_loss: 0.0564
Epoch 2/50
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9846 - loss: 0.0703 - val_accuracy: 0.9944 - val_loss: 0.0265
Epoch 3/50
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9876 - loss: 0.0413 - val_accuracy: 0.9969 - val_loss: 0.0179
Epoch 4/50
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9914 - loss: 0.0315 - val_accuracy: 0.9969 - val_loss: 0.0149
Epoch 5/50
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9931 - loss: 0.0247 - val_accuracy: 0.9969 - val_loss: 0.0137
Epoch 6/50
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9946 - loss: 0.0169 - val_accuracy: 0.9969 - val_loss: 0.0126
Epoch 7/50
[1m202/202[0m [32m━━━━━━━



Validation Accuracy: 0.9990
X_train shape: (8077, 11)
X_test shape: (82328, 11)
X_train columns: Index(['a', 'e', 'i', 'om', 'w', 'q', 'data_arc', 'H', 'diameter', 'albedo',
       'rot_per'],
      dtype='object')
X_test columns: Index(['a', 'e', 'i', 'om', 'w', 'q', 'data_arc', 'H', 'diameter', 'albedo',
       'rot_per'],
      dtype='object')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(test_data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(test_data[col].mode()[0], inplace=True)


[1m2573/2573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 634us/step
Test Accuracy: 0.9746
