In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

In [8]:
# Load data
train_data = pd.read_csv("dataset/problem1/Train_Data.csv")
test_data = pd.read_csv("dataset/problem1/Test_Data.csv")


In [9]:
# Preprocess target labels
def preprocess_target(df):
    le = LabelEncoder()
    df['Sepssis'] = le.fit_transform(df['Sepssis'])
    return df

train_data = preprocess_target(train_data)

In [10]:
# Separate features and target
X = train_data.drop('Sepssis', axis=1)
y = train_data['Sepssis']

In [11]:
# Split data into 75% train and 25% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
# Define preprocessing steps
numerical_features = train_data.drop(columns=['Sepssis']).select_dtypes(include=['number']).columns.tolist()
categorical_features = train_data.drop(columns=['Sepssis']).select_dtypes(include=['object']).columns.tolist()

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [13]:
# Apply preprocessing
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [14]:
#Handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_processed, y_train)

In [24]:
# Model training
model = RandomForestClassifier(random_state=40)
model.fit(X_resampled, y_resampled)

In [25]:
# Predictions on the held out 25% test set.
y_pred = model.predict(X_test_processed)

# Predictions on the training set (75%)
y_train_pred = model.predict(X_resampled)

# Evaluate the model on the training set
f1_train = f1_score(y_resampled, y_train_pred)
print(f"F1 Score on the 75% training set: {f1_train}")

# Evaluate the model on the 25% test set
X_test_processed = preprocessor.transform(X_test)
y_test_pred = model.predict(X_test_processed)
f1_test = f1_score(y_test, y_test_pred)
print(f"F1 Score on the 25% test set: {f1_test}")


F1 Score on the 75% training set: 1.0
F1 Score on the 25% test set: 0.9963248657162473


In [27]:
#Preprocess full test data for submission.
final_predictions = model.predict(test_data)

# Create submission CSV
submission = pd.DataFrame({'Sepssis': final_predictions})
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")

Submission file created successfully.




In [28]:
submission = pd.read_csv('submission.csv')
submission['Sepssis'] = submission['Sepssis'].replace({0: 'Negative', 1: 'Positive'})
submission.to_csv('submission.csv', index=False)