In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Load datasets
def load_data(file_paths):
    """Read each CSV in *file_paths* into a DataFrame with trimmed headers.

    Args:
        file_paths: Iterable of CSV locations accepted by ``pd.read_csv``.

    Returns:
        A list of DataFrames, one per entry of ``file_paths`` and in the
        same order, whose column names have surrounding whitespace removed.
    """
    frames = []
    for csv_path in file_paths:
        frame = pd.read_csv(csv_path)
        # Headers may carry stray padding; normalise them so later lookups
        # by column name (e.g. 'ID') succeed.
        frame = frame.rename(columns=lambda name: name.strip())
        frames.append(frame)
    return frames

# Preprocess data: merge, handle missing values, encode labels, and scale
def preprocess_data(train_data, test_data):
    """Merge, impute, encode and scale the train and test tables.

    Every statistic used on the test set (imputation values, category
    encodings, scaling parameters) is learned from the training set only,
    so both sets go through the same transformation.

    Args:
        train_data: Sequence of two DataFrames sharing an 'ID' column; their
            inner merge must contain the 'Overall_Experience' target.
        test_data: Sequence of two DataFrames sharing an 'ID' column; their
            inner merge must contain every training feature column.

    Returns:
        Tuple ``(X_train, y_train, X_test)``: the scaled training features
        (array), the 'Overall_Experience' column of the merged training
        frame (Series), and the scaled test features (array) with columns
        in the same order as ``X_train``.

    Raises:
        KeyError: If the merged test frame lacks a training feature column.
        ValueError: If a categorical training column has no non-missing
            value to impute from.
    """
    id_col = 'ID'
    target_col = 'Overall_Experience'

    # Merge datasets
    train_df = pd.merge(train_data[0], train_data[1], on=id_col, how='inner')
    test_df = pd.merge(test_data[0], test_data[1], on=id_col, how='inner')

    # Split features and target up front so the identifier and the target
    # are never imputed or encoded as if they were features.
    feature_cols = [c for c in train_df.columns if c not in (id_col, target_col)]
    X_train = train_df[feature_cols].copy()
    y_train = train_df[target_col]
    # Selecting by the training column list aligns the test column order.
    X_test = test_df[feature_cols].copy()

    numeric_cols = X_train.select_dtypes(include=[np.number]).columns
    categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns

    # Handle missing values: numeric gaps get the training mean.
    if len(numeric_cols) > 0:
        train_means = X_train[numeric_cols].mean()
        X_train[numeric_cols] = X_train[numeric_cols].fillna(train_means)
        X_test[numeric_cols] = X_test[numeric_cols].fillna(train_means)

    # Categorical gaps get the training mode, then each column is label
    # encoded with its own encoder fitted on the training values.
    for col in categorical_cols:
        modes = X_train[col].mode()
        if modes.empty:
            raise ValueError(
                f"Column {col!r} has no non-missing training values to impute from."
            )
        fill_value = modes.iloc[0]

        train_col = X_train[col].fillna(fill_value)
        test_col = X_test[col].fillna(fill_value)

        encoder = LabelEncoder()
        X_train[col] = encoder.fit_transform(train_col)

        # LabelEncoder.transform raises on labels it has not seen; fall back
        # to the training mode for categories that only occur in the test set.
        test_col = test_col.where(test_col.isin(encoder.classes_), fill_value)
        X_test[col] = encoder.transform(test_col)

    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, y_train, X_test_scaled

# Train Logistic Regression model
def train_lr_model(X_train, y_train):
    """Fit a logistic regression classifier and return it.

    Args:
        X_train: Training feature matrix passed to ``fit``.
        y_train: Training labels passed to ``fit``.

    Returns:
        The fitted ``LogisticRegression`` estimator (``max_iter=1000``,
        ``random_state=42``).
    """
    classifier = LogisticRegression(random_state=42, max_iter=1000)
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return classifier.fit(X_train, y_train)

# Main function
def main(file_paths_train=None, file_paths_test=None):
    """Run the pipeline: load, preprocess, validate, train, write submission.

    The validation split is carved out *before* SMOTE resampling and before
    the evaluated model is fitted, so the printed accuracy is measured on
    real rows the model has not been trained on. The model used for the
    submission is then refitted on the full (resampled) training set.

    Args:
        file_paths_train: Optional pair of CSV paths (travel data, survey
            data) for training. Defaults to the original hard-coded paths.
        file_paths_test: Optional pair of CSV paths (travel data, survey
            data) for testing. Defaults to the original hard-coded paths.

    Side effects:
        Prints the validation accuracy and writes 'submission.csv' to the
        current working directory.
    """
    # Step 1: Load data
    if file_paths_train is None:
        file_paths_train = ['C:/Users/bigba/Downloads/mit/hackathons/Traveldata_train.csv',
                            'C:/Users/bigba/Downloads/mit/hackathons/Surveydata_train.csv']
    if file_paths_test is None:
        file_paths_test = ['C:/Users/bigba/Downloads/mit/hackathons/Traveldata_test.csv',
                           'C:/Users/bigba/Downloads/mit/hackathons/Surveydata_test.csv']

    train_data = load_data(file_paths_train)
    test_data = load_data(file_paths_test)

    # Step 2: Preprocess data
    X_train, y_train, X_test = preprocess_data(train_data, test_data)

    # Step 3: Hold out a validation set before any resampling, so synthetic
    # SMOTE samples derived from validation rows cannot leak into training.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Step 4: Handle class imbalance using SMOTE on the fitting split only,
    # then train the model that will be evaluated.
    X_fit_resampled, y_fit_resampled = SMOTE(
        sampling_strategy=1.0, random_state=42
    ).fit_resample(X_fit, y_fit)
    eval_model = train_lr_model(X_fit_resampled, y_fit_resampled)

    # Step 5: Evaluate the model on the untouched validation rows
    y_val_pred = eval_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Logistic Regression Validation Accuracy: {val_accuracy:.4f}")

    if val_accuracy >= 0.97:
        print("Model meets the desired accuracy!")
    else:
        print("Model does not meet the desired accuracy.")

    # Step 6: Refit on all training rows (resampled) for the final model
    X_train_resampled, y_train_resampled = SMOTE(
        sampling_strategy=1.0, random_state=42
    ).fit_resample(X_train, y_train)
    lr_model = train_lr_model(X_train_resampled, y_train_resampled)

    # Step 7: Make predictions on test data and create submission file.
    # preprocess_data inner-merges the two test tables on 'ID', so take the
    # IDs from the same merge; using test_data[0]['ID'] directly breaks as
    # soon as the merge drops a row.
    y_pred = lr_model.predict(X_test)
    test_ids = pd.merge(
        test_data[0][['ID']], test_data[1][['ID']], on='ID', how='inner'
    )['ID']
    submission = pd.DataFrame({'ID': test_ids.to_numpy(), 'Overall_Experience': y_pred})
    submission.to_csv('submission.csv', index=False)
    print("Submission file created successfully!")

# Run the pipeline only when executed as a script or notebook cell, so that
# importing this module does not trigger file I/O and model training.
if __name__ == "__main__":
    main()


Fitting 3 folds for each of 16 candidates, totalling 48 fits


BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.