In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# loading datasets
train_df = pd.read_csv('train_final.csv')
test_df = pd.read_csv('test_final.csv')

In [3]:
# Check for '?' and NaN values in each column for both train and test sets
def check_missing_values(df, name):
    print(f"\n--- {name} Data ---")
    nan_counts = df.isnull().sum()  # Counts of NaN values
    question_mark_counts = (df == '?').sum()  # Counts of '?' values

    # Combine the counts into a single DataFrame for easy comparison
    missing_value_summary = pd.DataFrame({
        'NaN Count': nan_counts,
        '? Count': question_mark_counts
    })

    print(missing_value_summary)

In [4]:
# Check missing values in train and test data - only ? should be there
check_missing_values(train_df, 'Train')
check_missing_values(test_df, 'Test')


--- Train Data ---
                NaN Count  ? Count
age                     0        0
workclass               0     1437
fnlwgt                  0        0
education               0        0
education.num           0        0
marital.status          0        0
occupation              0     1442
relationship            0        0
race                    0        0
sex                     0        0
capital.gain            0        0
capital.loss            0        0
hours.per.week          0        0
native.country          0      427
income>50K              0        0

--- Test Data ---
                NaN Count  ? Count
ID                      0        0
age                     0        0
workclass               0     1362
fnlwgt                  0        0
education               0        0
education.num           0        0
marital.status          0        0
occupation              0     1367
relationship            0        0
race                    0        0
sex             

In [5]:
# Replace '?' with NaN to handle missing values
train_df.replace('?', pd.NA, inplace=True)
test_df.replace('?', pd.NA, inplace=True)

In [6]:
# Check missing values in train and test data - only nan should be there
check_missing_values(train_df, 'Train')
check_missing_values(test_df, 'Test')


--- Train Data ---
                NaN Count  ? Count
age                     0        0
workclass            1437        0
fnlwgt                  0        0
education               0        0
education.num           0        0
marital.status          0        0
occupation           1442        0
relationship            0        0
race                    0        0
sex                     0        0
capital.gain            0        0
capital.loss            0        0
hours.per.week          0        0
native.country        427        0
income>50K              0        0

--- Test Data ---
                NaN Count  ? Count
ID                      0        0
age                     0        0
workclass            1362        0
fnlwgt                  0        0
education               0        0
education.num           0        0
marital.status          0        0
occupation           1367        0
relationship            0        0
race                    0        0
sex             

In [7]:
# Separate target variable and features
X_train = train_df.drop(columns='income>50K')  
y_train = train_df['income>50K']              
X_test = test_df.copy()                   

In [8]:
# categorical and numerical (i.e. continous) columns
numerical_features = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
categorical_features = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

In [9]:
# handling numerical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [10]:
# handling categorical columns

# Ensure all categorical columns, including 'sex', are treated as strings
X_train[categorical_features] = X_train[categorical_features].astype(str)
X_test[categorical_features] = X_test[categorical_features].astype(str)

# Count the unique values in each categorical column
unique_category_counts = X_train[categorical_features].nunique()

# Set a threshold for low vs. high cardinality
threshold = 15
low_cardinality_cols = unique_category_counts[unique_category_counts <= threshold].index.tolist()
high_cardinality_cols = unique_category_counts[unique_category_counts > threshold].index.tolist()

In [11]:
print(low_cardinality_cols)
print(high_cardinality_cols)

['workclass', 'marital.status', 'occupation', 'relationship', 'race', 'sex']
['education', 'native.country']


In [12]:
# Handle low cardinality categorical features (OneHotEncoder)
#for now both are one-hot encoded let's work on it by uncommenting the ordinal code
categorical_transformer_low = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

categorical_transformer_high = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))  # Handle unknown categories
])

# Combine the preprocessing steps into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat_low', categorical_transformer_low, low_cardinality_cols),
        ('cat_high', categorical_transformer_high, high_cardinality_cols)
    ]
)

# Apply the preprocessing pipeline to the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [13]:
# Logistic Regression Model
logreg = LogisticRegression(random_state=0)

# Training Logistic Regression
logreg.fit(X_train_transformed, y_train)

# Predicting
y_train_pred_logreg = logreg.predict_proba(X_train_transformed)[:, 1]  # Probability for class 1 (income > 50K)

# Evaluate Logistic Regression Model using Area Under ROC (AUC) curve
logreg_auc = roc_auc_score(y_train, y_train_pred_logreg)
print(f"Logistic Regression AUC: {logreg_auc:.4f}")

# Generate predictions on the test data using Logistic Regression Model
y_test_pred_logreg = logreg.predict_proba(X_test_transformed)[:, 1]

Logistic Regression AUC: 0.9075


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
if 'Id' in test_df.columns:
    test_ids = test_df['Id']
else:
    test_ids = test_df.index + 1 

# Create submission dataframe for Logistic Regression
submission_logreg = pd.DataFrame({
    'ID': test_ids,  # Use 'Id' from the test set or adjusted index if missing
    'Prediction': y_test_pred_logreg
})

submission_file_name = 'submission_logreg.csv'

# Save predictions to CSV for submission in the correct format
submission_logreg.to_csv(submission_file_name, index=False)
print(f"Logistic Regression submission saved as {submission_file_name}")

Logistic Regression submission saved as submission_logreg.csv


In [15]:
# Decision Tree Model 
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=10, min_samples_leaf=5)

# Train Decision Tree
decision_tree.fit(X_train_transformed, y_train)

# Predicting
y_train_pred_tree = decision_tree.predict_proba(X_train_transformed)[:, 1] 

# Evaluate Decision Tree Model using Area Under ROC (AUC) curve
tree_auc = roc_auc_score(y_train, y_train_pred_tree)
print(f"Decision Tree AUC: {tree_auc:.4f}")

# Generate predictions on the test data using Decision Tree Model
y_test_pred_tree = decision_tree.predict_proba(X_test_transformed)[:, 1]

Decision Tree AUC: 0.9273


In [16]:
# Ensure 'Id' column exists in test set for submission
if 'Id' in test_df.columns:
    test_ids = test_df['Id']
else:
    test_ids = test_df.index + 1  
    
# Create submission dataframe for Decision Tree
submission_tree = pd.DataFrame({
    'ID': test_ids,  # Use 'Id' from the test set or adjusted index if missing
    'Prediction': y_test_pred_tree
})

submission_file_name = 'submission_tree.csv'

# Save predictions to CSV for submission in the correct format
submission_tree.to_csv(submission_file_name, index=False)

print(f"Decision Tree submission saved as {submission_file_name}")

Decision Tree submission saved as submission_tree.csv
