## Attempt 1: Pre-processing the data (parsing dates, age info converted into days, encoding categorical features, handling missing values using median for numeric and most common for categorical). Ensemble method that combines RandomForest, XGBoost, and MLP. Hyperparameter tuning using CV. Evaluation metrics: accuracy and log loss.

test score: .39


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, log_loss
from xgboost import XGBClassifier
import scipy.stats as st

# === Preprocessing Function ===
def preprocess_data(df, is_train=True):
    df = df.copy()

    # Parse datetime columns
    df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
    df['Intake Year'] = df['Intake Time'].dt.year
    df['Intake Month'] = df['Intake Time'].dt.month
    df['Intake Day'] = df['Intake Time'].dt.day
    df['Intake Weekday'] = df['Intake Time'].dt.weekday
    df['Intake Hour'] = df['Intake Time'].dt.hour

    if is_train and 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df.get('Outcome Time'), errors='coerce')
        df['Length of Stay'] = (df['Outcome Time'] - df['Intake Time']).dt.total_seconds() / (24 * 3600)

    # Function to convert age strings to days
    def parse_age(age_str):
        if pd.isna(age_str):
            return np.nan
        try:
            num = int(''.join(filter(str.isdigit, str(age_str))))
            unit = ''.join(filter(str.isalpha, str(age_str).lower()))
            if 'year' in unit:
                return num * 365
            elif 'month' in unit:
                return num * 30
            elif 'week' in unit:
                return num * 7
            elif 'day' in unit:
                return num
            return np.nan
        except:
            return np.nan

    df['Age in Days'] = df.get('Age upon Intake', '').apply(parse_age)

    if 'Date of Birth' in df.columns:
        df['Date of Birth'] = pd.to_datetime(df.get('Date of Birth'), errors='coerce')
        df['Age from DOB'] = (df['Intake Time'] - df['Date of Birth']).dt.days
        df['Age in Days'] = df['Age in Days'].fillna(df['Age from DOB'])

    df['Is Mix'] = df.get('Breed', '').str.contains('Mix', case=False, na=False).astype(int)
    df['Primary Color'] = df.get('Color', '').str.split('/').str[0]
    df['Secondary Color'] = df.get('Color', '').str.split('/').str[1].fillna('None')
    df['Color Count'] = df.get('Color', '').str.count('/') + 1
    df['Sex'] = df.get('Sex upon Intake', '').str.extract('([A-Za-z]+)', expand=False)
    df['Neutered'] = df.get('Sex upon Intake', '').str.contains('Neutered|Spayed', case=False, na=False).astype(int)
    df['Has Name'] = df.get('Name').notna().astype(int) if 'Name' in df.columns else 0
    df['In Austin'] = df.get('Found Location', '').str.contains('Austin', case=False, na=False).astype(int)

    # Replace rare categories in these columns
    for col in ['Intake Condition', 'Animal Type']:
        if col in df.columns:
            counts = df[col].value_counts()
            rare = counts[counts < 5].index
            df[col] = df[col].replace(rare, 'Other')

    outcome = df['Outcome Type'] if is_train and 'Outcome Type' in df.columns else None

    features = [
        'Intake Type', 'Intake Condition', 'Animal Type', 'Sex', 'Neutered',
        'Age in Days', 'Is Mix', 'Has Name', 'In Austin',
        'Intake Year', 'Intake Month', 'Intake Day', 'Intake Weekday', 'Intake Hour',
        'Primary Color', 'Secondary Color', 'Color Count'
    ]
    if is_train and 'Length of Stay' in df.columns:
        features.append('Length of Stay')

    df = df[[col for col in features if col in df.columns]]
    df.dropna(axis=1, how='all', inplace=True)

    return (df, outcome) if is_train else df

# === Load Data ===
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# === Preprocess Data ===
X, y = preprocess_data(train_df, is_train=True)
X_test = preprocess_data(test_df, is_train=False)

# Remove 'Length of Stay' from training if missing in test set
if 'Length of Stay' in X.columns and 'Length of Stay' not in X_test.columns:
    X = X.drop(columns=['Length of Stay'])
X_test = X_test.loc[:, X.columns]

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Extra safety: drop all-NaN columns
X_train.dropna(axis=1, how='all', inplace=True)
X_val.dropna(axis=1, how='all', inplace=True)
X_test.dropna(axis=1, how='all', inplace=True)

# === Define Feature Lists for Preprocessing ===
numeric_features = [
    'Age in Days', 'Intake Year', 'Intake Month', 'Intake Day',
    'Intake Weekday', 'Intake Hour', 'Color Count'
]
categorical_features = [
    'Intake Type', 'Intake Condition', 'Animal Type', 'Sex',
    'Primary Color', 'Secondary Color'
]
if 'Length of Stay' in X_train.columns:
    numeric_features.append('Length of Stay')

numeric_features = [f for f in numeric_features if f in X_train.columns]
categorical_features = [f for f in categorical_features if f in X_train.columns]

# === Build Preprocessing Pipeline ===
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_features)
])

# === Build Combined VotingClassifier Pipeline ===
# This VotingClassifier combines three models: RandomForest, XGBoost, and MLP.
voting_clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(random_state=42, n_jobs=-1)),
            ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_jobs=-1)),
            ('mlp', MLPClassifier(random_state=42))
        ],
        voting='soft', n_jobs=-1
    ))
])

# === Define a Reduced Hyperparameter Space for Faster Tuning ===
param_grid = {
    'classifier__rf__n_estimators': [100, 150],
    'classifier__rf__max_depth': [12],
    'classifier__xgb__n_estimators': [100, 150],
    'classifier__xgb__max_depth': [4],
    'classifier__xgb__learning_rate': [0.1],
    'classifier__mlp__hidden_layer_sizes': [(100,), (100, 50)],
    'classifier__mlp__alpha': [0.0001],
    'classifier__mlp__activation': ['relu'],
    'classifier__mlp__max_iter': [300]
}

# Use RandomizedSearchCV for a faster search (20 iterations)
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(voting_clf, param_distributions=param_grid,
                                   n_iter=20, cv=3, scoring='accuracy',
                                   n_jobs=-1, verbose=1, random_state=42)
print("Tuning the Combined VotingClassifier (RandomizedSearchCV)...")
random_search.fit(X_train, y_train)

# === Evaluate on the Validation Set ===
best_model = random_search.best_estimator_
val_pred = best_model.predict(X_val)
val_proba = best_model.predict_proba(X_val)

print("\n--- Validation Results ---")
print("Best Parameters:", random_search.best_params_)
print(f"Validation Accuracy: {accuracy_score(y_val, val_pred):.4f}")
print(f"Validation Log Loss: {log_loss(y_val, val_proba):.4f}")

# === Retrain the Best Model on Full Training Data ===
best_model.fit(X, y_encoded)
test_pred = best_model.predict(X_test)

# === Save Final Submission ===
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Outcome Type': le.inverse_transform(test_pred)
})
submission.to_csv('submission_combined_optimized.csv', index=False)
print("\n📄 Submission file saved as 'submission_combined_optimized.csv'")
print(submission.head())


  df['Outcome Time'] = pd.to_datetime(df.get('Outcome Time'), errors='coerce')
  df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
  df['Date of Birth'] = pd.to_datetime(df.get('Date of Birth'), errors='coerce')


Tuning the Combined VotingClassifier (RandomizedSearchCV)...
Fitting 2 folds for each of 5 candidates, totalling 10 fits

--- Validation Results ---
Best Parameters: {'classifier__xgb__n_estimators': 150, 'classifier__xgb__max_depth': 4, 'classifier__xgb__learning_rate': 0.1, 'classifier__rf__n_estimators': 150, 'classifier__rf__max_depth': 12, 'classifier__mlp__max_iter': 300, 'classifier__mlp__hidden_layer_sizes': (100, 50), 'classifier__mlp__alpha': 0.0001, 'classifier__mlp__activation': 'relu'}
Validation Accuracy: 0.6376
Validation Log Loss: 0.8766





📄 Submission file saved as 'submission_optimized.csv'
   Id     Outcome Type
0   1  Return to Owner
1   2         Transfer
2   3  Return to Owner
3   4         Adoption
4   5         Transfer


# Attempt 2
Pre-processing the data (parsing dates, age info converted into days, encoding categorical features, handling missing values using median for numeric and most common for categorical).
Ensmeble method with Stacking Classifier using Logistic Regression as base model, Random Forest Classifier, Gradient Boosting Classifier.
Hyperparameter tuning using CV. Evaluation Metric: Accuracy.

test score: .35

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# ==============================================================================
# 1. Data Preprocessing
# ==============================================================================
def preprocess_data(df, is_train=True):
    df = df.copy()

    # Parse datetime columns
    df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
    df['Intake Year'] = df['Intake Time'].dt.year
    df['Intake Month'] = df['Intake Time'].dt.month
    df['Intake Day'] = df['Intake Time'].dt.day
    df['Intake Weekday'] = df['Intake Time'].dt.weekday
    df['Intake Hour'] = df['Intake Time'].dt.hour

    if is_train and 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df.get('Outcome Time'), errors='coerce')
        df['Length of Stay'] = (df['Outcome Time'] - df['Intake Time']).dt.total_seconds() / (24 * 3600)

    # Convert "Age upon Intake" to days
    def parse_age(age_str):
        if pd.isna(age_str):
            return np.nan
        try:
            num = int(''.join(filter(str.isdigit, str(age_str))))
            unit = ''.join(filter(str.isalpha, str(age_str).lower()))
            if 'year' in unit:
                return num * 365
            elif 'month' in unit:
                return num * 30
            elif 'week' in unit:
                return num * 7
            elif 'day' in unit:
                return num
            return np.nan
        except:
            return np.nan

    df['Age in Days'] = df.get('Age upon Intake', '').apply(parse_age)

    # If Date of Birth exists, use it to fill missing Age
    if 'Date of Birth' in df.columns:
        df['Date of Birth'] = pd.to_datetime(df.get('Date of Birth'), errors='coerce')
        df['Age from DOB'] = (df['Intake Time'] - df['Date of Birth']).dt.days
        df['Age in Days'] = df['Age in Days'].fillna(df['Age from DOB'])

    # Additional feature engineering
    df['Is Mix'] = df.get('Breed', '').str.contains('Mix', case=False, na=False).astype(int)
    df['Primary Color'] = df.get('Color', '').str.split('/').str[0]
    df['Secondary Color'] = df.get('Color', '').str.split('/').str[1].fillna('None')
    df['Color Count'] = df.get('Color', '').str.count('/') + 1
    df['Sex'] = df.get('Sex upon Intake', '').str.extract('([A-Za-z]+)', expand=False)
    df['Neutered'] = df.get('Sex upon Intake', '').str.contains('Neutered|Spayed', case=False, na=False).astype(int)
    df['Has Name'] = df.get('Name').notna().astype(int) if 'Name' in df.columns else 0
    df['In Austin'] = df.get('Found Location', '').str.contains('Austin', case=False, na=False).astype(int)

    # Replace rare categories in Intake Condition and Animal Type
    for col in ['Intake Condition', 'Animal Type']:
        if col in df.columns:
            counts = df[col].value_counts()
            rare = counts[counts < 5].index
            df[col] = df[col].replace(rare, 'Other')

    outcome = df['Outcome Type'] if (is_train and 'Outcome Type' in df.columns) else None

    # Feature set selection
    features = [
        'Intake Type', 'Intake Condition', 'Animal Type', 'Sex', 'Neutered',
        'Age in Days', 'Is Mix', 'Has Name', 'In Austin',
        'Intake Year', 'Intake Month', 'Intake Day', 'Intake Weekday', 'Intake Hour',
        'Primary Color', 'Secondary Color', 'Color Count'
    ]
    if is_train and 'Length of Stay' in df.columns:
        features.append('Length of Stay')

    df = df[[col for col in features if col in df.columns]]
    df.dropna(axis=1, how='all', inplace=True)

    return (df, outcome) if is_train else df

# ==============================================================================
# 2. Load the Data
# ==============================================================================
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preprocess training and testing data
X, y = preprocess_data(train_df, is_train=True)
X_test = preprocess_data(test_df, is_train=False)

# Remove 'Length of Stay' if missing in test set
if 'Length of Stay' in X.columns and 'Length of Stay' not in X_test.columns:
    X = X.drop(columns=['Length of Stay'])
X_test = X_test.reindex(columns=X.columns, fill_value=np.nan)

# ==============================================================================
# 3. Encode the Target Variable
# ==============================================================================
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ==============================================================================
# 4. Split Data into Training and Validation Sets
# ==============================================================================
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# ==============================================================================
# 5. Build Preprocessing Pipeline for Numeric and Categorical Variables
# ==============================================================================
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Identify numeric and categorical features
numeric_features = [col for col in X_train.columns if X_train[col].dtype in [np.int64, np.float64]]
categorical_features = [col for col in X_train.columns if col not in numeric_features]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# ==============================================================================
# 6. Build the Stacking Classifier Model
# ==============================================================================
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier

# Base models for StackingClassifier
logreg = LogisticRegression(max_iter=500, random_state=42, solver='liblinear')
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Meta-model for stacking
meta_model = LogisticRegression(random_state=42)

# Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[('logreg', logreg), ('rf', rf), ('gb', gb)],
    final_estimator=meta_model,
    cv=3
)

# Pipeline with preprocessing and stacking
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', stacking_clf)
])

# ==============================================================================
# 7. Hyperparameter Tuning with RandomizedSearchCV
# ==============================================================================
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'clf__logreg__C': [0.01, 0.1, 1],
    'clf__rf__n_estimators': [100, 200],
    'clf__rf__max_depth': [5, 10],
    'clf__gb__n_estimators': [100, 200],
    'clf__gb__learning_rate': [0.05, 0.1]
}

random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=10, cv=3, n_jobs=-1, verbose=1)

print("Tuning the StackingClassifier ensemble...")
random_search.fit(X_train, y_train)

print("\nBest Parameters:", random_search.best_params_)
print(f"Best CV Balanced Accuracy: {random_search.best_score_:.4f}")

# ==============================================================================
# 8. Evaluate on the Validation Set
# ==============================================================================
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val)
val_bal_acc = accuracy_score(y_val, y_val_pred)
print(f"\nValidation Balanced Accuracy: {val_bal_acc:.4f}")

# ==============================================================================
# 9. Retrain on Full Training Data and Predict Test Set Outcomes
# ==============================================================================
best_model.fit(X, y_encoded)
test_preds = best_model.predict(X_test)
test_preds_labels = le.inverse_transform(test_preds)

# ==============================================================================
# 10. Prepare Submission File
# ==============================================================================
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Outcome Type': test_preds_labels
})
submission.to_csv('submission_stacking_ensemble.csv', index=False)
print("\n📄 Submission file saved as 'submission_stacking_ensemble.csv'")
print(submission.head())


# Attempt 3:
Pre-processing the data (parsing dates, converting age information into days, encoding categorical features, and handling missing values using median for numeric and most common for categorical).
Ensemble method using VotingClassifier with Random Forest Classifier, XGBoost, and MLPClassifier.
Weighted voting applied to base models with adjusted weights based on validation results.
Hyperparameter tuning using RandomizedSearchCV with 10 iterations and 3-fold cross-validation.
Evaluation metric: Accuracy and Log Loss.

third- 0.40232

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, log_loss
from xgboost import XGBClassifier
from joblib import Memory

# Set up caching for the preprocessing step
memory = Memory(location='cachedir', verbose=0)

@memory.cache
def preprocess_data(df, is_train=True):
    df = df.copy()

    # Parse datetime columns (vectorized)
    df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
    df['Intake Year'] = df['Intake Time'].dt.year
    df['Intake Month'] = df['Intake Time'].dt.month
    df['Intake Day'] = df['Intake Time'].dt.day
    df['Intake Weekday'] = df['Intake Time'].dt.weekday
    df['Intake Hour'] = df['Intake Time'].dt.hour

    if is_train and 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df.get('Outcome Time'), errors='coerce')
        df['Length of Stay'] = (df['Outcome Time'] - df['Intake Time']).dt.total_seconds() / (24 * 3600)

    # Vectorized age parsing using regex extraction
    age_extracted = df['Age upon Intake'].astype(str).str.extract(r'(?P<num>\d+)\s*(?P<unit>\w+)', expand=True)
    def convert_age(row):
        try:
            num = int(row['num'])
        except:
            return np.nan
        unit = row['unit'].lower() if pd.notna(row['unit']) else ''
        if 'year' in unit:
            return num * 365
        elif 'month' in unit:
            return num * 30
        elif 'week' in unit:
            return num * 7
        elif 'day' in unit:
            return num
        else:
            return np.nan
    df['Age in Days'] = age_extracted.apply(convert_age, axis=1)

    if 'Date of Birth' in df.columns:
        df['Date of Birth'] = pd.to_datetime(df.get('Date of Birth'), errors='coerce')
        df['Age from DOB'] = (df['Intake Time'] - df['Date of Birth']).dt.days
        df['Age in Days'] = df['Age in Days'].fillna(df['Age from DOB'])

    # Feature engineering for categorical variables
    df['Is Mix'] = df.get('Breed', '').str.contains('Mix', case=False, na=False).astype(int)
    df['Primary Color'] = df.get('Color', '').str.split('/').str[0]
    df['Secondary Color'] = df.get('Color', '').str.split('/').str[1].fillna('None')
    df['Color Count'] = df.get('Color', '').str.count('/') + 1
    df['Sex'] = df.get('Sex upon Intake', '').str.extract(r'([A-Za-z]+)', expand=False)
    df['Neutered'] = df.get('Sex upon Intake', '').str.contains('Neutered|Spayed', case=False, na=False).astype(int)
    df['Has Name'] = df.get('Name').notna().astype(int) if 'Name' in df.columns else 0
    df['In Austin'] = df.get('Found Location', '').str.contains('Austin', case=False, na=False).astype(int)

    # Replace rare categories
    for col in ['Intake Condition', 'Animal Type']:
        if col in df.columns:
            counts = df[col].value_counts()
            rare = counts[counts < 5].index
            df[col] = df[col].replace(rare, 'Other')

    outcome = df['Outcome Type'] if is_train and 'Outcome Type' in df.columns else None

    features = [
        'Intake Type', 'Intake Condition', 'Animal Type', 'Sex', 'Neutered',
        'Age in Days', 'Is Mix', 'Has Name', 'In Austin',
        'Intake Year', 'Intake Month', 'Intake Day', 'Intake Weekday', 'Intake Hour',
        'Primary Color', 'Secondary Color', 'Color Count'
    ]
    if is_train and 'Length of Stay' in df.columns:
        features.append('Length of Stay')

    df = df[[col for col in features if col in df.columns]]
    df.dropna(axis=1, how='all', inplace=True)

    return (df, outcome) if is_train else df

# === Load Data ===
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# === Preprocess (cached) ===
X, y = preprocess_data(train_df, is_train=True)
X_test = preprocess_data(test_df, is_train=False)

# Remove 'Length of Stay' if not available in test set, then align columns
if 'Length of Stay' in X.columns and 'Length of Stay' not in X_test.columns:
    X = X.drop(columns=['Length of Stay'])
X_test = X_test.loc[:, X.columns]

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Drop all-NaN columns (safety check)
X_train.dropna(axis=1, how='all', inplace=True)
X_val.dropna(axis=1, how='all', inplace=True)
X_test.dropna(axis=1, how='all', inplace=True)

# === Define Feature Lists for Preprocessing ===
numeric_features = [
    'Age in Days', 'Intake Year', 'Intake Month', 'Intake Day',
    'Intake Weekday', 'Intake Hour', 'Color Count'
]
categorical_features = [
    'Intake Type', 'Intake Condition', 'Animal Type', 'Sex',
    'Primary Color', 'Secondary Color'
]
if 'Length of Stay' in X_train.columns:
    numeric_features.append('Length of Stay')

numeric_features = [f for f in numeric_features if f in X_train.columns]
categorical_features = [f for f in categorical_features if f in X_train.columns]

# === Build Preprocessing Pipeline ===
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_features)
])

# === Build Combined VotingClassifier Pipeline with Weighted Voting ===
voting_clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(random_state=42, n_jobs=-1)),
            ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_jobs=-1)),
            ('mlp', MLPClassifier(random_state=42, early_stopping=True))
        ],
        voting='soft',
        weights=[2, 3, 1],  # Adjust weights based on validation results
        n_jobs=-1
    ))
])

# === Define an Expanded but Reasonable Hyperparameter Space ===
param_grid = {
    'classifier__rf__n_estimators': [100, 150, 200],
    'classifier__rf__max_depth': [12, 15],
    'classifier__xgb__n_estimators': [100, 150, 200],
    'classifier__xgb__max_depth': [4, 6],
    'classifier__xgb__learning_rate': [0.1, 0.01],
    'classifier__mlp__hidden_layer_sizes': [(100,), (100, 50), (150, 100)],
    'classifier__mlp__alpha': [0.0001, 0.001],
    'classifier__mlp__activation': ['relu', 'tanh'],
    'classifier__mlp__max_iter': [300]
}

# Use RandomizedSearchCV with n_iter=10 and 3-fold CV for improved tuning
random_search = RandomizedSearchCV(voting_clf, param_distributions=param_grid,
                                   n_iter=10, cv=3, scoring='accuracy',
                                   n_jobs=-1, verbose=1, random_state=42)
print("Tuning the Combined VotingClassifier with Weighted Voting (RandomizedSearchCV)...")
random_search.fit(X_train, y_train)

# === Evaluate on Validation Set ===
best_model = random_search.best_estimator_
val_pred = best_model.predict(X_val)
val_proba = best_model.predict_proba(X_val)
print("\n--- Validation Results ---")
print("Best Parameters:", random_search.best_params_)
print(f"Validation Accuracy: {accuracy_score(y_val, val_pred):.4f}")
print(f"Validation Log Loss: {log_loss(y_val, val_proba):.4f}")

# === Retrain the Best Model on Full Data and Predict on Test Set ===
best_model.fit(X, y_encoded)
test_pred = best_model.predict(X_test)

# === Save Final Submission ===
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Outcome Type': le.inverse_transform(test_pred)
})
submission.to_csv('third.csv', index=False)
print("\n📄 Submission file saved as 'third.csv'")
print(submission.head())

  df['Outcome Time'] = pd.to_datetime(df.get('Outcome Time'), errors='coerce')
  df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
  df['Date of Birth'] = pd.to_datetime(df.get('Date of Birth'), errors='coerce')


Tuning the Combined VotingClassifier with Weighted Voting (RandomizedSearchCV)...
Fitting 3 folds for each of 10 candidates, totalling 30 fits

--- Validation Results ---
Best Parameters: {'classifier__xgb__n_estimators': 200, 'classifier__xgb__max_depth': 6, 'classifier__xgb__learning_rate': 0.1, 'classifier__rf__n_estimators': 150, 'classifier__rf__max_depth': 15, 'classifier__mlp__max_iter': 300, 'classifier__mlp__hidden_layer_sizes': (100,), 'classifier__mlp__alpha': 0.0001, 'classifier__mlp__activation': 'tanh'}
Validation Accuracy: 0.6453
Validation Log Loss: 0.8568





📄 Submission file saved as 'third.csv'
   Id     Outcome Type
0   1  Return to Owner
1   2         Transfer
2   3  Return to Owner
3   4         Adoption
4   5       Euthanasia


# Attempt 4:
Feature engineering with flags (e.g., IsMix, HasName, Neutered) and color features (PrimaryCol, SecondaryCol). Rare category replacement for Intake Condition and Animal Type. Early stopping for better model optimization based on the validation set.


In [None]:
# If in Colab, install:
# !pip install xgboost

import pandas as pd
import numpy as np

# 1. Feature engineering with guards
def feature_engineer(df, is_train=True):
    df = df.copy()
    # Intake datetime
    df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
    dt = df['Intake Time'].dt
    df['Year'], df['Month'], df['Day'] = dt.year, dt.month, dt.day
    df['Weekday'], df['Hour'] = dt.weekday, dt.hour

    # StayDays
    if is_train and 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df['Outcome Time'], errors='coerce')
        df['StayDays'] = (df['Outcome Time'] - df['Intake Time'])\
                            .dt.total_seconds()/(24*3600)

    # AgeDays
    age_ex = df.get('Age upon Intake','').astype(str)\
                .str.extract(r'(\d+)\s*(\w+)', expand=True)
    def to_days(n,u):
        try: n = int(n)
        except: return np.nan
        u = (u or '').lower()
        if 'year' in u:  return n*365
        if 'month' in u: return n*30
        if 'week' in u:  return n*7
        if 'day' in u:   return n
        return np.nan
    df['AgeDays'] = age_ex.apply(lambda r: to_days(r[0], r[1]), axis=1)

    # Flags
    df['IsMix'] = df.get('Breed','')\
                     .str.contains('Mix', na=False).astype(int)
    sx = df.get('Sex upon Intake','')
    df['SexOnly']  = sx.str.extract(r'([A-Za-z]+)', expand=False).fillna('Unknown')
    df['Neutered']= sx.str.contains('Neutered|Spayed', na=False).astype(int)
    df['HasName']= df['Name'].notna().astype(int) if 'Name' in df.columns else 0
    df['InAustin']= df.get('Found Location','')\
                       .str.contains('Austin', na=False).astype(int)

    # Color features
    col = df.get('Color','').fillna('')
    df['NumCols']      = col.str.count('/')+1
    df['PrimaryCol']   = col.str.split('/').str[0].replace('', 'None')
    df['SecondaryCol'] = col.str.split('/').str[1].fillna('None')

    # Rare→Other
    for c in ['Intake Condition','Animal Type']:
        if c in df.columns:
            vc = df[c].value_counts()
            rare = vc[vc<5].index
            df[c] = df[c].replace(rare,'Other')

    # Target
    y = df['Outcome Type'] if is_train and 'Outcome Type' in df.columns else None

    # Keep list
    keep = [
        'Intake Type','Intake Condition','Animal Type','SexOnly',
        'AgeDays','IsMix','HasName','InAustin',
        'Year','Month','Day','Weekday','Hour',
        'NumCols','PrimaryCol','SecondaryCol'
    ]
    if is_train and 'StayDays' in df.columns:
        keep.append('StayDays')

    return (df[keep], y) if is_train else df[keep]

# 2. Load & engineer
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

X, y   = feature_engineer(train, is_train=True)
X_test = feature_engineer(test,  is_train=False)

# 3. Align columns
if 'StayDays' in X.columns and 'StayDays' not in X_test.columns:
    X = X.drop('StayDays', axis=1)
X_test = X_test[X.columns]

# 4. Frequency‐encode categoricals
cat_cols = ['Intake Type','Intake Condition','Animal Type',
            'SexOnly','PrimaryCol','SecondaryCol']
for c in cat_cols:
    freq = X[c].value_counts(normalize=True)
    X[c+'_freq']      = X[c].map(freq)
    X_test[c+'_freq'] = X_test[c].map(freq).fillna(0.0)

# 5. Numeric features
num_feats = ['AgeDays','IsMix','HasName','InAustin',
             'Year','Month','Day','Weekday','Hour','NumCols'] \
            + [c+'_freq' for c in cat_cols]

# 6. Train/val split & label encode
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_enc = le.fit_transform(y)

# drop classes <2 samples
vc   = pd.Series(y_enc).value_counts()
mask = ~pd.Series(y_enc).isin(vc[vc<2].index)
X, y_enc = X[mask], y_enc[mask]

X_train, X_val, y_train, y_val = train_test_split(
    X[num_feats], y_enc,
    test_size=0.2, stratify=y_enc, random_state=42
)

# fill NaNs
X_train.fillna(-1, inplace=True)
X_val.fillna(-1, inplace=True)
X_test.fillna(-1, inplace=True)

# 7. Train via xgb.train for early stopping
# 7. Train via xgb.train for early stopping
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val,   label=y_val)
dall   = xgb.DMatrix(pd.concat([X_train, X_val]), label=np.concatenate([y_train, y_val]))
dtest  = xgb.DMatrix(X_test[num_feats])   # <- use only numeric columns here

num_class = len(np.unique(y_enc))
params = {
    'objective':        'multi:softprob',
    'num_class':        num_class,
    'learning_rate':    0.1,
    'max_depth':        6,
    'subsample':        0.8,
    'colsample_bytree': 0.8,
    'eval_metric':      'mlogloss',
    'seed':             42
}

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=[(dval, 'validation')],
    early_stopping_rounds=20,
    verbose_eval=50
)

# 8. Validate
val_pred   = bst.predict(dval)
val_labels = np.argmax(val_pred, axis=1)
from sklearn.metrics import accuracy_score, log_loss
print("Validation Accuracy:", accuracy_score(y_val, val_labels))
print("Validation Log Loss:", log_loss(y_val, val_pred))

# 9. Retrain on all data
bst = xgb.train(
    params,
    dall,
    num_boost_round=bst.best_iteration
)

# 10. Predict & submit
test_prob = bst.predict(dtest)
test_pred = np.argmax(test_prob, axis=1)

submission = pd.DataFrame({
    'Id':            test['Id'],
    'Outcome Type':  le.inverse_transform(test_pred)
})
submission.to_csv('submission_xgb_train.csv', index=False)
print(submission.head())


# 8. Validate
val_pred  = bst.predict(dval)
val_labels= np.argmax(val_pred, axis=1)
from sklearn.metrics import accuracy_score, log_loss
print("Validation Accuracy:", accuracy_score(y_val, val_labels))
print("Validation Log Loss:", log_loss(y_val, val_pred))

# 9. Retrain on all data
bst = xgb.train(
    params,
    dall,
    num_boost_round=bst.best_iteration  # use best_iteration from above
)

# 10. Predict & submit
test_prob = bst.predict(dtest)
test_pred = np.argmax(test_prob, axis=1)

submission = pd.DataFrame({
    'Id':            test['Id'],
    'Outcome Type':  le.inverse_transform(test_pred)
})
submission.to_csv('submission_xgb_train.csv', index=False)
print(submission.head())


  df['Outcome Time'] = pd.to_datetime(df['Outcome Time'], errors='coerce')
  df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')


[0]	validation-mlogloss:1.52248
[50]	validation-mlogloss:0.81579
[100]	validation-mlogloss:0.79474
[150]	validation-mlogloss:0.78990
[200]	validation-mlogloss:0.78692
[250]	validation-mlogloss:0.78545
[300]	validation-mlogloss:0.78467
[350]	validation-mlogloss:0.78385
[400]	validation-mlogloss:0.78367
[412]	validation-mlogloss:0.78370
Validation Accuracy: 0.6832493702770781
Validation Log Loss: 0.7836408330144823
   Id Outcome Type
0   1     Adoption
1   2     Transfer
2   3     Adoption
3   4     Adoption
4   5   Euthanasia
Validation Accuracy: 0.7263853904282116
Validation Log Loss: 0.6654180973025768


AttributeError: `best_iteration` is only defined when early stopping is used.

In [None]:
!pip install imbalanced-learn xgboost




# 5th Attempt:
Tried to use OneVsRestClassifier with a Logistic Regression classifier. Hyperparameter tuning using GridSearchCV.


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, log_loss

# ---------------------------
# 1. Preprocessing Function
# ---------------------------
def preprocess_data(df, is_train=True):
    df = df.copy()

    # Parse datetime columns
    df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
    df['Intake Year'] = df['Intake Time'].dt.year
    df['Intake Month'] = df['Intake Time'].dt.month
    df['Intake Day'] = df['Intake Time'].dt.day
    df['Intake Weekday'] = df['Intake Time'].dt.weekday
    df['Intake Hour'] = df['Intake Time'].dt.hour

    if is_train and 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df.get('Outcome Time'), errors='coerce')
        df['Length of Stay'] = (df['Outcome Time'] - df['Intake Time']).dt.total_seconds() / (24 * 3600)

    # Parse Age upon Intake using regex extraction (e.g. "2 years")
    age_extracted = df['Age upon Intake'].astype(str).str.extract(r'(?P<num>\d+)\s*(?P<unit>\w+)', expand=True)
    def convert_age(row):
        try:
            num = int(row['num'])
        except:
            return np.nan
        unit = row['unit'].lower() if pd.notna(row['unit']) else ''
        if 'year' in unit:
            return num * 365
        elif 'month' in unit:
            return num * 30
        elif 'week' in unit:
            return num * 7
        elif 'day' in unit:
            return num
        else:
            return np.nan
    df['Age in Days'] = age_extracted.apply(convert_age, axis=1)

    if 'Date of Birth' in df.columns:
        df['Date of Birth'] = pd.to_datetime(df.get('Date of Birth'), errors='coerce')
        df['Age from DOB'] = (df['Intake Time'] - df['Date of Birth']).dt.days
        df['Age in Days'] = df['Age in Days'].fillna(df['Age from DOB'])

    # Feature engineering for categorical/text features
    df['Is Mix'] = df.get('Breed', '').str.contains('Mix', case=False, na=False).astype(int)
    df['Primary Color'] = df.get('Color', '').str.split('/').str[0]
    df['Secondary Color'] = df.get('Color', '').str.split('/').str[1].fillna('None')
    df['Color Count'] = df.get('Color', '').str.count('/') + 1
    df['Sex'] = df.get('Sex upon Intake', '').str.extract(r'([A-Za-z]+)', expand=False)
    df['Neutered'] = df.get('Sex upon Intake', '').str.contains('Neutered|Spayed', case=False, na=False).astype(int)
    df['Has Name'] = df.get('Name').notna().astype(int) if 'Name' in df.columns else 0
    df['In Austin'] = df.get('Found Location', '').str.contains('Austin', case=False, na=False).astype(int)

    # Replace rare categories in these columns
    for col in ['Intake Condition', 'Animal Type']:
        if col in df.columns:
            counts = df[col].value_counts()
            rare = counts[counts < 5].index
            df[col] = df[col].replace(rare, 'Other')

    outcome = df['Outcome Type'] if is_train and 'Outcome Type' in df.columns else None

    features = [
        'Intake Type', 'Intake Condition', 'Animal Type', 'Sex', 'Neutered',
        'Age in Days', 'Is Mix', 'Has Name', 'In Austin',
        'Intake Year', 'Intake Month', 'Intake Day', 'Intake Weekday', 'Intake Hour',
        'Primary Color', 'Secondary Color', 'Color Count'
    ]
    if is_train and 'Length of Stay' in df.columns:
        features.append('Length of Stay')

    df = df[[col for col in features if col in df.columns]]
    df.dropna(axis=1, how='all', inplace=True)

    return (df, outcome) if is_train else df

# ---------------------------
# 2. Load Data and Preprocess
# ---------------------------
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

X, y = preprocess_data(train_df, is_train=True)
X_test = preprocess_data(test_df, is_train=False)

# If "Length of Stay" exists in training but not in test, drop it and align columns
if 'Length of Stay' in X.columns and 'Length of Stay' not in X_test.columns:
    X = X.drop(columns=['Length of Stay'])
X_test = X_test.loc[:, X.columns]

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ---------------------------
# 3. Build Preprocessing Pipeline (One-Hot Encoding for Categoricals)
# ---------------------------
numeric_features = ['Age in Days', 'Intake Year', 'Intake Month', 'Intake Day',
                    'Intake Weekday', 'Intake Hour', 'Color Count']
categorical_features = ['Intake Type', 'Intake Condition', 'Animal Type', 'Sex',
                        'Primary Color', 'Secondary Color']
if 'Length of Stay' in X_train.columns:
    numeric_features.append('Length of Stay')
numeric_features = [f for f in numeric_features if f in X_train.columns]
categorical_features = [f for f in categorical_features if f in X_train.columns]

numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])

# ---------------------------
# 4. Build One-vs-Rest Classifier with Logistic Regression
# ---------------------------
ovr_model = OneVsRestClassifier(LogisticRegression(max_iter=500, random_state=42, solver='liblinear'))

# ---------------------------
# 5. Hyperparameter Tuning using GridSearchCV
# ---------------------------
param_grid = {
    'estimator__C': [0.01, 0.1, 1, 10],
    'estimator__penalty': ['l2'],  # We can also tune 'l1' if using other solvers like 'saga'
    'estimator__solver': ['liblinear'],
    'estimator__max_iter': [100, 500, 1000]
}

grid_search = GridSearchCV(ovr_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

print("Tuning the One-vs-Rest Classifier (Logistic Regression)...")
grid_search.fit(X_train, y_train)

# ---------------------------
# 6. Evaluate on the Validation Set
# ---------------------------
best_model = grid_search.best_estimator_
val_pred = best_model.predict(X_val)
val_proba = best_model.predict_proba(X_val)

print("\n--- Validation Results ---")
print("Best Parameters:", grid_search.best_params_)
print(f"Validation Accuracy: {accuracy_score(y_val, val_pred):.4f}")
print(f"Validation Log Loss: {log_loss(y_val, val_proba):.4f}")

# ---------------------------
# 7. Retrain Best Model on Full Data and Predict on Test Set
# ---------------------------
best_model.fit(X, y_encoded)
test_pred = best_model.predict(X_test)

# ---------------------------
# 8. Save Final Submission
# ---------------------------
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Outcome Type': le.inverse_transform(test_pred)
})
submission.to_csv('submission_ovr_lr_model.csv', index=False)
print("\n📄 Submission file saved as 'submission_ovr_lr_model.csv'")
print(submission.head())


Tuning the One-vs-Rest Classifier (Logistic Regression)...
Fitting 3 folds for each of 12 candidates, totalling 36 fits


ValueError: 
All the 36 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/multiclass.py", line 376, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/joblib/parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/multiclass.py", line 96, in _fit_binary
    estimator.fit(X, y, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py", line 1222, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pandas/core/generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Stray'


# Attempt 6: Stacking Classifier ensemble with Logistic Regression, Random Forest, and Gradient Boosting as base models and Logistic Regression as the meta-model. Uses F-1 as evaluation metric to account for class imbalance.

Test results: .37

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, log_loss, make_scorer, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# ==============================================================================
# 1. Data Preprocessing
# ==============================================================================
def preprocess_data(df, is_train=True):
    df = df.copy()

    # Parse datetime columns
    df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
    df['Intake Year'] = df['Intake Time'].dt.year
    df['Intake Month'] = df['Intake Time'].dt.month
    df['Intake Day'] = df['Intake Time'].dt.day
    df['Intake Weekday'] = df['Intake Time'].dt.weekday
    df['Intake Hour'] = df['Intake Time'].dt.hour

    if is_train and 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df.get('Outcome Time'), errors='coerce')
        df['Length of Stay'] = (df['Outcome Time'] - df['Intake Time']).dt.total_seconds() / (24 * 3600)

    # Convert "Age upon Intake" to days
    def parse_age(age_str):
        if pd.isna(age_str):
            return np.nan
        try:
            num = int(''.join(filter(str.isdigit, str(age_str))))
            unit = ''.join(filter(str.isalpha, str(age_str).lower()))
            if 'year' in unit:
                return num * 365
            elif 'month' in unit:
                return num * 30
            elif 'week' in unit:
                return num * 7
            elif 'day' in unit:
                return num
            return np.nan
        except:
            return np.nan

    df['Age in Days'] = df.get('Age upon Intake', '').apply(parse_age)

    # If Date of Birth exists, use it to fill missing Age
    if 'Date of Birth' in df.columns:
        df['Date of Birth'] = pd.to_datetime(df.get('Date of Birth'), errors='coerce')
        df['Age from DOB'] = (df['Intake Time'] - df['Date of Birth']).dt.days
        df['Age in Days'] = df['Age in Days'].fillna(df['Age from DOB'])

    # Additional feature engineering
    df['Is Mix'] = df.get('Breed', '').str.contains('Mix', case=False, na=False).astype(int)
    df['Primary Color'] = df.get('Color', '').str.split('/').str[0]
    df['Secondary Color'] = df.get('Color', '').str.split('/').str[1].fillna('None')
    df['Color Count'] = df.get('Color', '').str.count('/') + 1
    df['Sex'] = df.get('Sex upon Intake', '').str.extract('([A-Za-z]+)', expand=False)
    df['Neutered'] = df.get('Sex upon Intake', '').str.contains('Neutered|Spayed', case=False, na=False).astype(int)
    df['Has Name'] = df.get('Name').notna().astype(int) if 'Name' in df.columns else 0
    df['In Austin'] = df.get('Found Location', '').str.contains('Austin', case=False, na=False).astype(int)

    # Replace rare categories in Intake Condition and Animal Type
    for col in ['Intake Condition', 'Animal Type']:
        if col in df.columns:
            counts = df[col].value_counts()
            rare = counts[counts < 5].index
            df[col] = df[col].replace(rare, 'Other')

    outcome = df['Outcome Type'] if (is_train and 'Outcome Type' in df.columns) else None

    # Feature set selection
    features = [
        'Intake Type', 'Intake Condition', 'Animal Type', 'Sex', 'Neutered',
        'Age in Days', 'Is Mix', 'Has Name', 'In Austin',
        'Intake Year', 'Intake Month', 'Intake Day', 'Intake Weekday', 'Intake Hour',
        'Primary Color', 'Secondary Color', 'Color Count'
    ]
    if is_train and 'Length of Stay' in df.columns:
        features.append('Length of Stay')

    df = df[[col for col in features if col in df.columns]]
    df.dropna(axis=1, how='all', inplace=True)

    return (df, outcome) if is_train else df

# ==============================================================================
# 2. Load the Data
# ==============================================================================
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preprocess training and testing data
X, y = preprocess_data(train_df, is_train=True)
X_test = preprocess_data(test_df, is_train=False)

# Remove 'Length of Stay' if missing in test set
if 'Length of Stay' in X.columns and 'Length of Stay' not in X_test.columns:
    X = X.drop(columns=['Length of Stay'])
X_test = X_test.reindex(columns=X.columns, fill_value=np.nan)

# ==============================================================================
# 3. Encode the Target Variable
# ==============================================================================
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ==============================================================================
# 4. Split Data into Training and Validation Sets
# ==============================================================================
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# ==============================================================================
# 5. Build Preprocessing Pipeline for Numeric and Categorical Variables
# ==============================================================================
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Identify numeric and categorical features
numeric_features = [col for col in X_train.columns if X_train[col].dtype in [np.int64, np.float64]]
categorical_features = [col for col in X_train.columns if col not in numeric_features]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# ==============================================================================
# 6. Build the Stacking Classifier Model
# ==============================================================================
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier

# Base models for StackingClassifier
logreg = LogisticRegression(max_iter=500, random_state=42, solver='liblinear')
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced')
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Meta-model for stacking
meta_model = LogisticRegression(random_state=42)

# Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[('logreg', logreg), ('rf', rf), ('gb', gb)],
    final_estimator=meta_model,
    cv=3
)

# Pipeline with preprocessing and stacking
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', stacking_clf)
])

# ==============================================================================
# 7. Hyperparameter Tuning with RandomizedSearchCV using F1 Score
# ==============================================================================
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'clf__logreg__C': [0.01, 0.1, 1],
    'clf__rf__n_estimators': [100, 200],
    'clf__rf__max_depth': [5, 10],
    'clf__gb__n_estimators': [100, 200],
    'clf__gb__learning_rate': [0.05, 0.1]
}

# Use F1 score as the scoring metric
f1_scorer = make_scorer(f1_score, average='weighted')

random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=10, cv=3, n_jobs=-1, verbose=1, scoring=f1_scorer)

print("Tuning the StackingClassifier ensemble...")
random_search.fit(X_train, y_train)

print("\nBest Parameters:", random_search.best_params_)
print(f"Best CV F1-Score: {random_search.best_score_:.4f}")

# ==============================================================================
# 8. Evaluate on the Validation Set
# ==============================================================================
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val)
val_f1_score = f1_score(y_val, y_val_pred, average='weighted')
print(f"\nValidation F1-Score: {val_f1_score:.4f}")

# ==============================================================================
# 9. Retrain on Full Training Data and Predict Test Set Outcomes
# ==============================================================================
best_model.fit(X, y_encoded)
test_preds = best_model.predict(X_test)
test_preds_labels = le.inverse_transform(test_preds)

# ==============================================================================
# 10. Prepare Submission File
# ==============================================================================
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Outcome Type': test_preds_labels
})
submission.to_csv('submission_stacking_ensemble_f1.csv', index=False)
print("\n📄 Submission file saved as 'submission_stacking_ensemble_f1.csv'")
print(submission.head())


  df['Outcome Time'] = pd.to_datetime(df.get('Outcome Time'), errors='coerce')
  df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
  df['Date of Birth'] = pd.to_datetime(df.get('Date of Birth'), errors='coerce')


Tuning the StackingClassifier ensemble...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
