0.39467

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, log_loss
from xgboost import XGBClassifier
import scipy.stats as st

# === Preprocessing Function ===
def preprocess_data(df, is_train=True):
    """Build the model feature table from a raw intake/outcome DataFrame.

    Derives calendar features from ``Intake Time``, an age in days, colour,
    sex/neuter, name and location indicators, and groups rare values of
    ``Intake Condition`` / ``Animal Type`` under ``'Other'``.  The input
    frame is never modified.

    Args:
        df: Raw records.  Any source column may be absent; features that
            cannot be derived stay empty, and feature columns that are
            entirely NaN are dropped from the result.
        is_train: When True, ``Length of Stay`` is added if ``Outcome Time``
            exists and the ``Outcome Type`` column is returned as the target.

    Returns:
        ``(features, outcome)`` when ``is_train`` is True (``outcome`` is
        ``None`` if ``Outcome Type`` is absent), otherwise just ``features``.
    """
    df = df.copy()

    def text_column(name):
        # A missing column is replaced by an all-NaN object Series so the
        # ``.str`` / datetime logic below works instead of raising.
        if name in df.columns:
            return df[name]
        return pd.Series(np.nan, index=df.index, dtype=object)

    # Parse datetime columns
    df['Intake Time'] = pd.to_datetime(text_column('Intake Time'), errors='coerce')
    intake = df['Intake Time'].dt
    df['Intake Year'] = intake.year
    df['Intake Month'] = intake.month
    df['Intake Day'] = intake.day
    df['Intake Weekday'] = intake.weekday
    df['Intake Hour'] = intake.hour

    if is_train and 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df['Outcome Time'], errors='coerce')
        stay = df['Outcome Time'] - df['Intake Time']
        df['Length of Stay'] = stay.dt.total_seconds() / (24 * 3600)

    # Convert age strings such as "2 years" to days.  The number is parsed as
    # a whole (so "1.5 years" is 1.5, not 15) and unparseable values become NaN.
    age_parts = text_column('Age upon Intake').astype(str).str.extract(
        r'(?i)(?P<num>\d+(?:\.\d+)?)\s*(?P<unit>year|month|week|day)'
    )
    days_per_unit = {'year': 365, 'month': 30, 'week': 7, 'day': 1}
    df['Age in Days'] = (
        pd.to_numeric(age_parts['num'], errors='coerce')
        * age_parts['unit'].str.lower().map(days_per_unit)
    )

    if 'Date of Birth' in df.columns:
        df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')
        df['Age from DOB'] = (df['Intake Time'] - df['Date of Birth']).dt.days
        df['Age in Days'] = df['Age in Days'].fillna(df['Age from DOB'])

    breed = text_column('Breed')
    color = text_column('Color')
    sex_raw = text_column('Sex upon Intake')
    location = text_column('Found Location')

    df['Is Mix'] = breed.str.contains('Mix', case=False, na=False).astype(int)

    color_parts = color.str.split('/')
    df['Primary Color'] = color_parts.str[0]
    df['Secondary Color'] = color_parts.str[1].fillna('None')
    df['Color Count'] = color.str.count('/') + 1

    # Extract the actual sex word.  Taking the first word of the value would
    # yield the neuter status ("Intact", "Neutered", ...) instead, which the
    # 'Neutered' flag below already captures.
    sex = sex_raw.str.extract(r'(?i)\b(female|male)\b', expand=False).str.capitalize()
    # Present-but-unrecognised values become 'Unknown'; missing values stay NaN.
    df['Sex'] = sex.where(sex.notna() | sex_raw.isna(), 'Unknown')
    df['Neutered'] = sex_raw.str.contains('Neutered|Spayed', case=False, na=False).astype(int)
    df['Has Name'] = df['Name'].notna().astype(int) if 'Name' in df.columns else 0
    df['In Austin'] = location.str.contains('Austin', case=False, na=False).astype(int)

    # Replace rare categories in these columns.
    # NOTE(review): counts are taken from the frame passed in, so train and
    # test are grouped independently - confirm this is acceptable.
    for col in ['Intake Condition', 'Animal Type']:
        if col in df.columns:
            counts = df[col].value_counts()
            rare = counts[counts < 5].index
            df[col] = df[col].where(~df[col].isin(rare), 'Other')

    outcome = df['Outcome Type'] if is_train and 'Outcome Type' in df.columns else None

    features = [
        'Intake Type', 'Intake Condition', 'Animal Type', 'Sex', 'Neutered',
        'Age in Days', 'Is Mix', 'Has Name', 'In Austin',
        'Intake Year', 'Intake Month', 'Intake Day', 'Intake Weekday', 'Intake Hour',
        'Primary Color', 'Secondary Color', 'Color Count'
    ]
    if is_train and 'Length of Stay' in df.columns:
        features.append('Length of Stay')

    # Non in-place dropna: avoids mutating a slice of the working frame.
    selected = [col for col in features if col in df.columns]
    df = df[selected].dropna(axis=1, how='all')

    return (df, outcome) if is_train else df

# === Load Data ===
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# === Preprocess Data ===
X, y = preprocess_data(train_df, is_train=True)
X_test = preprocess_data(test_df, is_train=False)

# Remove 'Length of Stay' from training if missing in test set
if 'Length of Stay' in X.columns and 'Length of Stay' not in X_test.columns:
    X = X.drop(columns=['Length of Stay'])
# reindex() rather than .loc[]: a feature absent from the test features
# becomes a NaN column (handled by the imputers) instead of raising KeyError.
X_test = X_test.reindex(columns=X.columns)

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Only features that hold at least one value in the training split are used.
# The decision is made once, from X_train, so every split sees the same
# column set (dropping all-NaN columns per split could misalign them).
usable_columns = {col for col in X_train.columns if X_train[col].notna().any()}

# === Define Feature Lists for Preprocessing ===
numeric_features = [
    'Age in Days', 'Intake Year', 'Intake Month', 'Intake Day',
    'Intake Weekday', 'Intake Hour', 'Color Count'
]
categorical_features = [
    'Intake Type', 'Intake Condition', 'Animal Type', 'Sex',
    'Primary Color', 'Secondary Color'
]
# 0/1 indicator features.  They were previously in neither list, so the
# ColumnTransformer (default remainder='drop') silently discarded them.
binary_features = ['Neutered', 'Is Mix', 'Has Name', 'In Austin']
if 'Length of Stay' in X_train.columns:
    numeric_features.append('Length of Stay')

numeric_features = [f for f in numeric_features if f in usable_columns]
categorical_features = [f for f in categorical_features if f in usable_columns]
binary_features = [f for f in binary_features if f in usable_columns]

# === Build Preprocessing Pipeline ===
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_features),
    ('bin', 'passthrough', binary_features)
])

# === Build Combined VotingClassifier Pipeline ===
# This VotingClassifier combines three models: RandomForest, XGBoost, and MLP.
voting_clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(random_state=42, n_jobs=-1)),
            ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_jobs=-1)),
            ('mlp', MLPClassifier(random_state=42))
        ],
        voting='soft', n_jobs=-1
    ))
])

# === Define a Reduced Hyperparameter Space for Faster Tuning ===
param_grid = {
    'classifier__rf__n_estimators': [100, 150],
    'classifier__rf__max_depth': [12],
    'classifier__xgb__n_estimators': [100, 150],
    'classifier__xgb__max_depth': [4],
    'classifier__xgb__learning_rate': [0.1],
    'classifier__mlp__hidden_layer_sizes': [(100,), (100, 50)],
    'classifier__mlp__alpha': [0.0001],
    'classifier__mlp__activation': ['relu'],
    'classifier__mlp__max_iter': [300]
}

# Use RandomizedSearchCV for a faster search.  The grid above only has
# 2 * 2 * 2 = 8 distinct combinations, fewer than the requested n_iter=20.
random_search = RandomizedSearchCV(voting_clf, param_distributions=param_grid,
                                   n_iter=20, cv=3, scoring='accuracy',
                                   n_jobs=-1, verbose=1, random_state=42)
print("Tuning the Combined VotingClassifier (RandomizedSearchCV)...")
random_search.fit(X_train, y_train)

# === Evaluate on the Validation Set ===
best_model = random_search.best_estimator_
val_pred = best_model.predict(X_val)
val_proba = best_model.predict_proba(X_val)

print("\n--- Validation Results ---")
print("Best Parameters:", random_search.best_params_)
print(f"Validation Accuracy: {accuracy_score(y_val, val_pred):.4f}")
print(f"Validation Log Loss: {log_loss(y_val, val_proba):.4f}")

# === Retrain the Best Model on Full Training Data ===
# Refits the tuned pipeline on train + validation rows before predicting.
best_model.fit(X, y_encoded)
test_pred = best_model.predict(X_test)

# === Save Final Submission ===
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Outcome Type': le.inverse_transform(test_pred)
})
submission.to_csv('submission_combined_optimized.csv', index=False)
print("\n📄 Submission file saved as 'submission_combined_optimized.csv'")
print(submission.head())


  df['Outcome Time'] = pd.to_datetime(df.get('Outcome Time'), errors='coerce')
  df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
  df['Date of Birth'] = pd.to_datetime(df.get('Date of Birth'), errors='coerce')


Tuning the Combined VotingClassifier (RandomizedSearchCV)...
Fitting 2 folds for each of 5 candidates, totalling 10 fits

--- Validation Results ---
Best Parameters: {'classifier__xgb__n_estimators': 150, 'classifier__xgb__max_depth': 4, 'classifier__xgb__learning_rate': 0.1, 'classifier__rf__n_estimators': 150, 'classifier__rf__max_depth': 12, 'classifier__mlp__max_iter': 300, 'classifier__mlp__hidden_layer_sizes': (100, 50), 'classifier__mlp__alpha': 0.0001, 'classifier__mlp__activation': 'relu'}
Validation Accuracy: 0.6376
Validation Log Loss: 0.8766





📄 Submission file saved as 'submission_optimized.csv'
   Id     Outcome Type
0   1  Return to Owner
1   2         Transfer
2   3  Return to Owner
3   4         Adoption
4   5         Transfer


third- 0.40232

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, log_loss
from xgboost import XGBClassifier
from joblib import Memory
import re

# Set up caching for the preprocessing step
memory = Memory(location='cachedir', verbose=0)

@memory.cache
def preprocess_data(df, is_train=True):
    """Build the model feature table from a raw intake/outcome DataFrame.

    Derives calendar features from ``Intake Time``, an age in days, colour,
    sex/neuter, name and location indicators, and groups rare values of
    ``Intake Condition`` / ``Animal Type`` under ``'Other'``.  The input
    frame is never modified.

    Args:
        df: Raw records.  Any source column may be absent; features that
            cannot be derived stay empty, and feature columns that are
            entirely NaN are dropped from the result.
        is_train: When True, ``Length of Stay`` is added if ``Outcome Time``
            exists and the ``Outcome Type`` column is returned as the target.

    Returns:
        ``(features, outcome)`` when ``is_train`` is True (``outcome`` is
        ``None`` if ``Outcome Type`` is absent), otherwise just ``features``.
    """
    df = df.copy()

    def text_column(name):
        # A missing column is replaced by an all-NaN object Series so the
        # ``.str`` / datetime logic below works instead of raising.
        if name in df.columns:
            return df[name]
        return pd.Series(np.nan, index=df.index, dtype=object)

    # Parse datetime columns (vectorized)
    df['Intake Time'] = pd.to_datetime(text_column('Intake Time'), errors='coerce')
    intake = df['Intake Time'].dt
    df['Intake Year'] = intake.year
    df['Intake Month'] = intake.month
    df['Intake Day'] = intake.day
    df['Intake Weekday'] = intake.weekday
    df['Intake Hour'] = intake.hour

    if is_train and 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df['Outcome Time'], errors='coerce')
        stay = df['Outcome Time'] - df['Intake Time']
        df['Length of Stay'] = stay.dt.total_seconds() / (24 * 3600)

    # Age parsing, done column-wise: extract "<number> <unit>" with a regex
    # and multiply by the unit's length in days.  Unparseable values (and a
    # missing 'Age upon Intake' column) yield NaN rather than an exception.
    age_parts = text_column('Age upon Intake').astype(str).str.extract(
        r'(?i)(?P<num>\d+(?:\.\d+)?)\s*(?P<unit>year|month|week|day)'
    )
    days_per_unit = {'year': 365, 'month': 30, 'week': 7, 'day': 1}
    df['Age in Days'] = (
        pd.to_numeric(age_parts['num'], errors='coerce')
        * age_parts['unit'].str.lower().map(days_per_unit)
    )

    if 'Date of Birth' in df.columns:
        df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')
        df['Age from DOB'] = (df['Intake Time'] - df['Date of Birth']).dt.days
        df['Age in Days'] = df['Age in Days'].fillna(df['Age from DOB'])

    # Feature engineering for categorical variables
    breed = text_column('Breed')
    color = text_column('Color')
    sex_raw = text_column('Sex upon Intake')
    location = text_column('Found Location')

    df['Is Mix'] = breed.str.contains('Mix', case=False, na=False).astype(int)

    color_parts = color.str.split('/')
    df['Primary Color'] = color_parts.str[0]
    df['Secondary Color'] = color_parts.str[1].fillna('None')
    df['Color Count'] = color.str.count('/') + 1

    # Extract the actual sex word.  Taking the first word of the value would
    # yield the neuter status ("Intact", "Neutered", ...) instead, which the
    # 'Neutered' flag below already captures.
    sex = sex_raw.str.extract(r'(?i)\b(female|male)\b', expand=False).str.capitalize()
    # Present-but-unrecognised values become 'Unknown'; missing values stay NaN.
    df['Sex'] = sex.where(sex.notna() | sex_raw.isna(), 'Unknown')
    df['Neutered'] = sex_raw.str.contains('Neutered|Spayed', case=False, na=False).astype(int)
    df['Has Name'] = df['Name'].notna().astype(int) if 'Name' in df.columns else 0
    df['In Austin'] = location.str.contains('Austin', case=False, na=False).astype(int)

    # Replace rare categories.
    # NOTE(review): counts are taken from the frame passed in, so train and
    # test are grouped independently - confirm this is acceptable.
    for col in ['Intake Condition', 'Animal Type']:
        if col in df.columns:
            counts = df[col].value_counts()
            rare = counts[counts < 5].index
            df[col] = df[col].where(~df[col].isin(rare), 'Other')

    outcome = df['Outcome Type'] if is_train and 'Outcome Type' in df.columns else None

    features = [
        'Intake Type', 'Intake Condition', 'Animal Type', 'Sex', 'Neutered',
        'Age in Days', 'Is Mix', 'Has Name', 'In Austin',
        'Intake Year', 'Intake Month', 'Intake Day', 'Intake Weekday', 'Intake Hour',
        'Primary Color', 'Secondary Color', 'Color Count'
    ]
    if is_train and 'Length of Stay' in df.columns:
        features.append('Length of Stay')

    # Non in-place dropna: avoids mutating a slice of the working frame.
    selected = [col for col in features if col in df.columns]
    df = df[selected].dropna(axis=1, how='all')

    return (df, outcome) if is_train else df

# === Load Data ===
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# === Preprocess (cached) ===
X, y = preprocess_data(train_df, is_train=True)
X_test = preprocess_data(test_df, is_train=False)

# Remove 'Length of Stay' if not available in test set, then align columns
if 'Length of Stay' in X.columns and 'Length of Stay' not in X_test.columns:
    X = X.drop(columns=['Length of Stay'])
# reindex() rather than .loc[]: a feature absent from the test features
# becomes a NaN column (handled by the imputers) instead of raising KeyError.
X_test = X_test.reindex(columns=X.columns)

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Only features that hold at least one value in the training split are used.
# The decision is made once, from X_train, so every split sees the same
# column set (dropping all-NaN columns per split could misalign them).
usable_columns = {col for col in X_train.columns if X_train[col].notna().any()}

# === Define Feature Lists for Preprocessing ===
numeric_features = [
    'Age in Days', 'Intake Year', 'Intake Month', 'Intake Day',
    'Intake Weekday', 'Intake Hour', 'Color Count'
]
categorical_features = [
    'Intake Type', 'Intake Condition', 'Animal Type', 'Sex',
    'Primary Color', 'Secondary Color'
]
# 0/1 indicator features.  They were previously in neither list, so the
# ColumnTransformer (default remainder='drop') silently discarded them.
binary_features = ['Neutered', 'Is Mix', 'Has Name', 'In Austin']
if 'Length of Stay' in X_train.columns:
    numeric_features.append('Length of Stay')

numeric_features = [f for f in numeric_features if f in usable_columns]
categorical_features = [f for f in categorical_features if f in usable_columns]
binary_features = [f for f in binary_features if f in usable_columns]

# === Build Preprocessing Pipeline ===
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_features),
    ('bin', 'passthrough', binary_features)
])

# === Build Combined VotingClassifier Pipeline with Weighted Voting ===
voting_clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(random_state=42, n_jobs=-1)),
            ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_jobs=-1)),
            ('mlp', MLPClassifier(random_state=42, early_stopping=True))
        ],
        voting='soft',
        # Weights follow the estimator order above: rf=2, xgb=3, mlp=1.
        weights=[2, 3, 1],  # Adjust weights based on validation results
        n_jobs=-1
    ))
])

# === Define an Expanded but Reasonable Hyperparameter Space ===
param_grid = {
    'classifier__rf__n_estimators': [100, 150, 200],
    'classifier__rf__max_depth': [12, 15],
    'classifier__xgb__n_estimators': [100, 150, 200],
    'classifier__xgb__max_depth': [4, 6],
    'classifier__xgb__learning_rate': [0.1, 0.01],
    'classifier__mlp__hidden_layer_sizes': [(100,), (100, 50), (150, 100)],
    'classifier__mlp__alpha': [0.0001, 0.001],
    'classifier__mlp__activation': ['relu', 'tanh'],
    'classifier__mlp__max_iter': [300]
}

# Use RandomizedSearchCV with n_iter=10 and 3-fold CV for improved tuning
random_search = RandomizedSearchCV(voting_clf, param_distributions=param_grid,
                                   n_iter=10, cv=3, scoring='accuracy',
                                   n_jobs=-1, verbose=1, random_state=42)
print("Tuning the Combined VotingClassifier with Weighted Voting (RandomizedSearchCV)...")
random_search.fit(X_train, y_train)

# === Evaluate on Validation Set ===
best_model = random_search.best_estimator_
val_pred = best_model.predict(X_val)
val_proba = best_model.predict_proba(X_val)
print("\n--- Validation Results ---")
print("Best Parameters:", random_search.best_params_)
print(f"Validation Accuracy: {accuracy_score(y_val, val_pred):.4f}")
print(f"Validation Log Loss: {log_loss(y_val, val_proba):.4f}")

# === Retrain the Best Model on Full Data and Predict on Test Set ===
# Refits the tuned pipeline on train + validation rows before predicting.
best_model.fit(X, y_encoded)
test_pred = best_model.predict(X_test)

# === Save Final Submission ===
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Outcome Type': le.inverse_transform(test_pred)
})
submission.to_csv('third.csv', index=False)
print("\n📄 Submission file saved as 'third.csv'")
print(submission.head())

  df['Outcome Time'] = pd.to_datetime(df.get('Outcome Time'), errors='coerce')
  df['Intake Time'] = pd.to_datetime(df.get('Intake Time'), errors='coerce')
  df['Date of Birth'] = pd.to_datetime(df.get('Date of Birth'), errors='coerce')


Tuning the Combined VotingClassifier with Weighted Voting (RandomizedSearchCV)...
Fitting 3 folds for each of 10 candidates, totalling 30 fits

--- Validation Results ---
Best Parameters: {'classifier__xgb__n_estimators': 200, 'classifier__xgb__max_depth': 6, 'classifier__xgb__learning_rate': 0.1, 'classifier__rf__n_estimators': 150, 'classifier__rf__max_depth': 15, 'classifier__mlp__max_iter': 300, 'classifier__mlp__hidden_layer_sizes': (100,), 'classifier__mlp__alpha': 0.0001, 'classifier__mlp__activation': 'tanh'}
Validation Accuracy: 0.6453
Validation Log Loss: 0.8568





📄 Submission file saved as 'third.csv'
   Id     Outcome Type
0   1  Return to Owner
1   2         Transfer
2   3  Return to Owner
3   4         Adoption
4   5       Euthanasia


4th-

In [None]:
!pip install imbalanced-learn xgboost




In [None]:
import pandas as pd
import numpy as np
import re
import warnings

# Optionally suppress date parsing warnings (informational only)
warnings.filterwarnings("ignore", message="Could not infer format")

# ---------------------------
# 1. Import Required Modules
# ---------------------------
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline  # used in building sub-pipelines

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.cluster import KMeans

from joblib import Memory

# ---------------------------
# 2. Define a Custom Transformer: AddKMeansClusters
# ---------------------------
from sklearn.base import BaseEstimator, TransformerMixin

class AddKMeansClusters(BaseEstimator, TransformerMixin):
    """
    Fits KMeans on the data and appends one-hot encoded cluster labels as new features.

    The output always has ``n_clusters`` extra columns, one per cluster, even
    when some clusters receive no samples in the data being transformed.

    Parameters
    ----------
    n_clusters : int, default=5
        Number of KMeans clusters, and therefore of appended columns.
    """
    def __init__(self, n_clusters=5):
        # Store the hyperparameter only.  The KMeans model is created in
        # fit() so that set_params(n_clusters=...) (as used by hyperparameter
        # searches) actually changes the number of clusters.
        self.n_clusters = n_clusters

    def fit(self, X, y=None):
        """Fit a fresh KMeans model with the current ``n_clusters`` on X."""
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)
        self.kmeans.fit(X)
        return self

    def transform(self, X):
        """Return X with one indicator column per cluster appended on the right."""
        from scipy import sparse

        clusters = self.kmeans.predict(X)
        # Index an identity matrix by label: the width is fixed at n_clusters,
        # unlike get_dummies, whose width depends on which labels occur.
        cluster_dummies = np.eye(self.kmeans.n_clusters)[clusters]
        if sparse.issparse(X):
            # np.hstack cannot combine a sparse matrix with a dense array.
            return sparse.hstack([X, sparse.csr_matrix(cluster_dummies)], format='csr')
        return np.hstack([np.asarray(X), cluster_dummies])

# ---------------------------
# 3. Preprocessing Function (with caching)
# ---------------------------
memory = Memory(location='cachedir', verbose=0)

def preprocess_data(df, is_train=True):
    """Build the model feature table from a raw intake/outcome DataFrame.

    Derives calendar features from ``Intake Time``, an age in days, colour,
    sex/neuter, name and location indicators, and groups rare values of
    ``Intake Condition`` / ``Animal Type`` under ``'Other'``.  The input
    frame is never modified.

    Args:
        df: Raw records.  Any source column may be absent; features that
            cannot be derived stay empty, and feature columns that are
            entirely NaN are dropped from the result.
        is_train: When True, ``Length of Stay`` is added if ``Outcome Time``
            exists and the ``Outcome Type`` column is returned as the target.

    Returns:
        ``(features, outcome)`` when ``is_train`` is True (``outcome`` is
        ``None`` if ``Outcome Type`` is absent), otherwise just ``features``.
    """
    df = df.copy()

    def text_column(name):
        # A missing column is replaced by an all-NaN object Series so the
        # ``.str`` / datetime logic below works instead of raising.
        if name in df.columns:
            return df[name]
        return pd.Series(np.nan, index=df.index, dtype=object)

    # Parse datetime columns
    df['Intake Time'] = pd.to_datetime(text_column('Intake Time'), errors='coerce')
    intake = df['Intake Time'].dt
    df['Intake Year'] = intake.year
    df['Intake Month'] = intake.month
    df['Intake Day'] = intake.day
    df['Intake Weekday'] = intake.weekday
    df['Intake Hour'] = intake.hour

    if is_train and 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df['Outcome Time'], errors='coerce')
        stay = df['Outcome Time'] - df['Intake Time']
        df['Length of Stay'] = stay.dt.total_seconds() / (24 * 3600)

    # Parse Age upon Intake using regex extraction (e.g. "2 years"), then
    # multiply column-wise by the unit's length in days.  Unparseable values
    # (and a missing 'Age upon Intake' column) yield NaN, not an exception.
    age_parts = text_column('Age upon Intake').astype(str).str.extract(
        r'(?i)(?P<num>\d+(?:\.\d+)?)\s*(?P<unit>year|month|week|day)'
    )
    days_per_unit = {'year': 365, 'month': 30, 'week': 7, 'day': 1}
    df['Age in Days'] = (
        pd.to_numeric(age_parts['num'], errors='coerce')
        * age_parts['unit'].str.lower().map(days_per_unit)
    )

    if 'Date of Birth' in df.columns:
        df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')
        df['Age from DOB'] = (df['Intake Time'] - df['Date of Birth']).dt.days
        df['Age in Days'] = df['Age in Days'].fillna(df['Age from DOB'])

    # Feature engineering for categorical/text features
    breed = text_column('Breed')
    color = text_column('Color')
    sex_raw = text_column('Sex upon Intake')
    location = text_column('Found Location')

    df['Is Mix'] = breed.str.contains('Mix', case=False, na=False).astype(int)

    color_parts = color.str.split('/')
    df['Primary Color'] = color_parts.str[0]
    df['Secondary Color'] = color_parts.str[1].fillna('None')
    df['Color Count'] = color.str.count('/') + 1

    # Extract the actual sex word.  Taking the first word of the value would
    # yield the neuter status ("Intact", "Neutered", ...) instead, which the
    # 'Neutered' flag below already captures.
    sex = sex_raw.str.extract(r'(?i)\b(female|male)\b', expand=False).str.capitalize()
    # Present-but-unrecognised values become 'Unknown'; missing values stay NaN.
    df['Sex'] = sex.where(sex.notna() | sex_raw.isna(), 'Unknown')
    df['Neutered'] = sex_raw.str.contains('Neutered|Spayed', case=False, na=False).astype(int)
    df['Has Name'] = df['Name'].notna().astype(int) if 'Name' in df.columns else 0
    df['In Austin'] = location.str.contains('Austin', case=False, na=False).astype(int)

    # Replace rare categories in these columns.
    # NOTE(review): counts are taken from the frame passed in, so train and
    # test are grouped independently - confirm this is acceptable.
    for col in ['Intake Condition', 'Animal Type']:
        if col in df.columns:
            counts = df[col].value_counts()
            rare = counts[counts < 5].index
            df[col] = df[col].where(~df[col].isin(rare), 'Other')

    outcome = df['Outcome Type'] if is_train and 'Outcome Type' in df.columns else None

    features = [
        'Intake Type', 'Intake Condition', 'Animal Type', 'Sex', 'Neutered',
        'Age in Days', 'Is Mix', 'Has Name', 'In Austin',
        'Intake Year', 'Intake Month', 'Intake Day', 'Intake Weekday', 'Intake Hour',
        'Primary Color', 'Secondary Color', 'Color Count'
    ]
    if is_train and 'Length of Stay' in df.columns:
        features.append('Length of Stay')

    # Non in-place dropna: avoids mutating a slice of the working frame.
    selected = [col for col in features if col in df.columns]
    df = df[selected].dropna(axis=1, how='all')

    return (df, outcome) if is_train else df

# Cache the preprocessing function
preprocess_data = memory.cache(preprocess_data)

# ---------------------------
# 4. Load Data and Preprocess
# ---------------------------
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

X, y = preprocess_data(train_df, is_train=True)
X_test = preprocess_data(test_df, is_train=False)

# If "Length of Stay" exists in training but not in test, drop it and align columns
if 'Length of Stay' in X.columns and 'Length of Stay' not in X_test.columns:
    X = X.drop(columns=['Length of Stay'])
# reindex() rather than .loc[]: a feature absent from the test features
# becomes a NaN column (handled by the imputers) instead of raising KeyError.
X_test = X_test.reindex(columns=X.columns)

# Remove rare classes (fewer than 2 samples cannot be stratified) BEFORE
# encoding, so the encoded labels stay contiguous (0..k-1).  Filtering after
# encoding would leave gaps in the label codes.  Rows whose outcome is
# missing are dropped as well, since they belong to no counted class.
class_counts = y.value_counts()
keep_mask = y.isin(class_counts[class_counts >= 2].index).to_numpy()
X = X.loc[keep_mask]
y = y.loc[keep_mask]

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split into training and validation sets with stratification
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Only features that hold at least one value in the training split are used.
# The decision is made once, from X_train, so every split sees the same
# column set (dropping all-NaN columns per split could misalign them).
usable_columns = {col for col in X_train.columns if X_train[col].notna().any()}

# ---------------------------
# 5. Build Preprocessing Pipeline (One-Hot Encoding for Categoricals)
# ---------------------------
numeric_features = ['Age in Days', 'Intake Year', 'Intake Month', 'Intake Day',
                    'Intake Weekday', 'Intake Hour', 'Color Count']
categorical_features = ['Intake Type', 'Intake Condition', 'Animal Type', 'Sex',
                        'Primary Color', 'Secondary Color']
# 0/1 indicator features.  They were previously in neither list, so the
# ColumnTransformer (default remainder='drop') silently discarded them.
binary_features = ['Neutered', 'Is Mix', 'Has Name', 'In Austin']
if 'Length of Stay' in X_train.columns:
    numeric_features.append('Length of Stay')
numeric_features = [f for f in numeric_features if f in usable_columns]
categorical_features = [f for f in categorical_features if f in usable_columns]
binary_features = [f for f in binary_features if f in usable_columns]

numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# sparse_threshold=0 forces a dense output: the steps that follow (PCA and
# the np.hstack in AddKMeansClusters) operate on dense arrays.
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
    ('bin', 'passthrough', binary_features)
], sparse_threshold=0)

# ---------------------------
# 6. Build Advanced Pipeline: PCA and K-Means Clustering (Flattened)
# ---------------------------
from sklearn.pipeline import make_pipeline

# Instead of using a nested Pipeline, we build a flattened list of steps.
advanced_steps = [
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=50, random_state=42)),
    ('clusters', AddKMeansClusters(n_clusters=5))
]
# Stand-alone feature pipeline built from the same steps (no sampler, no classifier)
advanced_pipeline = Pipeline(steps=advanced_steps)

# ---------------------------
# 7. Build Stacking Classifier as the Ensemble
# ---------------------------
base_estimators = [
    ('rf', RandomForestClassifier(n_estimators=150, max_depth=12, random_state=42, n_jobs=-1)),
    ('xgb', XGBClassifier(n_estimators=150, max_depth=4, learning_rate=0.1,
                          use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_jobs=-1)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(100,50), alpha=0.0001, activation='relu',
                          max_iter=300, random_state=42))
]
meta_learner = LogisticRegression(max_iter=1000, random_state=42)

stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_learner,
    cv=3,
    n_jobs=-1,
    passthrough=False
)

# ---------------------------
# 8. Build Combined Pipeline with SMOTE using imblearn's Pipeline (Flattened)
# ---------------------------
# The feature steps are spliced in directly: imblearn's Pipeline does not
# accept another Pipeline object as an intermediate step.
combined_steps = advanced_steps + [
    ('smote', SMOTE(random_state=42)),
    ('classifier', stacking_clf)
]
combined_pipeline = ImbPipeline(steps=combined_steps)

# ---------------------------
# 9. Hyperparameter Tuning for the Advanced Pipeline
# ---------------------------
# Parameter names address the flattened step names above.
param_grid = {
    'pca__n_components': [30, 50, 70],
    'clusters__n_clusters': [3, 5, 7],
    'classifier__rf__n_estimators': [100, 150, 200],
    'classifier__rf__max_depth': [12, 15],
    'classifier__xgb__n_estimators': [100, 150, 200],
    'classifier__xgb__max_depth': [4, 6],
    'classifier__xgb__learning_rate': [0.1, 0.01],
    'classifier__mlp__hidden_layer_sizes': [(100,), (100,50), (150,100)],
    'classifier__mlp__alpha': [0.0001, 0.001],
    'classifier__mlp__activation': ['relu', 'tanh']
}

random_search = RandomizedSearchCV(combined_pipeline, param_distributions=param_grid,
                                   n_iter=10, cv=3, scoring='accuracy',
                                   n_jobs=-1, verbose=1, random_state=42)
print("Tuning the Advanced Pipeline (PCA + K-Means + SMOTE + Stacking)...")
random_search.fit(X_train, y_train)

# ---------------------------
# 10. Evaluate on the Validation Set
# ---------------------------
best_model = random_search.best_estimator_
val_pred = best_model.predict(X_val)
val_proba = best_model.predict_proba(X_val)
print("\n--- Validation Results ---")
print("Best Parameters:", random_search.best_params_)
print(f"Validation Accuracy: {accuracy_score(y_val, val_pred):.4f}")
print(f"Validation Log Loss: {log_loss(y_val, val_proba):.4f}")

# ---------------------------
# 11. Retrain Best Model on Full Data and Predict on Test Set
# ---------------------------
# Refits the tuned pipeline on train + validation rows before predicting.
best_model.fit(X, y_encoded)
test_pred = best_model.predict(X_test)

# ---------------------------
# 12. Save Final Submission
# ---------------------------
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Outcome Type': le.inverse_transform(test_pred)
})
submission.to_csv('4.csv', index=False)
print("\n📄 Submission file saved as '4.csv'")
print(submission.head())


Tuning the Advanced Pipeline (PCA + K-Means + SMOTE + Stacking)...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/imblearn/pipeline.py", line 518, in fit
    Xt, yt = self._fit(X, y, routed_params, raw_params=params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/imblearn/pipeline.py", line 430, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/imblearn/pipeline.py", line 1383, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-1-d4f8d8ab32c7>", line 254, in transform
  File "/usr/local/lib/python3.11/dist-packages/numpy/_core/shape_base.py", line 356, in hstack
    return _nx.concatenate(arrs, 0, dtype=dtype, casting=casting)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)


In [None]:
import pandas as pd
import numpy as np

# ==============================================================================
# 1. Data Preprocessing (feature engineering on the raw intake records)
# ==============================================================================

def preprocess_data(df, is_train=True):
    """Turn raw intake records into a feature frame for modelling.

    Every source column is optional. A feature whose source column is absent
    comes out all-missing (and is then dropped), 0 for the indicator flags, or
    'None' for 'Secondary Color'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. The input frame is not modified; a copy is used.
    is_train : bool, default True
        When True, 'Length of Stay' is computed if 'Outcome Time' is present
        and the 'Outcome Type' column is returned alongside the features.

    Returns
    -------
    pandas.DataFrame or tuple
        ``(features, outcome)`` when ``is_train`` is True, where ``outcome``
        is the 'Outcome Type' column or None if that column is absent;
        otherwise just ``features``. Feature columns that are entirely
        missing are dropped.
    """
    df = df.copy()

    def column_or_missing(name):
        # `df.get(name, '')` would hand back a plain str for an absent column,
        # which has no `.str` / `.apply`. Return an all-missing object Series
        # instead so the vectorised calls below keep working.
        if name in df.columns:
            return df[name]
        return pd.Series(np.nan, index=df.index, dtype=object)

    # Parse datetime columns
    df['Intake Time'] = pd.to_datetime(column_or_missing('Intake Time'), errors='coerce')
    df['Intake Year'] = df['Intake Time'].dt.year
    df['Intake Month'] = df['Intake Time'].dt.month
    df['Intake Day'] = df['Intake Time'].dt.day
    df['Intake Weekday'] = df['Intake Time'].dt.weekday
    df['Intake Hour'] = df['Intake Time'].dt.hour

    if is_train and 'Outcome Time' in df.columns:
        df['Outcome Time'] = pd.to_datetime(df['Outcome Time'], errors='coerce')
        # Length of stay expressed in (fractional) days
        df['Length of Stay'] = (df['Outcome Time'] - df['Intake Time']).dt.total_seconds() / (24 * 3600)

    # Convert "Age upon Intake" to days
    def parse_age(age_str):
        # Accepts strings such as "2 years" or "3 weeks"; anything that has
        # no digits or no recognised unit becomes NaN.
        if pd.isna(age_str):
            return np.nan
        text = str(age_str).lower()
        try:
            num = int(''.join(filter(str.isdigit, text)))
        except ValueError:
            # No (parsable) digits in the value
            return np.nan
        unit = ''.join(filter(str.isalpha, text))
        # Approximate calendar units: 365-day years, 30-day months
        for unit_name, days in (('year', 365), ('month', 30), ('week', 7), ('day', 1)):
            if unit_name in unit:
                return num * days
        return np.nan

    df['Age in Days'] = column_or_missing('Age upon Intake').apply(parse_age)

    # If Date of Birth exists, use it to fill missing Age
    if 'Date of Birth' in df.columns:
        df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')
        df['Age from DOB'] = (df['Intake Time'] - df['Date of Birth']).dt.days
        df['Age in Days'] = df['Age in Days'].fillna(df['Age from DOB'])

    # Additional feature engineering
    color = column_or_missing('Color')
    sex_upon_intake = column_or_missing('Sex upon Intake')

    df['Is Mix'] = column_or_missing('Breed').str.contains('Mix', case=False, na=False).astype(int)
    df['Primary Color'] = color.str.split('/').str[0]
    df['Secondary Color'] = color.str.split('/').str[1].fillna('None')
    df['Color Count'] = color.str.count('/') + 1
    # Take the LAST word of the value: for entries shaped like
    # "<status> <sex>" (presumably e.g. "Neutered Male" -- verify against the
    # data) the first word is the sterilisation status, which is already
    # captured by 'Neutered'. Single-word values are kept unchanged.
    df['Sex'] = sex_upon_intake.str.extract(r'([A-Za-z]+)\s*$', expand=False)
    df['Neutered'] = sex_upon_intake.str.contains('Neutered|Spayed', case=False, na=False).astype(int)
    df['Has Name'] = df['Name'].notna().astype(int) if 'Name' in df.columns else 0
    df['In Austin'] = column_or_missing('Found Location').str.contains('Austin', case=False, na=False).astype(int)

    # Replace rare categories in Intake Condition and Animal Type
    for col in ['Intake Condition', 'Animal Type']:
        if col in df.columns:
            counts = df[col].value_counts()
            rare = counts[counts < 5].index
            df[col] = df[col].replace(rare, 'Other')

    outcome = df['Outcome Type'] if (is_train and 'Outcome Type' in df.columns) else None

    # Feature set selection
    features = [
        'Intake Type', 'Intake Condition', 'Animal Type', 'Sex', 'Neutered',
        'Age in Days', 'Is Mix', 'Has Name', 'In Austin',
        'Intake Year', 'Intake Month', 'Intake Day', 'Intake Weekday', 'Intake Hour',
        'Primary Color', 'Secondary Color', 'Color Count'
    ]
    # Include Length of Stay for training if present
    if is_train and 'Length of Stay' in df.columns:
        features.append('Length of Stay')

    # Keep only the features available in the DataFrame and drop the ones
    # that carry no information at all. Not done in place: the selection is
    # a slice of `df`, and mutating it raises SettingWithCopyWarning.
    df = df[[col for col in features if col in df.columns]].dropna(axis=1, how='all')

    return (df, outcome) if is_train else df

# ==============================================================================
# 2. Load the Data
# ==============================================================================
# Read the raw train / test files from the working directory
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# Build the feature frames; the training call also yields the target column
X, y = preprocess_data(train_df, is_train=True)
X_test = preprocess_data(test_df, is_train=False)

# 'Length of Stay' is only usable when the test features carry it too;
# otherwise drop it from the training features.
if 'Length of Stay' in X.columns.difference(X_test.columns):
    X = X.drop(columns=['Length of Stay'])
# Give the test frame exactly the training columns, in the same order;
# columns the test frame lacks are filled with NaN.
X_test = X_test.reindex(columns=X.columns)

# ==============================================================================
# 3. Encode the Target Variable
# ==============================================================================
from sklearn.preprocessing import LabelEncoder

# `le` is kept around so integer predictions can be mapped back to labels
le = LabelEncoder()
y_encoded = le.fit(y).transform(y)

# ==============================================================================
# 4. Split Data into Training and Validation Sets
# ==============================================================================
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the encoded target, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# ==============================================================================
# 5. Build Preprocessing Pipeline for Numeric and Categorical Variables
# ==============================================================================
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Identify numeric and categorical features.
# Select by dtype family rather than by an exact-dtype whitelist: recent
# pandas versions return int32 from the `.dt` accessors (year, month, ...),
# so a check against [np.int64, np.float64] would silently route those
# calendar features to the one-hot encoder.
numeric_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = [col for col in X_train.columns if col not in numeric_features]

# Numeric columns: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each sub-pipeline to its own column group
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# ==============================================================================
# 6. Build an Ensemble Model using OneVsRestClassifier
# ==============================================================================
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

# One random forest is fitted per class by the one-vs-rest wrapper.
# The base estimator can be substituted with others (e.g. SVM or KNN).
base_estimator = RandomForestClassifier(random_state=42)
ovr_classifier = OneVsRestClassifier(estimator=base_estimator)

# Preprocessing and classifier chained into a single estimator
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', ovr_classifier),
])

# ==============================================================================
# 7. Hyperparameter Tuning with GridSearchCV
# ==============================================================================
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, make_scorer

# Keys address the random forest nested inside the one-vs-rest wrapper,
# which itself is the 'clf' step of the pipeline.
param_grid = dict(
    clf__estimator__n_estimators=[100, 200],
    clf__estimator__max_depth=[5, 10],
    clf__estimator__min_samples_split=[2, 5],
)

# Shuffled, stratified 5-fold CV scored by balanced accuracy
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scorer = make_scorer(balanced_accuracy_score)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scorer,
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
)

print("Tuning the OneVsRestClassifier ensemble...")
grid_search.fit(X_train, y_train)

print("\nBest Parameters:", grid_search.best_params_)
print("Best CV Balanced Accuracy: {:.4f}".format(grid_search.best_score_))

# ==============================================================================
# 8. Evaluate on the Validation Set
# ==============================================================================
# Score the tuned pipeline on the held-out validation split
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
val_bal_acc = balanced_accuracy_score(y_val, y_val_pred)
print("\nValidation Balanced Accuracy: {:.4f}".format(val_bal_acc))

# ==============================================================================
# 9. Retrain on Full Training Data and Predict Test Set Outcomes
# ==============================================================================
# Refit the same pipeline on all labelled rows, then predict the test rows
test_preds = best_model.fit(X, y_encoded).predict(X_test)
# Map integer class ids back to the original outcome labels
test_preds_labels = le.inverse_transform(test_preds)

# ==============================================================================
# 10. Prepare Submission File
# ==============================================================================
# Two columns: the test row id and its predicted outcome label
submission = test_df[['Id']].assign(**{'Outcome Type': test_preds_labels})
submission.to_csv('submission_ovr_ensemble.csv', index=False)
print("\n📄 Submission file saved as 'submission_ovr_ensemble.csv'")
print(submission.head())
