In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/petfinder-adoption-prediction/state_labels.csv
/kaggle/input/petfinder-adoption-prediction/color_labels.csv
/kaggle/input/petfinder-adoption-prediction/StateLabels.csv
/kaggle/input/petfinder-adoption-prediction/PetFinder-BreedLabels.csv
/kaggle/input/petfinder-adoption-prediction/BreedLabels.csv
/kaggle/input/petfinder-adoption-prediction/ColorLabels.csv
/kaggle/input/petfinder-adoption-prediction/PetFinder-ColorLabels.csv
/kaggle/input/petfinder-adoption-prediction/PetFinder-StateLabels.csv
/kaggle/input/petfinder-adoption-prediction/breed_labels.csv
/kaggle/input/petfinder-adoption-prediction/train_metadata/2ead80817-2.json
/kaggle/input/petfinder-adoption-prediction/train_metadata/6f2721c52-9.json
/kaggle/input/petfinder-adoption-prediction/train_metadata/a63a0ac25-3.json
/kaggle/input/petfinder-adoption-prediction/train_metadata/d1be4f433-1.json
/kaggle/input/petfinder-adoption-prediction/train_metadata/af6364af9-5.json
/kaggle/input/petfinder-adoption-prediction/tra

KeyboardInterrupt: 

In [2]:
# ===================================================================
# 1. SETUP & INITIALIZATION
# ===================================================================
import os
import json
import warnings
import numpy as np
import pandas as pd
import scipy as sp
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm

# Suppress warnings
warnings.filterwarnings("ignore")

# --- IMPORTANT: SET YOUR BASE PATH HERE ---
# If you place this script inside the 'PetFinder.my Adoption Prediction' folder,
# the default './' is correct.
# If the script is outside, change it to './PetFinder.my Adoption Prediction/'.
BASE_PATH = '/kaggle/input/petfinder-adoption-prediction/'
# ------------------------------------------

# Define file paths using the BASE_PATH
TRAIN_PATH = os.path.join(BASE_PATH, 'train/train.csv')
TEST_PATH = os.path.join(BASE_PATH, 'test/test.csv')

# ===================================================================
# 2. DATA LOADING & HELPER FUNCTIONS
# ===================================================================
print("Loading data...")
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
sample_submission = pd.read_csv(os.path.join(BASE_PATH, 'test/sample_submission.csv'))

def load_and_parse_json_data(pet_ids, data_type='metadata'):
    """
    Loads and parses JSON files (metadata or sentiment) for a given list of pet IDs.
    
    Args:
        pet_ids (list): List of PetIDs to process.
        data_type (str): 'metadata' or 'sentiment'.

    Returns:
        pd.DataFrame: A dataframe containing parsed features.
    """
    features = []
    # Determine the correct folder based on whether we are processing train or test data
    folder = f'train_{data_type}/' if pet_ids.name == 'train' else f'test_{data_type}/'
    
    for pet_id in tqdm(pet_ids, desc=f"Parsing {data_type} JSONs"):
        file_path = os.path.join(BASE_PATH, folder, f'{pet_id}-1.json')
        
        pet_features = {'PetID': pet_id}
        
        if os.path.exists(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            if data_type == 'sentiment' and 'documentSentiment' in data:
                pet_features['sentiment_magnitude'] = data['documentSentiment'].get('magnitude', 0)
                pet_features['sentiment_score'] = data['documentSentiment'].get('score', 0)
            
            elif data_type == 'metadata':
                if 'labelAnnotations' in data:
                    labels = [ann['description'] for ann in data['labelAnnotations']]
                    scores = [ann['score'] for ann in data['labelAnnotations']]
                    pet_features['annots_top_desc'] = labels[0] if labels else 'none'
                    pet_features['annots_mean_score'] = np.mean(scores) if scores else 0
                if 'imagePropertiesAnnotation' in data and 'dominantColors' in data['imagePropertiesAnnotation']:
                    colors = data['imagePropertiesAnnotation']['dominantColors']['colors']
                    pixel_fractions = [color['pixelFraction'] for color in colors]
                    pet_features['dominant_pixel_frac'] = pixel_fractions[0] if pixel_fractions else 0
        
        features.append(pet_features)
        
    return pd.DataFrame(features)

print("Processing JSON data (this might take a few minutes)...")
# Process training data JSONs
train_pet_ids = train_df['PetID'].rename('train') # Use rename to track origin
train_sentiment_df = load_and_parse_json_data(train_pet_ids, 'sentiment')
train_metadata_df = load_and_parse_json_data(train_pet_ids, 'metadata')

# Process testing data JSONs
test_pet_ids = test_df['PetID'].rename('test') # Use rename to track origin
test_sentiment_df = load_and_parse_json_data(test_pet_ids, 'sentiment')
test_metadata_df = load_and_parse_json_data(test_pet_ids, 'metadata')

# ===================================================================
# 3. FEATURE ENGINEERING
# ===================================================================
print("Starting feature engineering...")

# Combine train and test for consistent processing
all_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)

# Merge JSON features
train_metadata_df['PetID'] = train_pet_ids.values
test_metadata_df['PetID'] = test_pet_ids.values
train_sentiment_df['PetID'] = train_pet_ids.values
test_sentiment_df['PetID'] = test_pet_ids.values

all_metadata_df = pd.concat([train_metadata_df, test_metadata_df], ignore_index=True)
all_sentiment_df = pd.concat([train_sentiment_df, test_sentiment_df], ignore_index=True)

all_df = all_df.merge(all_metadata_df, on='PetID', how='left')
all_df = all_df.merge(all_sentiment_df, on='PetID', how='left')

# --- Tabular & Basic Features ---
all_df['Name'] = all_df['Name'].fillna('No Name')
all_df['Description'] = all_df['Description'].fillna('')
all_df['NoName'] = (all_df['Name'] == 'No Name').astype(int)
all_df['DescriptionLength'] = all_df['Description'].apply(len)
all_df['PureBreed'] = (all_df['Breed2'] == 0).astype(int)
all_df['PhotoAmt'] = all_df['PhotoAmt'].clip(0, 15) # Clip to reduce outlier effect
all_df['Fee_per_pet'] = all_df['Fee'] / (all_df['Quantity'] + 1e-6)

# --- Text Features (TF-IDF + SVD) ---
n_components_svd = 10
tfidf = TfidfVectorizer(min_df=3, max_features=10000, strip_accents='unicode',
                        analyzer='word', token_pattern=r'\w{1,}',
                        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1)

svd = TruncatedSVD(n_components=n_components_svd, random_state=42)

text_features = tfidf.fit_transform(all_df['Description'])
text_svd_features = svd.fit_transform(text_features)
text_svd_features = pd.DataFrame(text_svd_features, columns=[f'SVD_{i}' for i in range(n_components_svd)])
all_df = pd.concat([all_df, text_svd_features], axis=1)

# --- Categorical Feature Handling ---
categorical_cols = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3',
                    'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
                    'Sterilized', 'Health', 'State', 'annots_top_desc']

for col in categorical_cols:
    all_df[col] = pd.Categorical(all_df[col]).codes

# --- Drop unnecessary columns ---
cols_to_drop = ['Name', 'RescuerID', 'Description', 'PetID']
final_df = all_df.drop(columns=cols_to_drop)

# Separate train and test again
X = final_df.loc[final_df['AdoptionSpeed'].notna()].drop(columns=['AdoptionSpeed'])
y = final_df.loc[final_df['AdoptionSpeed'].notna(), 'AdoptionSpeed'].astype(int)
X_test = final_df.loc[final_df['AdoptionSpeed'].isna()].drop(columns=['AdoptionSpeed'])

print("Feature engineering complete. Final feature shapes:")
print(f"X_train: {X.shape}, X_test: {X_test.shape}")

# ===================================================================
# 4. MODEL TRAINING (LightGBM with Cross-Validation)
# ===================================================================
print("\nStarting model training with 5-fold cross-validation...")

# LightGBM parameters - tuned for this competition
lgb_params = {
    'objective': 'multiclass',
    'num_class': 5,
    'boosting_type': 'gbdt',
    'n_estimators': 500,
    'learning_rate': 0.02,
    'num_leaves': 31,
    'max_depth': -1,
    'seed': 42,
    'n_jobs': -1,
    'verbose': -1,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
}

N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof_preds = np.zeros((len(X), 5))
test_preds = np.zeros((len(X_test), 5))
feature_importances = pd.DataFrame(index=X.columns)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='multi_logloss',
              callbacks=[lgb.early_stopping(100, verbose=False)])
    
    oof_preds[val_idx] = model.predict_proba(X_val)
    test_preds += model.predict_proba(X_test) / N_SPLITS
    feature_importances[f'fold_{fold+1}'] = model.feature_importances_

oof_labels = np.argmax(oof_preds, axis=1)
qwk_score = cohen_kappa_score(y, oof_labels, weights='quadratic')
print(f"\nOverall OOF QWK score (with simple argmax): {qwk_score:.4f}")

# ===================================================================
# 5. PREDICTION OPTIMIZATION (Thresholding)
# ===================================================================
print("\nOptimizing prediction thresholds for QWK...")

class OptimizedRounder:
    """
    An optimizer for finding the best thresholds for rounding continuous predictions.
    """
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        preds = pd.cut(X, [-np.inf] + list(coef) + [np.inf], labels=[0, 1, 2, 3, 4])
        return -cohen_kappa_score(y, preds, weights='quadratic')

    def fit(self, X, y):
        loss_partial = lambda coef: self._kappa_loss(coef, X, y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')['x']

    def predict(self, X):
        return pd.cut(X, [-np.inf] + list(self.coef_) + [np.inf], labels=[0, 1, 2, 3, 4])

# The model predicts classes, but QWK works better on a continuous scale.
# We create a "continuous" prediction by taking a weighted average of class probabilities.
oof_continuous_preds = np.sum(oof_preds * np.arange(5), axis=1)

optimizer = OptimizedRounder()
optimizer.fit(oof_continuous_preds, y)

optimized_thresholds = optimizer.coef_
print(f"Optimized thresholds found: {optimized_thresholds}")

# Apply thresholds to OOF predictions and check the new score
oof_preds_optimized = optimizer.predict(oof_continuous_preds)
optimized_qwk = cohen_kappa_score(y, oof_preds_optimized, weights='quadratic')
print(f"OOF QWK score after optimization: {optimized_qwk:.4f}")

# ===================================================================
# 6. SUBMISSION
# ===================================================================
print("\nGenerating final submission file...")

# Apply the same logic to test predictions
test_continuous_preds = np.sum(test_preds * np.arange(5), axis=1)
final_test_preds = optimizer.predict(test_continuous_preds).astype(int)

# Create submission file
submission_df = pd.DataFrame({
    'PetID': test_df['PetID'],
    'AdoptionSpeed': final_test_preds
})
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!")
print(submission_df.head())
print("\nProgram finished.")

Loading data...
Processing JSON data (this might take a few minutes)...


Parsing sentiment JSONs: 100%|██████████| 14993/14993 [00:09<00:00, 1537.49it/s]
Parsing metadata JSONs: 100%|██████████| 14993/14993 [00:49<00:00, 301.24it/s]
Parsing sentiment JSONs: 100%|██████████| 3972/3972 [00:03<00:00, 1271.09it/s]
Parsing metadata JSONs: 100%|██████████| 3972/3972 [00:21<00:00, 189.06it/s]


Starting feature engineering...
Feature engineering complete. Final feature shapes:
X_train: (14993, 36), X_test: (3972, 36)

Starting model training with 5-fold cross-validation...
--- Fold 1/5 ---
--- Fold 2/5 ---
--- Fold 3/5 ---
--- Fold 4/5 ---
--- Fold 5/5 ---

Overall OOF QWK score (with simple argmax): 0.3698

Optimizing prediction thresholds for QWK...


ValueError: bins must increase monotonically.