# Prep dataset

In [None]:
!wget https://data.caltech.edu/records/65de6-vp158/files/CUB_200_2011.tgz?download=1

In [None]:
!tar -xvzf CUB_200_2011.tgz?download=1

In [None]:
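# Download the cleaned attribute labels (image_attribute_labels_clean.txt) from
# https://www.kaggle.com/datasets/wenewone/cub2002011?resource=download
# and place the file in CUB_200_2011/attributes/, which is where the loading code below reads it from.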

# Fit linear model

In [5]:
import pandas as pd
import numpy as np
from os.path import join
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm
import joblib
from collections import defaultdict
from joblib import Memory
import time
# On-disk cache used through the `@memory.cache` decorator.
memory = Memory(verbose=0, location='./CUB_200_cache')


# Directory holding the extracted dataset files.
CUB_DSET_DIR = 'CUB_200_2011'

# Attribute annotations: a space-separated, headerless table read with the
# five column names given below.
# (The class-name table, classes.txt, is not loaded here.)
image_attribute_labels_clean = pd.read_csv(
    join(CUB_DSET_DIR, 'attributes/image_attribute_labels_clean.txt'),
    header=None,
    sep=' ',
    names=['image_id', 'attribute_id', 'is_present', 'certainty_id', 'time'],
)

# Image-to-class mapping, plus the class id column as a plain NumPy array.
image_class_labels = pd.read_csv(
    join(CUB_DSET_DIR, 'image_class_labels.txt'),
    header=None,
    sep=' ',
    names=['image_id', 'class_id'],
)
image_classes = image_class_labels['class_id'].to_numpy()

# Hard-coded dimensions of the image x attribute matrix built from these tables.
num_images, num_attributes = 11788, 312

In [None]:
# Build the dense (num_images x num_attributes) matrix of `is_present` values.
#
# This is a vectorized replacement for a per-row `iterrows()` loop: one
# fancy-indexed assignment fills every annotated cell at once, which is far
# faster than visiting the annotation table row by row in Python.

# If an (image, attribute) pair is listed more than once, keep only its last
# row. Sequential assignment would let the last row win, whereas NumPy gives no
# ordering guarantee when the same index appears twice in one assignment.
_unique_labels = image_attribute_labels_clean.drop_duplicates(
    subset=['image_id', 'attribute_id'], keep='last')

# Ids in the table are 1-based; shift them to 0-based array indices.
_image_idx = _unique_labels['image_id'].to_numpy().astype(int) - 1
_attribute_idx = _unique_labels['attribute_id'].to_numpy().astype(int) - 1

# Cells without an annotation stay 0.
matrix = np.zeros((num_images, num_attributes), dtype=int)
matrix[_image_idx, _attribute_idx] = _unique_labels['is_present'].to_numpy()

# Persist the matrix so it can be reloaded without rebuilding it.
joblib.dump(matrix, 'cub_matrix.pkl')

In [None]:
# Reload the attribute matrix from 'cub_matrix.pkl', the file written by the
# matrix-building code via joblib.dump (presumably so that step can be skipped
# in a fresh session).
matrix = joblib.load('cub_matrix.pkl')

In [None]:
# Named groups of 1-indexed ids. Every group is a contiguous run, so each one is
# written as (name, first id, last id) with both ends inclusive and expanded
# into an explicit list.
# NOTE(review): the group names suggest these are bird class ids -- confirm.
CATEGORIES = {
    name: list(range(first, last + 1))
    for name, first, last in [
        ('Albatross', 1, 3),
        ('Auklet', 5, 8),
        ('Blackbird', 9, 12),
        ('Bunting', 14, 16),
        ('Flycatcher', 37, 43),
        ('Sparrow', 113, 133),
        ('Tern', 141, 147),
    ]
}

In [None]:
@memory.cache
def get_auc(X, y, random_state=42, model='logistic'):
    """Fit one classifier on a 50/50 split and score it on the held-out half.

    Parameters
    ----------
    X : array-like
        Feature matrix; rows are the samples that get split.
    y : array-like
        Class labels aligned with the rows of ``X``.
    random_state : int
        Seed for the train/test split and for the fitted model.
    model : {'logistic', 'tree', 'lassocv'}
        ``'logistic'``: ``LogisticRegression`` with the lbfgs solver.
        ``'tree'``: ``DecisionTreeClassifier`` limited to depth 4.
        ``'lassocv'``: L1-penalised ``LogisticRegressionCV`` (saga, 3 Cs,
        3 folds).

    Returns
    -------
    tuple
        ``(auc, frac_zero_coefs)``: the one-vs-rest ROC AUC on the test half,
        and the fraction of fitted coefficients (feature importances for the
        tree) that are exactly zero.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # Choose the estimator and the fitted attribute that holds its weights.
    if model == 'logistic':
        m = LogisticRegression(
            max_iter=1000, solver='lbfgs', random_state=random_state)
        weights_attr = 'coef_'
    elif model == 'tree':
        # Seed the tree as well: otherwise a result cached under a given
        # `random_state` would not be reproducible.
        m = DecisionTreeClassifier(max_depth=4, random_state=random_state)
        weights_attr = 'feature_importances_'
    elif model == 'lassocv':
        m = LogisticRegressionCV(
            max_iter=200,
            random_state=random_state,
            penalty='l1',
            Cs=3,
            solver='saga',
            cv=3
        )
        weights_attr = 'coef_'
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            f"Unknown model {model!r}; expected 'logistic', 'tree' or 'lassocv'")

    # Split into equally sized train and test halves.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=random_state)

    m.fit(X_train, y_train)
    frac_zero_coefs = np.mean(getattr(m, weights_attr) == 0)

    y_proba = m.predict_proba(X_test)
    auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
    return auc, frac_zero_coefs


# Run every (model, category, seed) combination and collect one result row each.
# `d` is shared by all models and never reset, so each per-model pickle written
# below contains every row accumulated so far, not only that model's rows.
d = defaultdict(list)
for model in ['lassocv', 'logistic', 'tree']:
    for category, category_ids in CATEGORIES.items():
        # NOTE(review): the 1-indexed CATEGORIES ids are used here as column
        # (attribute) indices of `matrix`, while `y` holds the class of every
        # image -- confirm this is intended rather than selecting the images
        # whose class belongs to the category.
        X = matrix[:, np.array(category_ids) - 1]
        y = image_classes
        for random_state in tqdm(range(5)):
            auc, frac_zero_coefs = get_auc(
                X, y, random_state=random_state, model=model)
            row = {
                'auc': auc,
                'frac_zero_coefs': frac_zero_coefs,
                'model': model,
                'category': category,
            }
            for column, value in row.items():
                d[column].append(value)

        # Show the running results table after each category.
        df = pd.DataFrame(d)
        display(df)

    # Checkpoint after each model finishes.
    df.to_pickle(f'CUB_200_results_{model}.pkl')

  0%|          | 0/5 [00:00<?, ?it/s]