In [72]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [73]:
df = pd.read_csv('outfits.csv')

# forward fill missing values
df = df.ffill()

# drop column '#'
df = df.drop(columns=['#'])

# normalise column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# convert dates
# strip spaces, normalise commas
df["date"] = df["date"].str.strip()
df["date"] = df["date"].str.replace(r"\s*,\s*", ",", regex=True)  # ensure exactly one comma, no spaces around


df["date"] = pd.to_datetime(df["date"], format="%B %d,%Y")
df['month'] = df['date'].dt.month
df['dayofweek'] = df['date'].dt.dayofweek

# split city, state, country
# if 2 commas - city, state, country
# if 1 comma - city, country


# Function to split properly
def split_city(x):
    parts = [p.strip() for p in x.split(',')]
    if len(parts) == 3:
        return pd.Series({'city': parts[0], 'state': parts[1], 'country': parts[2]})
    elif len(parts) == 2:
        return pd.Series({'city': parts[0], 'state': None, 'country': parts[1]})
    else:  # fallback
        return pd.Series({'city': parts[0], 'state': None, 'country': None})

# Apply function
df[['city', 'state', 'country']] = df['city'].apply(split_city)


df.head()

Unnamed: 0,city,date,lover_bodysuit,lover_jacket,lover_guitar,fearless_dress,evermore,red_t-shirt,red_guitar,speak_now,...,two_piece,coverall,surprise_song,midnights_shirt,bodysuit,karma_jacket,month,dayofweek,state,country
0,Glendale,2023-03-17,Pink and Blue,Silver,Purple,Fringe,Orange,A Lot,Red,Champagne,...,0,0,Pink,Silver Sequin,Navy,Multicolor,3,4,Arizona,USA
1,Glendale,2023-03-18,Blue and Gold,Black,Blue,Gold Noodle,Orange,TS - EW,Red,Pink Ball Gown,...,0,0,Green,Blue,Navy,Magenta,3,5,Arizona,USA
2,Las Vegas,2023-03-24,Pink and Blue,Silver,Purple,Fringe,Orange,A Lot,Red,Champagne,...,0,0,Pink,Silver Sequin,Blurple,Multicolor,3,4,Nevada,USA
3,Las Vegas,2023-03-25,Blue and Gold,Black,Pink,Gold Noodle,Orange,TS - EW,Red,Pink Ball Gown,...,0,0,Green,Blue,Navy,Pink,3,5,Nevada,USA
4,Arlington,2023-03-31,Blue and Gold,Black,Blue,Gold Noodle,Orange,WANEGBT,Red,Champagne,...,0,0,Green,Pink,Blurple,Multicolor,3,4,Texas,USA


In [74]:
# additional features
df['night_number'] = df.groupby('city').cumcount() + 1
df['overall_show_number'] = df.reset_index().index + 1

# previous night's lover bodysuit color
df['prev_lover_bodysuit'] = df.groupby('city')['lover_bodysuit'].shift(1)

# Fill first-night Bodysuit with the most common first-night Bodysuit in each city
def fill_first_night(group):
    first_show_bodysuit = group['lover_bodysuit'].iloc[0]
    group['prev_lover_bodysuit'] = group['prev_lover_bodysuit'].fillna(first_show_bodysuit)
    return group

df = df.groupby('city').apply(fill_first_night)


  df = df.groupby('city').apply(fill_first_night)


In [None]:
lover_bodysuit_release_dates = {
    'Pink and Blue': "2023-03-17",
    'Blue and Gold': "2023-03-18",
    'Purple Tassels': "2023-05-27", 
    'Barbie Pink': "2023-11-09",
    'Orange and Pink': "2024-05-09"
}


# convert to datetime
lover_bodysuit_release_dates = {k: pd.to_datetime(v) for k,v in lover_bodysuit_release_dates.items()}

# helper function: valid bodysuits for a given date
def valid_bodysuits_for_date(date):
    return [k for k,v in lover_bodysuit_release_dates.items() if pd.to_datetime(date) >= v]



In [None]:

X = df[['city', 'month', 'dayofweek', 'night_number', 'overall_show_number', 'prev_lover_bodysuit', 'state', 'country']]
y = df['lover_bodysuit']

# Encode categorical feature
X = pd.get_dummies(X, columns=['city', 'state', 'country', 'prev_lover_bodysuit'], drop_first=True)

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Time-based train-test split
train_mask = df['date'] < "2023-08-01"
X_train, X_test = X[train_mask], X[~train_mask]
y_train, y_test = y_encoded[train_mask], y_encoded[~train_mask]

# Model
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.4


In [90]:
results = pd.DataFrame({
    'date': df.loc[~train_mask, 'date'],
    'city': df.loc[~train_mask, 'city'],
    'night_number': df.loc[~train_mask, 'night_number'],
    'true_bodysuit': le.inverse_transform(y_test),
    'pred_bodysuit': le.inverse_transform(y_pred)
})

# Add a column showing if it was correct
results['correct'] = results['true_bodysuit'] == results['pred_bodysuit']

# Filter misclassified rows
misclassified = results[results['correct'] == False]

# Inspect
print(f"Number of misclassified rows: {len(misclassified)}")
misclassified.head(10)


Number of misclassified rows: 63


Unnamed: 0_level_0,Unnamed: 1_level_0,date,city,night_number,true_bodysuit,pred_bodysuit,correct
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Amsterdam,111,2024-07-06,Amsterdam,3,Orange and Pink,Pink and Blue,False
Buenos Aires,57,2023-11-09,Buenos Aires,1,Barbie Pink,Pink and Blue,False
Buenos Aires,58,2023-11-11,Buenos Aires,2,Pink and Blue,Blue and Gold,False
Dublin,106,2024-06-28,Dublin,1,Barbie Pink,Pink and Blue,False
Dublin,107,2024-06-29,Dublin,2,Purple Tassels,Blue and Gold,False
Edinburgh,98,2024-06-09,Edinburgh,3,Purple Tassels,Blue and Gold,False
Gelsenkirchen,116,2024-07-17,Gelsenkirchen,1,Barbie Pink,Pink and Blue,False
Gelsenkirchen,118,2024-07-19,Gelsenkirchen,3,Purple Tassels,Blue and Gold,False
Hamburg,120,2024-07-24,Hamburg,2,Orange and Pink,Blue and Gold,False
Indianapolis,138,2024-11-02,Indianapolis,2,Orange and Pink,Pink and Blue,False
