In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [30]:
# Read the raw table, forward-fill missing cells from the row above,
# and discard the '#' column.
outfits_raw = pd.read_csv('outfits.csv')
df = outfits_raw.ffill().drop(columns=['#'])

# Column names: trim surrounding whitespace, lower-case, spaces -> underscores.
df = df.rename(columns=lambda name: name.strip().lower().replace(' ', '_'))

# Tidy the date strings before parsing: trim the ends and remove any
# whitespace on either side of a comma so they fit the "%B %d,%Y" format.
date_text = df["date"].str.strip().str.replace(r"\s*,\s*", ",", regex=True)
parsed_dates = pd.to_datetime(date_text, format="%B %d,%Y")

# Replace the text dates with parsed timestamps and append the two
# calendar columns derived from them.
df = df.assign(
    date=parsed_dates,
    month=parsed_dates.dt.month,
    dayofweek=parsed_dates.dt.dayofweek,
)

df.head()

Unnamed: 0,city,date,lover_bodysuit,lover_jacket,lover_guitar,fearless_dress,evermore,red_t-shirt,red_guitar,speak_now,...,1989.1,ttpd_dress,two_piece,coverall,surprise_song,midnights_shirt,bodysuit,karma_jacket,month,dayofweek
0,"Glendale, Arizona, USA",2023-03-17,Pink and Blue,Silver,Purple,Fringe,Orange,A Lot,Red,Champagne,...,,0,0,0,Pink,Silver Sequin,Navy,Multicolor,3,4
1,"Glendale, Arizona, USA",2023-03-18,Blue and Gold,Black,Blue,Gold Noodle,Orange,TS - EW,Red,Pink Ball Gown,...,,0,0,0,Green,Blue,Navy,Magenta,3,5
2,"Las Vegas, Nevada, USA",2023-03-24,Pink and Blue,Silver,Purple,Fringe,Orange,A Lot,Red,Champagne,...,,0,0,0,Pink,Silver Sequin,Blurple,Multicolor,3,4
3,"Las Vegas, Nevada, USA",2023-03-25,Blue and Gold,Black,Pink,Gold Noodle,Orange,TS - EW,Red,Pink Ball Gown,...,,0,0,0,Green,Blue,Navy,Pink,3,5
4,"Arlington, Texas, USA",2023-03-31,Blue and Gold,Black,Blue,Gold Noodle,Orange,WANEGBT,Red,Champagne,...,,0,0,0,Green,Pink,Blurple,Multicolor,3,4


In [32]:

# Feature matrix: 'city' is one-hot encoded, 'month' and 'dayofweek' pass through.
feature_cols = ['city', 'month', 'dayofweek']
X = pd.get_dummies(df[feature_cols], columns=['city'])

# Target column, mapped to integer class ids by the label encoder.
y = df['lover_bodysuit']
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Chronological split: rows dated before the cutoff train the model,
# all remaining rows are held out for evaluation.
train_mask = df['date'] < "2023-08-01"
test_mask = ~train_mask
X_train = X.loc[train_mask]
X_test = X.loc[test_mask]
# y_encoded is a plain NumPy array, so index it with the masks' raw boolean values.
y_train = y_encoded[train_mask.to_numpy()]
y_test = y_encoded[test_mask.to_numpy()]

# Fit the random forest (fixed seed for repeatable results).
clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.2761904761904762
