# Fashion Survey Data Exploration

This notebook explores the fashion survey data to understand patterns in sock preferences based on demographics, personality traits (MBTI), and fashion attitudes.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# plotting/data libraries — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

# Do not truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

# Plot defaults: seaborn grid theme and a wider default figure size
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

## Load Data

We'll load the integrated dataset that contains both achromatic and chromatic sock preferences.

In [None]:
# Read the integrated survey CSV (path is relative to the notebook's working directory)
df = pd.read_csv('../data/raw/fashion_survey_integrated.csv')

# Report the frame's dimensions and column names in a single print call
column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}", f"\nColumns: {column_names}", sep="\n")

# Last expression of the cell: preview of the first rows
df.head()

## Data Overview

In [None]:
# Number of null entries in each column
missing_counts = df.isna().sum()
print("Missing values:")
print(missing_counts)

# dtype pandas assigned to each column
print("\nData types:")
print(df.dtypes)

# describe() is left as the cell's last expression so its result is displayed
print("\nBasic statistics:")
df.describe()

## Demographics Distribution

In [None]:
# Three side-by-side panels: gender counts, age histogram, target-group counts
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Gender distribution
# value_counts() orders its result by frequency, so pairing the counts with a
# fixed ['Male', 'Female'] list labels the bars by whichever category happens
# to be the majority. Take each bar's label from the Series index instead so a
# label can never be attached to the wrong count.
# NOTE(review): the encoding of `gender` is not visible here; once confirmed,
# map the codes to display names, e.g. gender_counts.rename(index={...}).
gender_counts = df['gender'].value_counts()
axes[0].bar(gender_counts.index.astype(str), gender_counts.values)
axes[0].set_title('Gender Distribution')
axes[0].set_xlabel('Gender')
axes[0].set_ylabel('Count')

# Age distribution
# Exclude missing ages explicitly rather than relying on the plotting
# library's handling of NaN values.
axes[1].hist(df['age'].dropna(), bins=20, edgecolor='black')
axes[1].set_title('Age Distribution')
axes[1].set_xlabel('Age')
axes[1].set_ylabel('Count')

# Target group distribution, ordered by group value
target_counts = df['target_group'].value_counts().sort_index()
axes[2].bar(target_counts.index, target_counts.values)
axes[2].set_title('Target Group Distribution')
axes[2].set_xlabel('Target Group')
axes[2].set_ylabel('Count')
# NOTE(review): ticks are fixed at 1-4, which assumes the group codes are the
# integers 1 through 4 — confirm against the data.
axes[2].set_xticks(range(1, 5))

plt.tight_layout()
plt.show()

## MBTI Distribution

In [None]:
# One bar chart per MBTI dichotomy column
mbti_cols = ['mbti_e_i', 'mbti_s_n', 'mbti_t_f', 'mbti_j_p']
mbti_labels = ['E/I', 'S/N', 'T/F', 'J/P']

# 2x2 grid, flattened so panels can be paired with the columns in order
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.flatten()

for ax, col, label in zip(axes, mbti_cols, mbti_labels):
    first_letter, second_letter = label.split('/')
    counts = df[col].value_counts().sort_index()
    # Bars are labelled by position: the lowest sorted value gets the second
    # letter of the pair, the next one gets the first letter.
    # NOTE(review): assumes each column holds exactly two codes in that
    # order — confirm against the dataset's encoding.
    ax.bar([second_letter, first_letter], counts.values)
    ax.set(title=f'MBTI: {label}', ylabel='Count')

plt.tight_layout()
plt.show()

## Fashion Attitudes

In [None]:
# Columns holding the six fashion-attitude scores
attitude_cols = [
    'individuality_orientation', 'ostentation_orientation',
    'sports_orientation', 'clothing_practicality',
    'clothing_appearance', 'fashion_involvement',
]

# One box per attitude column on a single axis; keep the returned Axes so the
# title and labels are set on it directly
ax = df[attitude_cols].boxplot(figsize=(14, 6))
ax.set_title('Fashion Attitude Distributions')
ax.set_ylabel('Score')

# Tilt the column names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Sock Preference Analysis

In [None]:
# Select every column whose name contains the achromatic-sock marker
achromatic_cols = [name for name in df.columns if 'achromatic_sock_' in name]
achromatic_prefs = df[achromatic_cols]

print("Achromatic Sock Preference Summary:")
print(f"Number of options: {len(achromatic_cols)}")
print("\nRanking frequency (non-zero):")

# For each option, count the rows with a value strictly greater than zero and
# report that count as a share of all rows in df
positive_counts = achromatic_prefs.gt(0).sum()
total_respondents = len(df)
for col, non_zero in positive_counts.items():
    print(f"  {col}: {non_zero} respondents ({non_zero/total_respondents*100:.1f}%)")

In [None]:
# Select every column whose name contains the colour-sock marker
color_cols = [name for name in df.columns if 'color_sock_' in name]
color_prefs = df[color_cols]

print("Chromatic Sock Preference Summary:")
print(f"Number of options: {len(color_cols)}")
print("\nRanking frequency (non-zero):")

# For each option, count the rows with a value strictly greater than zero and
# report that count as a share of all rows in df
positive_counts = color_prefs.gt(0).sum()
total_respondents = len(df)
for col, non_zero in positive_counts.items():
    print(f"  {col}: {non_zero} respondents ({non_zero/total_respondents*100:.1f}%)")

## Correlation Analysis

In [None]:
# Features included in the pairwise correlation matrix
key_features = [
    'age',
    'gender',
    'individuality_orientation',
    'ostentation_orientation',
    'sports_orientation',
    'clothing_practicality',
    'clothing_appearance',
    'fashion_involvement',
    'target_group',
]

# NOTE(review): gender and target_group look like coded categories; if so,
# their coefficients should be read with care — confirm the encoding.
correlation_matrix = df[key_features].corr()

# Annotated heatmap with the colour scale centred on zero
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
heatmap_ax.set_title('Correlation Matrix of Key Features')
plt.tight_layout()
plt.show()

## Target Group Characteristics

In [None]:
# Attitude scores to break down by target group.
# Note: this rebinds attitude_cols, which an earlier cell defines with six columns.
attitude_cols = [
    'individuality_orientation', 'ostentation_orientation',
    'sports_orientation', 'fashion_involvement',
]

# 2x2 grid, flattened so each panel pairs with one attitude column
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for ax, col in zip(axes, attitude_cols):
    df.boxplot(column=col, by='target_group', ax=ax)
    # Set after the boxplot call so these are the title and labels shown
    ax.set(
        title=col.replace('_', ' ').title(),
        xlabel='Target Group',
        ylabel='Score',
    )

# Figure-level heading, placed slightly above the panels
fig.suptitle('Fashion Attitudes by Target Group', y=1.02, fontsize=14)
plt.tight_layout()
plt.show()

## Summary

Key findings from the data exploration:

1. **Demographics**: The dataset is evenly distributed across age groups, with a slight female majority
2. **MBTI**: The distributions of personality types are consistent with those reported in population studies
3. **Fashion Attitudes**: Fashion involvement and clothing practicality are generally high
4. **Sock Preferences**: Both achromatic and chromatic options have varied preferences
5. **Target Groups**: Four distinct groups emerge based on color and style preferences

These patterns will inform our recommendation model development.