In [1]:
# --- 1. Import Libraries ---
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Apply seaborn's "whitegrid" theme to subsequent plots.
sns.set(style="whitegrid")
# IPython line magic (not plain Python): render matplotlib figures inline in the cell output.
%matplotlib inline

# Progress marker for the cell output.
print("✅ Libraries imported successfully")

# --- 2. Load Raw Data ---
# Read the raw survey export; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="data/raw_susdenim_cons_survey.csv")

# Confirm the load and list every column that came in with the raw file.
print(
    "✅ Raw data loaded",
    f"Columns in raw data: {df.columns.tolist()}",
    sep="\n",
)

# --- 3. Keep Only Necessary Columns ---
# The selection is assembled group by group; the resulting order is the column
# order of the trimmed frame.
cols_to_keep = (
    # Respondent attributes
    [
        "age",
        "education_level",
        "gender",
        "income_level",
        "jeanstype",
    ]
    # djcb items 0-3: one answer column each
    + [
        "data.djcb[0].answer",
        "data.djcb[1].answer",
        "data.djcb[2].answer",
        "data.djcb[3].answer",
    ]
    # djcb item 4: seven answer slots plus its custom answer
    + [
        "data.djcb[4].answer[0]",
        "data.djcb[4].answer[1]",
        "data.djcb[4].answer[2]",
        "data.djcb[4].answer[3]",
        "data.djcb[4].answer[4]",
        "data.djcb[4].answer[5]",
        "data.djcb[4].answer[6]",
        "data.djcb[4].customAnswer",
    ]
    # djcb2 items 0-5: one answer column each
    + [
        "data.djcb2[0].answer",
        "data.djcb2[1].answer",
        "data.djcb2[2].answer",
        "data.djcb2[3].answer",
        "data.djcb2[4].answer",
        "data.djcb2[5].answer",
    ]
)

# Label-based selection: raises KeyError if any listed column is absent.
df = df.loc[:, cols_to_keep]
print("\n✅ Kept only relevant columns")

# --- 4. Define Encodings for Education, Income & Gender ---
# education_level and income_level get rank-ordered integer codes;
# gender gets a plain 0/1 code.
education_map = {
    'High school': 1,
    'College/University': 2,
    'Master’s Degree/Phd': 3  # NOTE: typographic apostrophe (’), not a straight quote
}

income_map = {
    '<=17002': 1,
    '17003-34004': 2,
    '34005-51006': 3,
    '51007-68008': 4,
    '68009-85010': 5,
    '85011-102012': 6,
    '>=102013': 7
}

gender_map = {
    'male': 0,
    'female': 1
}

# Column name -> lookup table, so every column goes through the same checked path.
encoding_maps = {
    'education_level': education_map,
    'income_level': income_map,
    'gender': gender_map,
}

# Apply mappings.
# Series.map() converts every value that is absent from the dict to NaN without
# raising, so a spelling variant in the raw export (different apostrophe,
# capitalisation, an extra category) would be lost silently. Report such values
# before encoding so the loss is visible in the cell output.
for column_name, value_map in encoding_maps.items():
    observed_values = df[column_name].dropna().unique()
    unmapped_values = [value for value in observed_values if value not in value_map]
    if unmapped_values:
        print(f"⚠️ Unmapped values in '{column_name}' will become NaN: {unmapped_values}")
    df[column_name] = df[column_name].map(value_map)

# Verify mapping
print("\n✅ Ordinal encoding applied:")
print("Unique education levels mapped:", df['education_level'].unique())
print("Unique income levels mapped:", df['income_level'].unique())
print("Unique gender values mapped:", df['gender'].unique())

# --- 5. Rename Survey Question Columns ---
# Raw export column -> readable analysis name, filled in one section at a time.
column_rename_map = {}

# Purchase behavior
column_rename_map.update({
    'data.djcb[0].answer': 'purchase_frequency',
    'data.djcb[1].answer': 'purchase_channel',
    'data.djcb[2].answer': 'brand_preference',
    'data.djcb[3].answer': 'sustainable_purchase_history',
})

# Barriers (multi-select question, one column per answer slot)
column_rename_map.update({
    'data.djcb[4].answer[0]': 'barrier_awareness',
    'data.djcb[4].answer[1]': 'barrier_unimportant',
    'data.djcb[4].answer[2]': 'barrier_doubt',
    'data.djcb[4].answer[3]': 'barrier_new_concept',
    'data.djcb[4].answer[4]': 'barrier_style',
    'data.djcb[4].answer[5]': 'barrier_habit',
    'data.djcb[4].answer[6]': 'barrier_variety',
    'data.djcb[4].customAnswer': 'barrier_other_text',
})

# Sustainability feature importance (Likert scale)
column_rename_map.update({
    'data.djcb2[0].answer': 'importance_durability',
    'data.djcb2[1].answer': 'importance_organic_materials',
    'data.djcb2[2].answer': 'importance_recycled_materials',
    'data.djcb2[3].answer': 'importance_biodegradable',
    'data.djcb2[4].answer': 'importance_eco_friendly_dyeing',
    'data.djcb2[5].answer': 'importance_ethical_production',
})

# Rebind df to the renamed frame; columns not listed in the map keep their names.
df = df.rename(columns=column_rename_map)

# Optional: show the resulting column names
print("\n✅ Columns after renaming:", df.columns.tolist(), sep="\n")





# --- 6. Save Cleaned Data ---
# Single source of truth for the output location: the confirmation message
# previously reported a different file name than the one actually written.
output_path = "data/cleaned_denim_survey_for_portfolio_2.csv"

# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv(output_path, index=False)
print("\n✅ Final dataset saved at:", os.path.abspath(output_path))

✅ Libraries imported successfully
✅ Raw data loaded
Columns in raw data: ['duration', 'age', 'education_level', 'gender', 'income_level', 'group', 'jeanstype', 'choice', 'F_WTP', 'S_WTP', 'data.rcb[0].id', 'data.rcb[1].id', 'data.rcb[2].id', 'data.rcb[3].id', 'data.rcb[4].id', 'data.rcb[5].id', 'data.rcb[0].answer', 'data.rcb[1].answer', 'data.rcb[2].answer', 'data.rcb[3].answer', 'data.rcb[4].answer', 'data.rcb[5].answer', 'data.fi[0].id', 'data.fi[1].id', 'data.fi[2].id', 'data.fi[3].id', 'data.fi[4].id', 'data.fi[5].id', 'data.fi[0].answer', 'data.fi[1].answer', 'data.fi[2].answer', 'data.fi[3].answer', 'data.fi[4].answer', 'data.fi[5].answer', 'data.sfa[0].id', 'data.sfa[1].id', 'data.sfa[2].id', 'data.sfa[3].id', 'data.sfa[4].id', 'data.sfa[5].id', 'data.sfa[6].id', 'data.sfa[7].id', 'data.sfa[8].id', 'data.sfa[0].answer', 'data.sfa[1].answer', 'data.sfa[2].answer', 'data.sfa[3].answer', 'data.sfa[4].answer', 'data.sfa[5].answer', 'data.sfa[6].answer', 'data.sfa[7].answer', 'data.