In [507]:
# Install missing plotting libraries
%pip install matplotlib seaborn

# Import relevant libaries
import pandas as pd
import os
from pathlib import Path
import re
import ast

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\Admin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [508]:
# Load the participants_rows.csv file
data_dir = Path('google_exports')
df_surveys_final = pd.read_csv(data_dir / 'df_surveys_with_timing_filtered.csv')

df_surveys_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 55 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   participant_id              52 non-null     object 
 1   background_submitted_at     52 non-null     object 
 2   q2_gender                   52 non-null     object 
 3   q1_age_group                52 non-null     object 
 4   q3_education                52 non-null     object 
 5   q5_nationality              52 non-null     object 
 6   q7_ai_familiarity           52 non-null     int64  
 7   q8_attention_check          52 non-null     int64  
 8   q10_additional_info         52 non-null     object 
 9   q4_employment_status        52 non-null     object 
 10  q6_country_residence        52 non-null     object 
 11  q9_ai_usage_frequency       52 non-null     object 
 12  q11_response                52 non-null     object 
 13  instruction_submitted_at    52 non-nu

## Data Cleaning and Inconsistencies Handling for surveys datasets

In [509]:
def clean_nationality(val):
    if pd.isna(val):
        return None

    text = str(val).strip().lower()

    # --- Manual fixes for common misspellings ---
    fixes = {
        "autrian": "austrian",
        "grrek": "greek",
        "kazakh ": "kazakh",
        "ukrainian": "ukrainian",
        "latvian": "latvian",
        "swiss": "switzerland",
        "de": "germany",
        "usa": "usa",
        "us": "usa"
    }
    if text in fixes:
        text = fixes[text]

    # --- Mapping from nationality to country ---
    mapping = {
        "uk": "UK",
        "british": "UK",
        "united kingdom": "UK",
        "scottish": "UK",
        "Uk": "UK",

        "german": "Germany",
        "deutschland": "Germany",

        "swiss": "Switzerland",
        "switzerland": "Switzerland",

        "vietnamese": "Vietnam",
        "vietnam": "Vietnam",

        "vietnamese-german": "Germany",   # choose Vietnam (or Germany?)

        "kazakh": "Kazakhstan",
        "kazakhstan": "Kazakhstan",

        "latvian": "Latvia",
        "latvia": "Latvia",

        "italian": "Italy",
        "italy": "Italy",

        "czech": "Czech Republic",
        "czechia": "Czech Republic",

        "libyan": "Libya",
        "libya": "Libya",

        "austrian": "Austria",
        "austria": "Austria",

        "american": "USA",
        "us": "USA",
        "usa": "USA",
        "united states": "USA",
        "america": "USA",

        "china": "China",
        "chinese": "China",

        "french": "France",
        "france": "France",

        "sri lankan": "Sri Lanka",
        "sri lanka": "Sri Lanka",

        "ukrainian": "Ukraine",
        "ukraine": "Ukraine",

        "taiwan": "Taiwan",
        "taiwanese": "Taiwan",

        "dutch": "Netherlands",
        "netherlands": "Netherlands",
        "nederland": "Netherlands",

        "lithuanian": "Lithuania",
        "lithuania": "Lithuania",

        "hungarian": "Hungary",
        "hungary": "Hungary",

        "sweden": "Sweden",
        "swedish": "Sweden",

        "turkish": "Turkey",
        "turkey": "Turkey",

        "indian": "India",
        "india": "India",

        "filipino": "Philippines",
        "philippines": "Philippines",

        "brazilian": "Brazil",
        "brazil": "Brazil",

        "colombian": "Colombia",
        "colombia": "Colombia",

        "greek": "Greece",
        "greece": "Greece",
    }

    # If exact match in mapping
    if text in mapping:
        return mapping[text]

    # If value contains multiple nationalities ("Vietnamese-german", "greek Italian")
    parts = [p.strip() for p in text.replace("-", " ").split()]
    for p in parts:
        if p in mapping:
            return mapping[p]

    return None  # if nothing matches


# ✔ APPLY DIRECTLY TO df_surveys_final
df_surveys_final["q5_nationality"] = df_surveys_final["q5_nationality"].apply(clean_nationality)


In [510]:
def clean_country(val):
    if pd.isna(val):
        return None

    text = str(val).strip().lower()

    # Mapping of valid residence values → standardized country names
    country_map = {
        # UK & variants
        "uk": "UK",
        "united kingdom": "UK",
        "england": "UK",
        "scotland": "UK",
        "britain": "UK",

        # USA & variants
        "usa": "USA",
        "united states": "USA",
        "united states of america": "USA",
        "america": "USA",

        # Germany & variants
        "germany": "Germany",
        "de": "Germany",

        # Switzerland
        "switzerland": "Switzerland",

        # Czech Republic
        "czech republic": "Czech Republic",
        "czechia": "Czech Republic",

        # Netherlands
        "netherlands": "Netherlands",
        "the netherlands": "Netherlands",

        # Italy
        "italy": "Italy",

        # France
        "france": "France",

        # Russia
        "russia": "Russia",

        # China
        "china": "China",

        # Vietnam
        "vietnam": "Vietnam",
        "ho chi minh city": "Vietnam",

        # Taiwan
        "taiwan": "Taiwan",

        # Latvia
        "latvia": "Latvia",

        # Lithuania
        "lithuania": "Lithuania",

        # Singapore
        "singapore": "Singapore",

        # Slovakia
        "slovakia": "Slovakia",

        # Hungary
        "hungary": "Hungary",

        # Sweden
        "sweden": "Sweden",

        # Greece
        "greece": "Greece",

        # India
        "india": "India",

        # Philippines
        "philippines": "Philippines",

        # Canada
        "canada": "Canada"
    }

    # Exact match after cleaning
    if text in country_map:
        return country_map[text]

    # Fallback: Title Case (handles already-correct country names)
    return text.title()


# Apply directly to df_surveys_final
df_surveys_final["q6_country_residence"] = df_surveys_final["q6_country_residence"].apply(clean_country)


### Column q10_additional_info

In [511]:
# Column name
col = "q10_additional_info"

# -----------------------------------------
# Helper functions
# -----------------------------------------

def clean_item(item):
    """Clean individual tool name based on your rules."""
    if not item or pd.isna(item):
        return None

    item = item.strip()

    # Standardize common misspellings
    corrections = {
        r"\bgoogle\b": "Google",
        r"\bamazon\b": "Amazon",
        r"\bshoppe\b": "Shopee",
        r"\bshopee\b": "Shopee",
        r"\bwalmart\b": "Walmart",
        r"\bshein\b": "Shein",
        r"\btiktok\b": "TikTok",
        r"chat ?gpt": "ChatGPT",
    }
    for pattern, replacement in corrections.items():
        item = re.sub(pattern, replacement, item, flags=re.IGNORECASE)

    # Standardize price comparison terms
    if re.search(r"price\s*compar", item, flags=re.IGNORECASE):
        return "Price comparison sites"

    # Capitalize non-brand general terms
    if item.isalpha() and item.lower() not in [
        "google", "amazon", "shopee", "etsy", "ebay", "vinted",
        "facebook", "tiktok", "walmart", "shein", "taobao"
    ]:
        item = item.capitalize()

    return item


def split_items(text):
    """Split a cell into separate cleaned items."""
    if pd.isna(text):
        return []

    # Split by commas, slashes, and/or "and", "or"
    parts = re.split(r"[,/]| and | or ", str(text), flags=re.IGNORECASE)

    cleaned = []
    for p in parts:
        p = clean_item(p)
        if p:
            cleaned.append(p)

    # Remove duplicates while keeping order
    unique = []
    for x in cleaned:
        if x not in unique:
            unique.append(x)

    return unique

# -----------------------------------------
# PROCESS THE COLUMN IN PLACE
# -----------------------------------------

# Create a list column with cleaned + split items
df_surveys_final["splitted"] = df_surveys_final[col].apply(split_items)

# Find max number of items in any row
max_len = df_surveys_final["splitted"].apply(len).max()

# Create q10_tool_1 ... q10_tool_n directly in df_surveys_final
for i in range(1, max_len + 1):
    df_surveys_final[f"q10_tool_{i}"] = df_surveys_final["splitted"].apply(
        lambda x: x[i-1] if len(x) >= i else None
    )

# Remove original and helper columns
df_surveys_final.drop(columns=[col, "splitted"], inplace=True)

# Updates to apply: index 16 -> q10_tool_1 "RedNote", q10_tool_2 "Taobao"
#                   index 29 -> q10_tool_1 "Amazon",  q10_tool_2 "Shein"
updates = {
    16: ("RedNote", "Taobao"),
    29: ("Amazon", "Shein"),
}

for idx, (t1, t2) in updates.items():
    expanded_df.at[idx, "q10_tool_1"] = t1
    expanded_df.at[idx, "q10_tool_2"] = t2

# If the original dataframe also contains these columns and matching indices, update it too
if "df_surveys_final" in globals():
    needed = {"q10_tool_1", "q10_tool_2"}
    if needed.issubset(set(df_surveys_final.columns)):
        for idx, (t1, t2) in updates.items():
            if idx in df_surveys_final.index:
                df_surveys_final.at[idx, "q10_tool_1"] = t1
                df_surveys_final.at[idx, "q10_tool_2"] = t2

### Column 14: q12_smartphone_model

In [512]:
col = "q12_smartphone_model"

# -----------------------------------------
# 1. Standardization Helpers
# -----------------------------------------

def clean_text(text):
    if pd.isna(text):
        return None
    text = text.strip()
    text = re.sub(r",\s*\(", " (", text)     # fix commas before parentheses
    text = re.sub(r"\s+", " ", text)         # collapse multiple spaces
    return text


def extract_parenthesis_comment(text):
    match = re.search(r"\((.*?)\)", text)
    return f"({match.group(1).strip()})" if match else ""


def normalize_iphone(model):
    model = re.sub(r"iphone", "Apple iPhone", model, flags=re.IGNORECASE)

    # Normalize things like "Apple iPhone 17 pro max"
    return re.sub(
        r"apple iphone\s*(\d+)\s*(pro max|pro|plus|max|air|e)?",
        lambda m: "Apple iPhone " + m.group(1) + (" " + m.group(2).title() if m.group(2) else ""),
        model,
        flags=re.IGNORECASE
    ).strip()


def normalize_samsung(model):
    model = re.sub(r"(samsung\s*galaxy|galaxy|samsung)", "Samsung Galaxy", model, flags=re.IGNORECASE)

    # Normalize model variants (FE, Ultra, etc.)
    return re.sub(
        r"samsung galaxy\s*(s?\d+[a-z]*\s*(ultra|fe|max)?)",
        lambda m: "Samsung Galaxy " + m.group(1).upper().replace("ULTRA", "Ultra").replace("FE", "FE"),
        model,
        flags=re.IGNORECASE
    ).strip()


def normalize_pixel(model):
    return re.sub(r"(google pixel|pixel)", "Google Pixel", model, flags=re.IGNORECASE).strip()


def normalize_xiaomi(model):
    model = re.sub(r"\bxiaomi\b", "Xiaomi", model, flags=re.IGNORECASE)
    model = re.sub(r"\bredmi\b", "Redmi", model, flags=re.IGNORECASE)
    return model.strip()


def normalize_model(text):
    """Main standardization pipeline."""
    if pd.isna(text):
        return None

    original = clean_text(text)
    par_comment = extract_parenthesis_comment(original)

    # Remove parentheses before processing
    text = re.sub(r"\(.*?\)", "", original).strip()
    temp = text.lower()

    if "iphone" in temp or temp.startswith("apple"):
        text = normalize_iphone(text)
    elif "samsung" in temp or "galaxy" in temp:
        text = normalize_samsung(text)
    elif "pixel" in temp:
        text = normalize_pixel(text)
    elif "xiaomi" in temp or "redmi" in temp:
        text = normalize_xiaomi(text)

    # Fix standalone "Apple"
    if text.lower() == "apple":
        text = "Apple iPhone"

    # Add back cleaned comment
    if par_comment:
        par_comment = par_comment.replace("renewed on amazon", "Renewed").replace("refurbished", "Refurbished")
        text = f"{text} {par_comment}"

    return text.strip()


# -----------------------------------------
# 2. Extract Brand Only
# -----------------------------------------

def extract_brand(text):
    if pd.isna(text):
        return None

    t = text.lower().strip()

    brand_map = {
        r"\bapple\b": "Apple",
        r"\biphone\b": "Apple",
        r"\bsamsung\b": "Samsung",
        r"\bgalaxy\b": "Samsung",
        r"\bgoogle pixel\b": "Google",
        r"\bpixel\b": "Google",
        r"\bxiaomi\b": "Xiaomi",
        r"\bredmi\b": "Redmi",
        r"\brealme\b": "Realme",
        r"\bnuu\b": "NUU",
        r"\bsony\b": "Sony",
        r"\bmotorola\b": "Motorola",
        r"\boppo\b": "Oppo",
        r"\bvivo\b": "Vivo",
        r"\bhuawei\b": "Huawei",
        r"\boneplus\b": "OnePlus",
    }

    for pattern, brand in brand_map.items():
        if re.search(pattern, t, flags=re.IGNORECASE):
            return brand

    # fallback: first word
    return text.split()[0].capitalize()


# -----------------------------------------
# 3. Apply to df_surveys_final
# -----------------------------------------

# Standardize smartphone models
df_surveys_final[col] = df_surveys_final[col].apply(normalize_model)

# Extract brand-only column
df_surveys_final["q12_brand_only"] = df_surveys_final[col].apply(extract_brand)

# Insert brand column right after q12_smartphone_model
insert_pos = df_surveys_final.columns.get_loc(col) + 1
df_surveys_final.insert(insert_pos, "q12_brand_only", df_surveys_final.pop("q12_brand_only"))


### Column 15: q13_storage_capacity

In [513]:
col = "q13_storage_capacity"   # your column name

def standardize_storage(value):
    if pd.isna(value):
        return None

    text = str(value).lower()

    # Remove irrelevant words but keep digits and units
    text = text.replace(",", " ")

    # Extract the first storage-like pattern:
    # - number + optional unit
    # - handles: 64, 128gb, 256 gb, 1tb, 512-gb, etc.
    match = re.search(r"(\d+)\s*(gb|tb)?", text, flags=re.IGNORECASE)

    if not match:
        return None

    num = match.group(1)
    unit = match.group(2)

    # If no unit → default to GB
    if unit is None:
        unit = "GB"

    # Normalize unit casing
    unit = unit.upper()

    # Final standardized output
    return f"{num} {unit}"


# Apply to the dataframe directly
df_surveys_final[col] = df_surveys_final[col].apply(standardize_storage)


### Column 16: q14_color

In [514]:
col = "q14_color"   # your column name

def clean_and_split_colors(text):
    if pd.isna(text):
        return []

    text = str(text).strip()

    # -----------------------------------------
    # 1. Remove irrelevant phrases like "Color - "
    # -----------------------------------------
    text = re.sub(r"color[\s:-]*", "", text, flags=re.IGNORECASE)

    # -----------------------------------------
    # 2. Replace any version of "I don't know" with empty field
    # -----------------------------------------
    if re.search(r"i\s*don'?t\s*know", text, flags=re.IGNORECASE):
        return []   # return empty list → empty columns

    # -----------------------------------------
    # 3. Normalize separators
    # -----------------------------------------
    text = re.sub(r"[/&|-]+", ",", text)  # /, -, &, |
    text = re.sub(r"\band\b", ",", text, flags=re.IGNORECASE)

    # Split by comma
    parts = [p.strip() for p in text.split(",") if p.strip()]

    cleaned = []
    for p in parts:
        # Capitalize each word
        words = p.split()
        cleaned_words = [w.capitalize() for w in words]
        cleaned.append(" ".join(cleaned_words))

    return cleaned
# -----------------------------------------
# Create a list column
# -----------------------------------------
df_surveys_final["q14_split"] = df_surveys_final[col].apply(clean_and_split_colors)

# -----------------------------------------
# Find max number of colors
# -----------------------------------------
max_len = df_surveys_final["q14_split"].apply(len).max()

# -----------------------------------------
# Create q14_color_1 ... q14_color_n
# -----------------------------------------
for i in range(1, max_len + 1):
    df_surveys_final[f"q14_color_{i}"] = df_surveys_final["q14_split"].apply(
        lambda lst: lst[i-1] if len(lst) >= i else None
    )

# -----------------------------------------
# Remove helper column & overwrite main column with cleaned first entry
# -----------------------------------------
df_surveys_final.drop(columns=["q14_split"], inplace=True)

df_surveys_final[col] = df_surveys_final["q14_color_1"]



### Column 17: q15_lowest_price

In [515]:
df = df_surveys_final
old_col = "q15_lowest_price"

USD_TO_EUR = 0.866
GBP_TO_EUR = 1.17


def parse_price_to_eur(val):
    if pd.isna(val):
        return None

    text = str(val).strip()

    if re.search(r"did not find a price", text, flags=re.IGNORECASE):
        return None

    # Clean basic formatting
    clean = (
        text.replace("€", "")
            .replace("eur", "")
            .replace("euros", "")
            .replace("euro", "")
            .replace(",", ".")
            .strip()
    )

    # GBP detection
    if text.startswith("£"):
        try:
            num = float(clean)
            return round(num * GBP_TO_EUR, 2)
        except:
            return None

    # USD detection
    if text.startswith("$") or "usd" in text.lower():
        try:
            num = float(re.sub(r"[^\d.]", "", clean))
            return round(num * USD_TO_EUR, 2)
        except:
            return None

    # EUR or plain numeric
    number_only = re.sub(r"[^\d.]", "", clean)

    try:
        return round(float(number_only), 2)
    except:
        return None


def assign_price_range(eur_val):
    if eur_val is None:
        return None
    if eur_val < 150:
        return "Under €150"
    elif eur_val < 300:
        return "€150-299"
    elif eur_val < 450:
        return "€300-449"
    elif eur_val < 600:
        return "€450-599"
    elif eur_val < 800:
        return "€600-799"
    else:
        return "Over €800"


# -------------------------------
# Create new standardized columns
# -------------------------------

df["q15_lowest_price_eur"] = df[old_col].apply(parse_price_to_eur)

# Format EUR as decimal-comma style
df["q15_lowest_price_eur"] = df["q15_lowest_price_eur"].apply(
    lambda x: f"{x:.2f}".replace(".", ",") if pd.notna(x) else None
)

# Price range based on EUR numeric value
df["q15_lowest_price_range"] = df["q15_lowest_price_eur"].apply(
    lambda v: assign_price_range(float(v.replace(",", "."))) if v not in (None, "") else None
)


# -------------------------------
# Move new columns after q14_color
# -------------------------------

insert_pos = df.columns.get_loc("q14_color") + 1

for col in ["q15_lowest_price_range", "q15_lowest_price_eur"]:
    df.insert(insert_pos, col, df.pop(col))
    insert_pos += 1  # Insert next column right after the previous one


# -------------------------------
# Remove old price column
# -------------------------------
df.drop(columns=[old_col], inplace=True)


### Column 20: q18_smartphone_features

In [516]:
df = df_surveys_final
old_col = "q18_smartphone_features"   # <-- adjust if your column has a different name

def parse_feature_list(x):
    if pd.isna(x):
        return []
    try:
        lst = ast.literal_eval(x)
    except:
        # Fallback manual parsing
        x = x.strip().lstrip("[").rstrip("]")
        lst = [i.strip().strip('"').strip("'") for i in x.split(",") if i.strip()]

    # Capitalize first letter of each feature
    return [item.capitalize() for item in lst]


# -----------------------------
# Convert each row into a list
# -----------------------------
df["parsed_features"] = df[old_col].apply(parse_feature_list)

# -----------------------------
# Determine insertion location
# -----------------------------
insert_pos = df.columns.get_loc(old_col)

# -----------------------------
# Create new q18_feature_* columns (capitalized)
# -----------------------------
for i in range(1, 4):   # 3 features max
    df.insert(
        insert_pos + (i - 1),
        f"q18_feature_{i}",
        df["parsed_features"].apply(lambda lst: lst[i-1] if len(lst) >= i else None)
    )

# -----------------------------
# Remove old + helper column
# -----------------------------
df.drop(columns=[old_col, "parsed_features"], inplace=True)


### Column 46: q39_contradictory_handling

In [517]:
df = df_surveys_final
old_col = "q39_contradictory_handling"   # <-- rename to your actual column name

def parse_list(x):
    """Parse list-like strings; capitalize items."""
    if pd.isna(x):
        return []
    try:
        lst = ast.literal_eval(x)
    except:
        # fallback parser
        x = x.strip().lstrip("[").rstrip("]")
        lst = [i.strip().strip('"').strip("'") for i in x.split(",") if i.strip()]
    # Capitalize first letter of each item
    return [item.capitalize() for item in lst]


# -----------------------------
# Convert each row into a list
# -----------------------------
df["parsed_temp"] = df[old_col].apply(parse_list)

# -----------------------------
# Determine max number of items
# -----------------------------
max_len = df["parsed_temp"].apply(len).max()

# -----------------------------
# Determine insertion location
# -----------------------------
insert_pos = df.columns.get_loc(old_col)

# -----------------------------
# Create q39_contradiction_* columns
# -----------------------------
for i in range(1, max_len + 1):
    df.insert(
        insert_pos + (i - 1),
        f"q39_contradiction_{i}",
        df["parsed_temp"].apply(lambda lst: lst[i-1] if len(lst) >= i else None)
    )

# -----------------------------
# Remove old + temporary column
# -----------------------------
df.drop(columns=[old_col, "parsed_temp"], inplace=True)


In [518]:
def clean_age_group(val):
    if pd.isna(val):
        return None

    text = str(val).strip().lower()

    # Normalize using flexible pattern matching
    if re.search(r"under\s*18", text):
        return "Under 18"
    if re.search(r"18\s*-\s*24", text):
        return "18-24"
    if re.search(r"25\s*-\s*34", text):
        return "25-34"
    if re.search(r"35\s*-\s*44", text):
        return "35-44"
    if re.search(r"45\s*-\s*54", text):
        return "45-54"
    if re.search(r"55", text):
        return "55 and above"

    return None  # unexpected values → blank


# ✔ Apply directly to df_surveys_final
df_surveys_final["q1_age_group"] = df_surveys_final["q1_age_group"].apply(clean_age_group)


In [519]:
def clean_gender(val):
    if pd.isna(val):
        return None

    text = str(val).strip().lower()

    # Female variants
    if text in ["female", "f", "woman", "female ", "feminine"]:
        return "Female"

    # Male variants
    if text in ["male", "m", "man", "masculine"]:
        return "Male"

    # Non-binary / diverse variants
    if any(x in text for x in [
        "non-binary", "nonbinary", "diverse", "nb", "non binary", "genderqueer"
    ]):
        return "Non-binary / Diverse"

    # Prefer not to say
    if "prefer" in text or "not to say" in text or "no answer" in text:
        return "Prefer not to say"

    # If unclear → default to Prefer not to say
    return "Prefer not to say"


# ✔ Apply directly to df_surveys_final
df_surveys_final["q2_gender"] = df_surveys_final["q2_gender"].apply(clean_gender)


In [520]:
def clean_education(val):
    if pd.isna(val):
        return None

    text = str(val).strip().lower()

    # High school or below
    if any(x in text for x in ["high school", "secondary", "school", "highschool", "gymnasium"]):
        return "High school or below"

    # Bachelor's degree
    if any(x in text for x in ["bachelor", "ba", "b.sc", "bsc", "undergraduate"]):
        return "Bachelor's degree"

    # Master's degree
    if any(x in text for x in ["master", "msc", "m.sc", "postgraduate"]):
        return "Master's degree"

    # Doctorate / PhD
    if any(x in text for x in ["phd", "doctorate", "dr.", "doctoral"]):
        return "Doctorate / PhD"

    # If unclear → blank (or set to a default)
    return None


# ✔ Apply directly to df_surveys_final
df_surveys_final["q3_education"] = df_surveys_final["q3_education"].apply(clean_education)

In [521]:
def clean_employment_status(val):
    if pd.isna(val):
        return None

    text = str(val).strip().lower()

    # Student
    if any(x in text for x in ["student", "studying", "pupil"]):
        return "Student"

    # Employed (full-time or part-time)
    if any(x in text for x in ["employed", "working", "full-time", "part-time", "employee", "work"]):
        return "Employed"

    # Self-employed
    if any(x in text for x in ["self-employed", "self employed", "entrepreneur", "freelance"]):
        return "Self-employed"

    # Unemployed
    if any(x in text for x in ["unemployed", "jobless", "not working"]):
        return "Unemployed"

    # Everything else → Other
    return "Other"


# ✔ Apply directly to df_surveys_final
df_surveys_final["q4_employment_status"] = df_surveys_final["q4_employment_status"].apply(clean_employment_status)


In [522]:
def clean_ai_usage(val):
    if pd.isna(val):
        return None

    text = str(val).strip().lower()

    # --- 0 times ---
    if re.search(r"\b0\b", text) or "0 times" in text:
        return "0 times"

    # --- 1–2 times ---
    if re.search(r"1\s*[-–]\s*2", text) or text in ["1", "2"]:
        return "1-2 times"

    # --- 3–5 times ---
    if re.search(r"3\s*[-–]\s*5", text) or text in ["3", "4", "5"]:
        return "3-5 times"

    # --- 6–10 times ---
    if re.search(r"6\s*[-–]\s*10", text) or text in ["6", "7", "8", "9", "10"]:
        return "6-10 times"

    # --- More than 10 times ---
    if any(x in text for x in ["more than", "over", "above", "10+", "11", "12", "13", "14", "15"]):
        return "More than 10 times"

    return None


# ✔ Apply directly to df_surveys_final
df_surveys_final["q9_ai_usage_frequency"] = df_surveys_final["q9_ai_usage_frequency"].apply(clean_ai_usage)


In [523]:
def clean_q11(val):
    if pd.isna(val):
        return None

    text = str(val).strip().lower()

    # Normalization
    text = text.replace("–", "-").replace("—", "-").replace("€", "")

    # ----- Under €150 -----
    if "under" in text or "<150" in text or "below" in text or "under 150" in text:
        return "Under €150"

    # ----- €150–299 -----
    if "150-299" in text or ("150" in text and "299" in text):
        return "€150-299"

    # ----- €300–449 -----
    if "300-449" in text or ("300" in text and "449" in text):
        return "€300-449"

    # ----- €450–599 -----
    if "450-599" in text or ("450" in text and "599" in text):
        return "€450-599"

    # ----- €600–799 -----
    if "600-799" in text or ("600" in text and "799" in text):
        return "€600-799"

    # ----- Over €800 -----
    if "over" in text or "above" in text or "800" in text or ">800" in text:
        return "Over €800"

    # ----- Not sure -----
    if "not sure" in text or "unsure" in text or "dont know" in text:
        return "Not Sure"

    return None


# ✔ Apply directly to df_surveys_final
df_surveys_final["q11_response"] = df_surveys_final["q11_response"].apply(clean_q11)


In [524]:
# Mapping from short code → full descriptive text
contradiction_map = {
    "First_result": "I trusted the first result I clicked on",
    "Additional_sources": "I searched for and compared additional sources",
    "Most_detailed": "I chose the result that seemed most detailed or complete",
    "Own_judgment": "I relied on my own knowledge or judgment",
    "No_contradictions": "I did not find any contradictions",
    "Other": "Other",
    None: None
}

# Apply mapping to all q39_contradiction_* columns
for col in ["q39_contradiction_1", "q39_contradiction_2", "q39_contradiction_3"]:
    df_surveys_final[col] = df_surveys_final[col].map(contradiction_map)


In [None]:
rename_dict = {
    "q4_employment_status": "q4_employment",
    "q6_country_residence": "q6_residence",
    "q7_ai_familiarity": "q7_chatbot_familiarity",
    "q8_attention_check": "q8_data_quality",
    "q9_ai_usage_frequency": "q9_chatbot_usage",

    # q10 tools renamed as group (q10_search_tools_1..4)
    "q10_tool_1": "q10_search_tools_1",
    "q10_tool_2": "q10_search_tools_2",
    "q10_tool_3": "q10_search_tools_3",
    "q10_tool_4": "q10_search_tools_4",

    "q11_response": "q11_budget",
    "q12_smartphone_model": "q12_brand_model",
    "q12_brand_only": "q12_brand_only",    # keep as is, optional

    "q13_storage_capacity": "q13_storage",

    # price columns
    "q15_lowest_price_eur": "q15_price_eur",
    "q15_lowest_price_range": "q15_price_range",

    "q16_website_link": "q16_website",

    # q18 features → important features
    "q18_feature_1": "q18_important_features_1",
    "q18_feature_2": "q18_important_features_2",
    "q18_feature_3": "q18_important_features_3",

    "q38_attention_check": "q38_attention",

    "q41_time_spent": "q41_duration",
    "q42_future_usage_feedback": "q42_comments",

    # q39 multi-columns → contradictory_info
    "q39_contradiction_1": "q39_contradictory_info_1",
    "q39_contradiction_2": "q39_contradictory_info_2",
    "q39_contradiction_3": "q39_contradictory_info_3"
}

df_surveys_final = df_surveys_final.rename(columns=rename_dict)

In [529]:
# Columns to remove
cols_to_drop = [
    "created_at",
    "session_id_bg_inst_search",
    "session_id_postsurvey",
    "record_created_at"
]

df_surveys_final = df_surveys_final.drop(
    columns=[c for c in cols_to_drop if c in df_surveys_final.columns],
    errors="ignore"
)

# Extract all q-columns
q_cols = [col for col in df_surveys_final.columns if col.startswith("q")]

# Sort q-columns by their question number (q1 → q42)
q_cols_sorted = sorted(q_cols, key=lambda x: int(x.split("_")[0][1:]))

# Move q1 to immediately after participant_id
if "q1_age_group" in q_cols_sorted:
    q_cols_sorted.remove("q1_age_group")
    q_cols_sorted.insert(0, "q1_age_group")

# Final column order
ordered_cols = (
    ["participant_id"] +
    q_cols_sorted +
    [
        "background_submitted_at",
        "instruction_submitted_at",
        "postsurvey_submitted_at",
        "ip_address",
        "device_type",
        "results_submitted_at",
        "session_start_time",
        "session_end_time",
        "session_duration_ms"
    ]
)

# Keep only existing columns
ordered_cols = [c for c in ordered_cols if c in df_surveys_final.columns]

# Apply final ordering
df_surveys_final = df_surveys_final[ordered_cols]


In [530]:
df_surveys_final.columns

Index(['participant_id', 'q1_age_group', 'q2_gender', 'q3_education',
       'q4_employment', 'q5_nationality', 'q6_residence',
       'q7_chatbot_familiarity', 'q8_data_quality', 'q9_chatbot_usage',
       'q10_search_tools_1', 'q10_search_tools_2', 'q10_search_tools_3',
       'q10_search_tools_4', 'q11_budget', 'q12_brand_model', 'q12_brand_only',
       'q13_storage', 'q14_color', 'q14_color_1', 'q15_price_range',
       'q15_price_eur', 'q16_website', 'q17_price_importance',
       'q18_important_features_1', 'q18_important_features_2',
       'q18_important_features_3', 'q19_task_easy', 'q20_task_quick',
       'q21_task_familiar', 'q22_tool_reliable', 'q23_tool_practical',
       'q24_tool_like', 'q25_tool_easy_use', 'q26_tool_clear_interaction',
       'q27_tool_control', 'q28_tool_provides_info', 'q29_tool_helps_complete',
       'q30_tool_useful', 'q31_tool_too_much_info', 'q32_tool_hard_focus',
       'q33_results_accurate', 'q34_results_trustworthy',
       'q35_results_com

In [531]:
df_surveys_final

Unnamed: 0,participant_id,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,q6_residence,q7_chatbot_familiarity,q8_data_quality,q9_chatbot_usage,...,q42_comments,background_submitted_at,instruction_submitted_at,postsurvey_submitted_at,ip_address,device_type,results_submitted_at,session_start_time,session_end_time,session_duration_ms
0,09e6255b-ed66-4eea-a314-104a08130ac0,35-44,Female,Doctorate / PhD,Employed,UK,UK,7,1,,...,"I liked the more minimal interface, and I like...",2025-10-31 22:11:56.864775+00:00,2025-10-31 22:12:41.977243+00:00,2025-10-31 22:22:26.969691+00:00,129.67.117.187,desktop,2025-10-31 22:19:44.042285+00:00,2025-10-31 22:10:58.834+00,2025-10-31 22:18:19.43+00,440596.0
1,15ef74b6-a61a-474c-b855-696b20ce58fb,55 and above,Male,Master's degree,Employed,Germany,Germany,1,1,0 times,...,Maybe,2025-10-24 14:12:35.527412+00:00,2025-10-24 14:21:30.862457+00:00,2025-10-24 14:28:43.167002+00:00,213.146.69.174,desktop,2025-10-24 14:27:11.483904+00:00,2025-10-24 14:11:51.631+00,2025-10-24 14:24:48.07+00,776439.0
2,1f0df1be-a1ea-4080-90c3-230fe9e35174,18-24,Female,Bachelor's degree,Student,USA,USA,6,1,,...,"Yes, it showed me relevant results for what I ...",2025-11-10 03:25:51.855943+00:00,2025-11-10 03:26:25.769685+00:00,2025-11-10 03:29:58.830293+00:00,108.2.105.241,desktop,2025-11-10 03:28:20.935272+00:00,2025-11-10 03:25:14.283+00,2025-11-10 03:26:06.698+00,52415.0
3,2afa9961-1844-49e8-80fc-444466532f46,25-34,Female,Master's degree,Employed,,Russia,6,6,0 times,...,"Yes, the tool was transparent and easy to use,...",2025-10-24 17:08:04.105300+00:00,2025-10-24 17:10:42.739044+00:00,2025-10-24 17:15:27.907607+00:00,2a00:1370:8180:f390:10f5:bcf9:8f0:be3f,mobile,2025-10-24 17:12:41.450159+00:00,2025-10-24 17:06:39.799+00,2025-10-24 17:10:49.029+00,249230.0
4,2d8dd1db-9d38-49e0-bf03-5b4735523d27,18-24,Female,High school or below,Student,Switzerland,Switzerland,7,4,,...,"No, AI didn’t show the classical bar abovd",2025-10-29 09:54:21.050268+00:00,2025-10-29 09:54:41.702835+00:00,2025-10-29 10:01:23.058400+00:00,172.225.188.246,mobile,2025-10-29 09:58:09.817211+00:00,2025-10-29 09:52:50.793+00,2025-10-29 09:54:46.465+00,115672.0
5,2e912156-c7b7-4268-8420-128a859c4876,25-34,Female,Master's degree,Employed,Germany,Germany,3,3,0 times,...,"Yes, easy to use",2025-10-26 15:45:12.277452+00:00,2025-10-26 15:45:16.132619+00:00,2025-10-26 15:48:06.054724+00:00,84.176.238.16,mobile,2025-10-26 15:46:09.863472+00:00,2025-10-26 15:44:29.794+00,2025-10-26 15:45:39.02+00,69226.0
6,3409f5be-93f2-44f4-8edb-910e95126257,25-34,Female,Bachelor's degree,Employed,UK,UK,5,1,1-2 times,...,Its easy to use,2025-11-10 07:23:32.649361+00:00,2025-11-10 07:23:42.671639+00:00,2025-11-10 07:25:31.551284+00:00,82.132.245.134,mobile,2025-11-10 07:24:45.804281+00:00,2025-11-10 07:22:57.74+00,2025-11-10 07:23:56.719+00,58979.0
7,35f58cd5-b9a5-4132-a94d-8fcad2800a59,35-44,Female,Bachelor's degree,Student,Kazakhstan,Germany,6,1,,...,I don’t think so. It is not visually appealing,2025-10-24 19:13:03.589983+00:00,2025-10-24 19:13:54.201872+00:00,2025-10-24 19:22:36.379595+00:00,46.5.2.111,mobile,2025-10-24 19:19:39.938857+00:00,2025-10-24 19:11:36.971+00,2025-10-24 19:14:27.424+00,170453.0
8,3beee5db-499b-4741-b3ea-72c6f17ffb86,25-34,Female,Bachelor's degree,Employed,Vietnam,Vietnam,6,1,,...,My experience with this version of Google Sear...,2025-10-24 08:55:18.954744+00:00,2025-10-24 08:59:09.367312+00:00,2025-10-24 09:19:59.596176+00:00,104.30.161.158,desktop,2025-10-24 09:04:36.488336+00:00,2025-10-24 08:47:58.199+00,2025-10-24 09:02:52.01+00,893811.0
9,403feef5-647d-4a34-a15a-c4bdc29fb2c2,18-24,Female,High school or below,Student,Latvia,Latvia,5,1,0 times,...,"Yes, it works well for me",2025-11-09 14:56:32.243330+00:00,2025-11-09 14:57:54.747882+00:00,2025-11-09 15:30:19.979275+00:00,77.38.136.74,desktop,2025-11-09 15:16:04.986421+00:00,2025-11-09 14:41:44.625+00,2025-11-09 14:58:58.615+00,1033990.0
