# Environment Setup

In [1]:
# Debugging: Use to toggle on/off print statements for debugging.
# True: debug_messages will print 
# False: debug_messages will not print.
DEBUG = False
PACKAGES_INSTALLED = True # Set to True once packages are installed so they don't rerun.

def debug_print(*args, **kwargs):
# print(message)
    if DEBUG:
        print(*args, **kwargs)

def debug_display(df, name=None):
# display(df)
    if DEBUG:
        if name:
            print(f"Displaying DataFrame: {name}")
        display(df)

# When working on a code cell use normal print() / display statements. After you confirm the code is correct, change to the debug versions.

In [2]:
# Install packages

if not PACKAGES_INSTALLED:
    # Install packages
    %pip install -r requirements.txt
    debug_print("Packages installed.")
    PACKAGES_INSTALLED = True
else:
    debug_print("Packages already installed. Skipping installation.")


In [3]:
# Update packages if needed
# %pip freeze > requirements.txt

In [4]:
# Import packages + load dataframe
import pandas as pd
import numpy as np

# Load CSV file into DataFrame
df = pd.read_csv('data.csv')


  df = pd.read_csv('data.csv')


# Data Exploration

In [5]:
# Analyze dataframe

debug_print(list(df.columns))

In [6]:
# View ranges of data in each column


for col in df.columns:
    unique_values = df[col].dropna().unique()  # Drop NaNs if you don’t want them in the list
    unique_values_list = [float(val) if isinstance(val, (float, int)) else val for val in unique_values]
    debug_print(f"{col}: {unique_values_list}")



# Data Cleaning / Preprocessing

### 1. Drop rows that aren't questions, or questions that have text values.

In [7]:
columns_to_keep = [col for col in df.columns if ('Q' in col or 'QN' in col) and 'TEXT' not in col]
columns_to_drop = [col for col in df.columns if col not in columns_to_keep]
df = df[columns_to_keep]

# Print dropped columns
debug_print("Dropped columns:", columns_to_drop)

### 2. Drop all QN columns

In [8]:
columns_to_drop = [col for col in df.columns if col.startswith("QN")]
columns_to_keep = [col for col in df.columns if col.startswith("Q") and not col.startswith("QN")]
df_old = df[columns_to_drop]
df = df[columns_to_keep]

debug_print("Dropped columns:", columns_to_drop)
debug_print("\n")
debug_print("Kept columns:", columns_to_keep)


In [9]:
df.shape

(22069, 687)

### 3. Merge all multi-choice questions into one column.

In [10]:
# 3: Question 4
# N -> Not Answered
# Z -> Not Displayed
# S -> Skipped
new_column = 'Q4'
columns_to_merge = ['Q4a', 'Q4b', 'Q4c', 'Q4d', 'Q4e']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [11]:
# 3: Question 5

new_column = 'Q5'
columns_to_merge = ['Q5a', 'Q5b', 'Q5c', 'Q5d', 'Q5e']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [12]:
# 3: Question 11

new_column = 'Q11'
columns_to_merge = ['Q11a', 'Q11b', 'Q11c', 'Q11d', 'Q11e', 'Q11f', 'Q11g', 'Q11h', 'Q11i', 'Q11j', 'Q11k', 'Q11l', 'Q11m', 'Q11n']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)


In [13]:
# 3: Question 12

new_column = 'Q12'
columns_to_merge = ['Q12a', 'Q12b', 'Q12c', 'Q12d', 'Q12e', 'Q12f', 'Q12g', 'Q12h', 'Q12i', 'Q12j', 'Q12k', 'Q12l', 'Q12m', 'Q12n']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [14]:
debug_display(df['Q12'].head(10))

In [15]:
# 3: Question 14

new_column = 'Q14'
columns_to_merge = ['Q14a', 'Q14b', 'Q14c', 'Q14d', 'Q14e', 'Q14f', 'Q14g', 'Q14h',
                    'Q14i', 'Q14j', 'Q14k', 'Q14l', 'Q14m', 'Q14n', 'Q14o', 'Q14p']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

KeyboardInterrupt: 

In [None]:
# 3: Question 18a

new_column = 'Q18a'
columns_to_merge = [
    'Q18a_a', 'Q18a_b', 'Q18a_c', 'Q18a_d', 'Q18a_e', 'Q18a_f', 'Q18a_g', 'Q18a_h', 'Q18a_i', 'Q18a_j', 'Q18a_k'
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 18b

new_column = 'Q18b'
columns_to_merge = [
    'Q18b_a', 'Q18b_b', 'Q18b_c', 'Q18b_d', 'Q18b_e', 'Q18b_f', 'Q18b_g', 'Q18b_h', 'Q18b_i', 'Q18b_j',
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 18c

new_column = 'Q18c'
columns_to_merge = [
    'Q18c_a', 'Q18c_b', 'Q18c_c', 'Q18c_d', 'Q18c_e', 'Q18c_f', 'Q18c_g', 'Q18c_h', 'Q18c_i', 'Q18c_j',
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 18d

new_column = 'Q18d'
columns_to_merge = [
    'Q18d_a', 'Q18d_b', 'Q18d_c', 'Q18d_d', 'Q18d_e', 'Q18d_f', 'Q18d_g', 'Q18d_h', 'Q18d_i', 'Q18d_j',
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 18e

new_column = 'Q18e'
columns_to_merge = [
    'Q18e_a', 'Q18e_b', 'Q18e_c', 'Q18e_d', 'Q18e_e', 'Q18e_f', 'Q18e_g', 'Q18e_h', 'Q18e_i', 'Q18e_j', 'Q18e_k',
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 18f

new_column = 'Q18f'
columns_to_merge = [
    'Q18f_a', 'Q18f_b', 'Q18f_c', 'Q18f_d', 'Q18f_e', 'Q18f_f', 'Q18f_g', 'Q18f_h', 'Q18f_i', 'Q18f_j', 'Q18f_k',
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 18g

new_column = 'Q18g'
columns_to_merge = [
    'Q18g_a', 'Q18g_b', 'Q18g_c', 'Q18g_d', 'Q18g_e', 'Q18g_f', 'Q18g_g', 'Q18g_h', 'Q18g_i', 'Q18g_j',
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 18h

new_column = 'Q18h'
columns_to_merge = [
    'Q18h_a', 'Q18h_b', 'Q18h_c', 'Q18h_d', 'Q18h_e', 'Q18h_f', 'Q18h_g', 'Q18h_h', 'Q18h_i', 'Q18h_j', 'Q18h_k',
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 18i

new_column = 'Q18i'
columns_to_merge = [
    'Q18i_a', 'Q18i_b', 'Q18i_c', 'Q18i_d', 'Q18i_e', 'Q18i_f', 'Q18i_g', 'Q18i_h', 'Q18i_i', 'Q18i_j',
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 18j

new_column = 'Q18j'
columns_to_merge = [
    'Q18j_a', 'Q18j_b', 'Q18j_c', 'Q18j_d', 'Q18j_e', 'Q18j_f', 'Q18j_g', 'Q18j_h', 'Q18j_i', 'Q18j_j',
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 18k

new_column = 'Q18k'
columns_to_merge = [
    'Q18k_a', 'Q18k_b', 'Q18k_c', 'Q18k_d', 'Q18k_e', 'Q18k_f', 'Q18k_g', 'Q18k_h', 'Q18k_i', 'Q18k_j',
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 21a-21l (combined for convenience)

new_columns = ['Q21a', 'Q21b', 'Q21c', 'Q21d', 'Q21e', 'Q21f', 'Q21g', 'Q21h', 'Q21i', 'Q21j', 'Q21k', 'Q21l']
columns_to_merge = [
    ['Q21a_a', 'Q21a_b', 'Q21a_c', 'Q21a_d', 'Q21a_e', 'Q21a_f', 'Q21a_g', 'Q21a_h'],
    ['Q21b_a', 'Q21b_b', 'Q21b_c', 'Q21b_d', 'Q21b_e', 'Q21b_f', 'Q21b_g', 'Q21b_h'],
    ['Q21c_a', 'Q21c_b', 'Q21c_c', 'Q21c_d', 'Q21c_e', 'Q21c_f', 'Q21c_g', 'Q21c_h'],
    ['Q21d_a', 'Q21d_b', 'Q21d_c', 'Q21d_d', 'Q21d_e', 'Q21d_f', 'Q21d_g', 'Q21d_h'],
    ['Q21e_a', 'Q21e_b', 'Q21e_c', 'Q21e_d', 'Q21e_e', 'Q21e_f', 'Q21e_g', 'Q21e_h'],
    ['Q21f_a', 'Q21f_b', 'Q21f_c', 'Q21f_d', 'Q21f_e', 'Q21f_f', 'Q21f_g', 'Q21f_h'],
    ['Q21g_a', 'Q21g_b', 'Q21g_c', 'Q21g_d', 'Q21g_e', 'Q21g_f', 'Q21g_g', 'Q21g_h'],
    ['Q21h_a', 'Q21h_b', 'Q21h_c', 'Q21h_d', 'Q21h_e', 'Q21h_f', 'Q21h_g', 'Q21h_h'],
    ['Q21i_a', 'Q21i_b', 'Q21i_c', 'Q21i_d', 'Q21i_e', 'Q21i_f', 'Q21i_g', 'Q21i_h'],
    ['Q21j_a', 'Q21j_b', 'Q21j_c', 'Q21j_d', 'Q21j_e', 'Q21j_f', 'Q21j_g', 'Q21j_h'],
    ['Q21k_a', 'Q21k_b', 'Q21k_c', 'Q21k_d', 'Q21k_e', 'Q21k_f', 'Q21k_g', 'Q21k_h'],
    ['Q21l_a', 'Q21l_b', 'Q21l_c', 'Q21l_d', 'Q21l_e', 'Q21l_f', 'Q21l_g', 'Q21l_h']
]

for new_column, merge_columns in zip(new_columns, columns_to_merge):
    df[new_column] = np.nan

    # Flatten value counts for individual columns in the current group
    original_column_values = pd.concat(
        [df[col].value_counts(dropna=False).rename(f"{col}") for col in merge_columns],
        axis=1
    ).reset_index().rename(columns={'index': 'Vals'})

    for index, row in df.iterrows():
        selected_options = []
        if any(row[col] == 'S' for col in merge_columns):
            df.at[index, new_column] = 'S'
        elif any(row[col] == 'N' for col in merge_columns):
            df.at[index, new_column] = 'N'
        elif any(row[col] == 'Z' for col in merge_columns):
            df.at[index, new_column] = 'Z'
        else:
            for col in merge_columns:
                try:
                    if float(row[col]) == 1.0:
                        selected_options.append(col)
                except (ValueError, TypeError):
                    continue
            if selected_options:
                df.at[index, new_column] = ','.join(selected_options)

    df = df.drop(columns=merge_columns)

    new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
    new_column_values.columns = ['Vals', f'{new_column}']

    debug_display(original_column_values)
    debug_display(new_column_values)


In [None]:
# 3: Question 22a-22l (combined for convenience)

new_columns = ['Q22a', 'Q22b', 'Q22c', 'Q22d', 'Q22e', 'Q22f', 'Q22g', 'Q22h', 'Q22i', 'Q22j', 'Q22k', 'Q22l']
columns_to_merge = [
    ['Q22a_a', 'Q22a_b', 'Q22a_c', 'Q22a_d', 'Q22a_e', 'Q22a_f', 'Q22a_g', 'Q22a_h', 'Q22a_i', 'Q22a_j', 'Q22a_k', 'Q22a_l'],
    ['Q22b_a', 'Q22b_b', 'Q22b_c', 'Q22b_d', 'Q22b_e', 'Q22b_f', 'Q22b_g', 'Q22b_h', 'Q22b_i', 'Q22b_j', 'Q22b_k', 'Q22b_l'],
    ['Q22c_a', 'Q22c_b', 'Q22c_c', 'Q22c_d', 'Q22c_e', 'Q22c_f', 'Q22c_g', 'Q22c_h', 'Q22c_i', 'Q22c_j', 'Q22c_k', 'Q22c_l'],
    ['Q22d_a', 'Q22d_b', 'Q22d_c', 'Q22d_d', 'Q22d_e', 'Q22d_f', 'Q22d_g', 'Q22d_h', 'Q22d_i', 'Q22d_j', 'Q22d_k', 'Q22d_l'],
    ['Q22e_a', 'Q22e_b', 'Q22e_c', 'Q22e_d', 'Q22e_e', 'Q22e_f', 'Q22e_g', 'Q22e_h', 'Q22e_i', 'Q22e_j', 'Q22e_k', 'Q22e_l'],
    ['Q22f_a', 'Q22f_b', 'Q22f_c', 'Q22f_d', 'Q22f_e', 'Q22f_f', 'Q22f_g', 'Q22f_h', 'Q22f_i', 'Q22f_j', 'Q22f_k', 'Q22f_l'],
    ['Q22g_a', 'Q22g_b', 'Q22g_c', 'Q22g_d', 'Q22g_e', 'Q22g_f', 'Q22g_g', 'Q22g_h', 'Q22g_i', 'Q22g_j', 'Q22g_k', 'Q22g_l'],
    ['Q22h_a', 'Q22h_b', 'Q22h_c', 'Q22h_d', 'Q22h_e', 'Q22h_f', 'Q22h_g', 'Q22h_h', 'Q22h_i', 'Q22h_j', 'Q22h_k', 'Q22h_l'],
    ['Q22i_a', 'Q22i_b', 'Q22i_c', 'Q22i_d', 'Q22i_e', 'Q22i_f', 'Q22i_g', 'Q22i_h', 'Q22i_i', 'Q22i_j', 'Q22i_k', 'Q22i_l'],
    ['Q22j_a', 'Q22j_b', 'Q22j_c', 'Q22j_d', 'Q22j_e', 'Q22j_f', 'Q22j_g', 'Q22j_h', 'Q22j_i', 'Q22j_j', 'Q22j_k', 'Q22j_l'],
    ['Q22k_a', 'Q22k_b', 'Q22k_c', 'Q22k_d', 'Q22k_e', 'Q22k_f', 'Q22k_g', 'Q22k_h', 'Q22k_i', 'Q22k_j', 'Q22k_k', 'Q22k_l'],
    ['Q22l_a', 'Q22l_b', 'Q22l_c', 'Q22l_d', 'Q22l_e', 'Q22l_f', 'Q22l_g', 'Q22l_h', 'Q22l_i', 'Q22l_j', 'Q22l_k', 'Q22l_l']
]

for new_column, merge_columns in zip(new_columns, columns_to_merge):
    df[new_column] = np.nan

    # Flatten value counts for individual columns in the current group
    original_column_values = pd.concat(
        [df[col].value_counts(dropna=False).rename(f"{col}") for col in merge_columns],
        axis=1
    ).reset_index().rename(columns={'index': 'Vals'})

    for index, row in df.iterrows():
        selected_options = []
        if any(row[col] == 'S' for col in merge_columns):
            df.at[index, new_column] = 'S'
        elif any(row[col] == 'N' for col in merge_columns):
            df.at[index, new_column] = 'N'
        elif any(row[col] == 'Z' for col in merge_columns):
            df.at[index, new_column] = 'Z'
        else:
            for col in merge_columns:
                try:
                    if float(row[col]) == 1.0:
                        selected_options.append(col)
                except (ValueError, TypeError):
                    continue
            if selected_options:
                df.at[index, new_column] = ','.join(selected_options)

    df = df.drop(columns=merge_columns)

    new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
    new_column_values.columns = ['Vals', f'{new_column}']

    debug_display(original_column_values)
    debug_display(new_column_values)

In [None]:
# 3: Question 24

new_column = 'Q24'
columns_to_merge = ['Q24a', 'Q24b', 'Q24c', 'Q24d']


df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 29

new_column = 'Q29'
columns_to_merge = ['Q29a', 'Q29b', 'Q29c', 'Q29d', 'Q29e', 'Q29f', 'Q29g', 'Q29h', 'Q29i', 'Q29j']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 34

new_column = 'Q34'
columns_to_merge = ['Q34a', 'Q34b', 'Q34c', 'Q34d']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)


In [None]:
# 3: Question 35

new_column = 'Q35'
columns_to_merge = ['Q35a', 'Q35b', 'Q35c', 'Q35d']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 42

new_column = 'Q42'
columns_to_merge = ['Q42a', 'Q42b', 'Q42c', 'Q42d', 'Q42e', 'Q42f', 'Q42g', 'Q42h', 'Q42i', 'Q42j', 'Q42k', 'Q42l']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 56

new_column = 'Q56'
columns_to_merge = ['Q56a', 'Q56b', 'Q56c', 'Q56d']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 57

new_column = 'Q57'
columns_to_merge = [
    'Q57a', 'Q57b', 'Q57c', 'Q57d', 'Q57e', 'Q57f', 'Q57g', 'Q57h', 
    'Q57i', 'Q57j', 'Q57k', 'Q57l', 'Q57m', 'Q57n'
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 70

new_column = 'Q70'
columns_to_merge = [
    'Q70a', 'Q70b', 'Q70c', 'Q70d', 'Q70e', 'Q70f', 'Q70g', 'Q70h', 
    'Q70i', 'Q70j', 'Q70k', 'Q70l'
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 79

new_column = 'Q79'
columns_to_merge = [
    'Q79a', 'Q79b', 'Q79c', 'Q79d', 'Q79e', 'Q79f', 'Q79g', 'Q79h', 'Q79i'
]


df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 88

new_column = 'Q88'
columns_to_merge = ['Q88a', 'Q88b', 'Q88c', 'Q88d', 'Q88e', 'Q88f']

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 123

new_column = 'Q123'
columns_to_merge = [
    'Q123a', 'Q123b', 'Q123c', 'Q123d', 'Q123e', 'Q123f', 'Q123g', 'Q123h', 'Q123i'
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 126

new_column = 'Q126'
columns_to_merge = [
    'Q126a', 'Q126b', 'Q126c', 'Q126d', 'Q126e', 'Q126f', 'Q126g'
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 135

new_column = 'Q135'
columns_to_merge = [
    'Q135a', 'Q135b', 'Q135c', 'Q135d', 'Q135e', 'Q135f'
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 136

new_column = 'Q136'
columns_to_merge = [
    'Q136a', 'Q136b', 'Q136c', 'Q136d', 'Q136e', 'Q136f', 'Q136g', 'Q136h', 
    'Q136i', 'Q136j', 'Q136k', 'Q136l', 'Q136m'
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

In [None]:
# 3: Question 137

new_column = 'Q137'
columns_to_merge = [
    'Q137a', 'Q137b', 'Q137c', 'Q137d', 'Q137e', 'Q137f', 'Q137g', 'Q137h', 
    'Q137i', 'Q137j', 'Q137k', 'Q137l', 'Q137m', 'Q137n', 'Q137o'
]

df[new_column] = np.nan

original_column_values = pd.concat([df[col].value_counts(dropna=False).rename(col) for col in columns_to_merge], axis=1)
original_column_values = original_column_values.reset_index().rename(columns={'index': 'Vals'})

for index, row in df.iterrows():
    selected_options = []
    if any(row[col] == 'S' for col in columns_to_merge):
        df.at[index, new_column] = 'S'
    elif any(row[col] == 'N' for col in columns_to_merge):
        df.at[index, new_column] = 'N'
    elif any(row[col] == 'Z' for col in columns_to_merge):
        df.at[index, new_column] = 'Z'
    else:
        for col in columns_to_merge:
            try:
                if float(row[col]) == 1.0 or float(row[col]) == 2.0:
                    selected_options.append(col)
            except (ValueError, TypeError):
                continue
        if selected_options:
            df.at[index, new_column] = ','.join(selected_options)

df = df.drop(columns=columns_to_merge)

new_column_values = pd.DataFrame(df[new_column].value_counts(dropna=False)).reset_index()
new_column_values.columns = ['Vals', f'{new_column}']

debug_display(original_column_values)
debug_display(new_column_values)

Export a smaller set for better view and chatgpt use

In [None]:
# Select the first 200 rows
# subset_df = df.head(200)

# Save the subset to a new CSV file
# subset_df.to_csv('subset_data.csv', index=False)

### 4. Create Target Feature Q_target 
1 : User     
2 : Non-User 

In [None]:
df['Q_target'] = df['Q100'].apply(
    lambda x: 1 if (pd.to_numeric(x, errors='coerce') in range(1, 31) or x == 'N') else (2 if x == 'S' else None)
)

df = df[~df['Q100'].isin(['Z', 'E'])]
df = df.dropna(subset=['Q_target'])
df = df.drop(columns=['Q100'])

In [None]:
q_target_counts = df['Q_target'].value_counts()
debug_print(f"Counts in Q_target:\n{q_target_counts}")
debug_print(f"Remaining rows in the DataFrame: {len(df)}")

### 4. Handle Missing Values

In [None]:
# Impute all true missing values (NaN) with the mode of the respective column
missing_counts = df.isna().sum()

for column, count in missing_counts.items():
    if count > 0:
        debug_print(f"Column {column}: {count} missing values")

for col in df.columns:
    df[col] = df[col].fillna(df[col].mode()[0])

missing_counts_after_imputation = df.isna().sum()

debug_print(f"Remaining missing values after imputation:\n{missing_counts_after_imputation}")
debug_print(f"Remaining rows in the DataFrame: {len(df)}")


In [None]:
# Replace all S values with 2 => "no".

s_counts = df.apply(lambda col: col[col == 'S'].count())
for column, count in s_counts.items():
    if count > 0:
        debug_print(f"{column}: {count}")
df = df.replace('S', '2')


In [None]:
# Impute N values with the mode of the respective columns.
rows_with_n_before = df.isin(['N']).any(axis=1).sum()
debug_print(f"Rows with 'N' before replacement: {rows_with_n_before}")

for column in df.columns:
    if df[column].isin(['N']).any():
        mode_value = df[column][df[column] != 'N'].mode()[0]
        df[column] = df[column].replace('N', mode_value)

rows_with_n_after = df.isin(['N']).any(axis=1).sum()
debug_print(f"Rows with 'N' after replacement: {rows_with_n_after}")

### 5. Handle Mixed Data Types


In [None]:
# Turn all types to str
df = df.astype(str)

def display_column_ranges(dataframe):
    for column in dataframe.columns:
        unique_values = dataframe[column].unique()
        try:
            sorted_values = sorted(unique_values)
        except TypeError:
            sorted_values = unique_values  # If sorting fails, leave as is
        debug_print(f"Column '{column}' has {len(unique_values)} unique values: {sorted_values}")

display_column_ranges(df)

# Data Encoding

### 1. Define Nominal Categorical Columns to Be Encoded

In [None]:
# Define nominal categorial columns to be encoded

TARGET_COLUMNS = [
    "Q2", "Q4", "Q5", "Q11", "Q12", "Q14", "Q15",
    "Q18a", "Q18b", "Q18c", "Q18d", "Q18e", "Q18f", "Q18g", "Q18h", "Q18i", "Q18j", "Q18k",
    "Q21a", "Q21b", "Q21c", "Q21d", "Q21e", "Q21f", "Q21g", "Q21h", "Q21i", "Q21j", "Q21k", "Q21l",
    "Q22a", "Q22b", "Q22c", "Q22d", "Q22e", "Q22f", "Q22g", "Q22h", "Q22i", "Q22j", "Q22k", "Q22l",
    "Q24", "Q29", "Q34", "Q42", "Q43", "Q56", "Q57", "Q58", "Q70", "Q79", "Q88", "Q123", "Q125", "Q135"
]

# for col in TARGET_COLUMNS:
#   df[col] = df[col].astype("category")

debug_print(df.dtypes[TARGET_COLUMNS])

### 2. Convert string value responses into lists for compatibility with encoder
("Q4a,Q4b") => ["Q4a","Q4b"]

In [None]:
# Split delimited strings into lists for compatibility with encoder.
existing_columns = [col for col in TARGET_COLUMNS if col in df.columns]
for col in existing_columns:
    df[col] = df[col].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 and isinstance(x[0], list) else (x.split(',') if isinstance(x, str) else []))


In [None]:

# Verify that lists were correctly created.
debug_display(df["Q4"].head(20))


### 3. Use a threshold merging function to get avoid extremely high cardinality 

In [None]:
import pandas as pd

def create_other_categories(df, cols, threshold=0.05):
    """
    Create 'Other' category for rare combinations in columns containing lists of categories.
    
    Parameters:
    - df: pd.DataFrame - Input DataFrame with categorical columns containing lists of categories.
    - cols: list - List of columns to process.
    - threshold: float - The minimum percentage of rows a combination should appear in to avoid being labeled as "Other".
    
    Returns:
    - df: pd.DataFrame - Updated DataFrame with 'Other' category for rare combinations.
    """
    for col in cols:
        debug_print(f"Processing column: {col}")

        # Flatten the lists into combinations of categories
        all_combinations = df[col].apply(lambda x: tuple(sorted(x)) if isinstance(x, list) else tuple([x]))
        
        # Calculate the frequency of each combination
        combination_counts = all_combinations.value_counts() / len(df)
        debug_print(f"Combination counts for {col}:\n{combination_counts.head(10)}")  # Show top 10 to inspect

        # Find combinations that appear less than the threshold
        rare_combinations = combination_counts[combination_counts < threshold].index
        debug_print(f"Combinations below threshold for {col}:\n{rare_combinations[:10]}")  # Print top 10 rare combinations

        # Apply 'Other' for rare combinations
        df[col] = df[col].apply(
            lambda x: [f"{col}_other" if tuple(sorted(x)) in rare_combinations else item for item in x]
        )

    return df



### 4. Use Target Mean Encoding with K-Fold Cross Validation to avoid data leakage.
Use smoothing to serve as regularization

In [None]:
# Define Mean Target Encoding with K-Fold Cross Validation 
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np

# Global parameters for flexibility
FOLDS = 5
RANDOM_STATE = 42
SMOOTHING_FACTOR = 10  # You can adjust this value

def target_mean_encoder_kfold(df, target_col, cols_encode, folds=FOLDS, random_state=RANDOM_STATE, smoothing_factor=SMOOTHING_FACTOR):
    """
    Target Mean Encoding for nominal categorical variables with K-Fold Cross Validation.
    Handles multi-choice columns stored as lists and applies smoothing to avoid overfitting.

    Parameters:
    - df: pd.DataFrame - Input DataFrame with columns to encode and target variable.
    - target_col: str - The target column name.
    - cols_encode: list - List of column names to target encode.
    - folds: int - Number of K-Folds for Cross Validation.
    - random_state: int - Random seed for reproducibility.
    - smoothing_factor: float - Smoothing factor to control the regularization.

    Returns:
    - pd.DataFrame - A new DataFrame with encoded columns for verification.
    """
    df_new = df.copy().reset_index(drop=True)  # Reset index to ensure consistent indexing
    kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    
    # Ensure the target column is numeric
    df[target_col] = pd.to_numeric(df[target_col], errors='coerce')

    # Perform K-Fold Encoding for each column in TARGET_COLUMNS
    for col in cols_encode:
        # Initialize the encoded column
        df_new[col + "_mean_enc"] = np.nan
        
        global_mean = df[target_col].mean()  # Global mean for fallback
        
        # Perform K-Fold Encoding
        for train_idx, val_idx in kf.split(df):
            train_data, val_data = df.iloc[train_idx], df.iloc[val_idx]
            
            # Exploding lists into individual rows for encoding
            category_means = (
                train_data.explode(col)
                .groupby(col)[target_col]
                .mean()
            )
            
            # Calculate the count of categories in the training set
            category_counts = (
                train_data.explode(col)
                .groupby(col).size()
            )
            
            # Apply smoothing to category means
            smoothed_means = {
                cat: (mean * count + global_mean * smoothing_factor) / (count + smoothing_factor)
                for cat, mean, count in zip(category_means.index, category_means.values, category_counts.values)
            }

            # Map smoothed encodings for the validation fold using .iloc to avoid index errors
            df_new.iloc[val_idx, df_new.columns.get_loc(col + "_mean_enc")] = (
                val_data[col]
                .apply(lambda x: np.mean([smoothed_means.get(val, global_mean) for val in x]) if isinstance(x, list) else global_mean)
            )
        
        # Replace NaN values in the column with the global mean
        df_new[col + "_mean_enc"].fillna(global_mean, inplace=True)
    
    return df_new.filter(like="_mean_enc", axis=1)


In [None]:
# Apply the encoder function to the full DataFrame

# Create 'Other' category for rare values
df = create_other_categories(df, TARGET_COLUMNS, threshold=0.053)

# Apply encoder
df_new = target_mean_encoder_kfold(df, target_col="Q_target", cols_encode=TARGET_COLUMNS)

debug_print(df_new.head())


In [None]:
# Display the total NaN count in the entire DataFrame
total_nan_count = df.isna().sum().sum()
print(f"Total NaN values in the DataFrame: {total_nan_count}")


Total NaN values in the DataFrame: 0


In [None]:
# Display the count of unique values for each column to verify Threshold Grouping Function
debug_print(df_new.nunique())


In [None]:
# Analyze the frequency distribution of unique values in the 'Q21a_mean_enc' column
import matplotlib.pyplot as plt

column = 'Q21a_mean_enc'

# Calculate value counts and their relative frequencies
value_counts = df_new[column].value_counts()
relative_frequencies = value_counts / len(df)

# Plot the distribution of frequencies
# plt.figure(figsize=(10, 6))
# value_counts.plot(kind='bar', color='skyblue')
# plt.title(f"Frequency Distribution of {column}")
# plt.xlabel(f"{column} Categories")
# plt.ylabel("Count")
# plt.xticks(rotation=90)
# plt.show()

# Display the relative frequencies of each category
debug_print(f"Relative frequencies for {column}:")
debug_print(relative_frequencies)


In [None]:
# Display the total NaN count in the entire DataFrame
total_nan_count = df.isna().sum().sum()
print(f"Total NaN values in the DataFrame: {total_nan_count}")


Total NaN values in the DataFrame: 0


In [None]:
# Apply the encoded columns into the original DataFrame

encoded_columns = [
    'Q2_mean_enc', 'Q4_mean_enc', 'Q5_mean_enc', 'Q11_mean_enc', 'Q12_mean_enc', 'Q14_mean_enc', 
    'Q15_mean_enc', 'Q18a_mean_enc', 'Q18b_mean_enc', 'Q18c_mean_enc', 'Q18d_mean_enc', 'Q18e_mean_enc', 
    'Q18f_mean_enc', 'Q18g_mean_enc', 'Q18h_mean_enc', 'Q18i_mean_enc', 'Q18j_mean_enc', 'Q18k_mean_enc', 'Q21a_mean_enc', 
    'Q21b_mean_enc', 'Q21c_mean_enc', 'Q21d_mean_enc', 'Q21e_mean_enc', 'Q21f_mean_enc', 'Q21g_mean_enc', 
    'Q21h_mean_enc', 'Q21i_mean_enc', 'Q21j_mean_enc', 'Q21k_mean_enc', 'Q21l_mean_enc', 'Q22a_mean_enc', 
    'Q22b_mean_enc', 'Q22c_mean_enc', 'Q22d_mean_enc', 'Q22e_mean_enc', 'Q22f_mean_enc', 'Q22g_mean_enc', 
    'Q22h_mean_enc', 'Q22i_mean_enc', 'Q22j_mean_enc', 'Q22k_mean_enc', 'Q22l_mean_enc', 'Q24_mean_enc', 
    'Q29_mean_enc', 'Q34_mean_enc', 'Q42_mean_enc', 'Q43_mean_enc', 'Q56_mean_enc', 'Q57_mean_enc', 
    'Q58_mean_enc', 'Q70_mean_enc', 'Q79_mean_enc', 'Q88_mean_enc', 'Q123_mean_enc', 'Q125_mean_enc', 'Q135_mean_enc'
]

original_columns_to_replace = [
    'Q2', 'Q4', 'Q5', 'Q11', 'Q12', 'Q14', 'Q15', 'Q18a', 'Q18b', 'Q18c', 'Q18d', 'Q18e', 'Q18f', 
    'Q18g', 'Q18h', 'Q18i', 'Q18j', 'Q18k', 'Q21a', 'Q21b', 'Q21c', 'Q21d', 'Q21e', 'Q21f', 'Q21g', 'Q21h', 
    'Q21i', 'Q21j', 'Q21k', 'Q21l', 'Q22a', 'Q22b', 'Q22c', 'Q22d', 'Q22e', 'Q22f', 'Q22g', 'Q22h', 
    'Q22i', 'Q22j', 'Q22k', 'Q22l', 'Q24', 'Q29', 'Q34', 'Q42', 'Q43', 'Q56', 'Q57', 'Q58', 'Q70', 
    'Q79', 'Q88', 'Q123', 'Q125', 'Q135'
]

# Ensure that both DataFrames have the same index
if not df.index.equals(df_new.index):
    df_new = df_new.set_index(df.index)

# Check for any missing columns in df_new
missing_columns = set(encoded_columns) - set(df_new.columns)
if missing_columns:
    print(f"Missing columns: {missing_columns}")
else:
    # Only select the encoded columns that exist in df_new
    encoded_columns = [col for col in encoded_columns if col in df_new.columns]

# Concatenate df with the encoded columns from df_new
df = pd.concat([df, df_new[encoded_columns]], axis=1)

# Check for any new NaN values after concatenation
nan_counts = df[encoded_columns].isna().sum()
if nan_counts.any():
    print(f"NaN counts in encoded columns after concatenation:\n{nan_counts}")
else:
    print("No NaN values found in the encoded columns.")


No NaN values found in the encoded columns.


In [None]:
# Display the total NaN count in the entire DataFrame
total_nan_count = df_new.isna().sum().sum()
print(f"Total NaN values in the DataFrame: {total_nan_count}")

Total NaN values in the DataFrame: 0


In [None]:
# Display the total NaN count in the entire DataFrame
total_nan_count = df.isna().sum().sum()
print(f"Total NaN values in the DataFrame: {total_nan_count}")


Total NaN values in the DataFrame: 0


In [None]:
print(list(df.columns))

['Q1', 'Q2', 'Q3', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q13', 'Q15', 'Q16', 'Q17', 'Q19', 'Q20', 'Q23', 'Q25', 'Q26', 'Q27', 'Q28', 'Q30', 'Q31', 'Q32', 'Q33', 'Q36', 'Q37', 'Q38', 'Q39', 'Q40', 'Q41', 'Q42m', 'Q43', 'Q44', 'Q45', 'Q46', 'Q47', 'Q48', 'Q49', 'Q50', 'Q51', 'Q52', 'Q53', 'Q54', 'Q55', 'Q58', 'Q59', 'Q60', 'Q61', 'Q62', 'Q63', 'Q64', 'Q65', 'Q66', 'Q67', 'Q68', 'Q69', 'Q71', 'Q72', 'Q73', 'Q74', 'Q75', 'Q76', 'Q77', 'Q78', 'Q80', 'Q81', 'Q82', 'Q83', 'Q84', 'Q85', 'Q86', 'Q87', 'Q89', 'Q90', 'Q91', 'Q92', 'Q93', 'Q94', 'Q95', 'Q96', 'Q97', 'Q98', 'Q99', 'Q101', 'Q102', 'Q103', 'Q104', 'Q105', 'Q106', 'Q107', 'Q108', 'Q109', 'Q110', 'Q111', 'Q112', 'Q113', 'Q114', 'Q115', 'Q116', 'Q117', 'Q118', 'Q119', 'Q120', 'Q121', 'Q122', 'Q124', 'Q125', 'Q127', 'Q128', 'Q129', 'Q130', 'Q131', 'Q132', 'Q133', 'Q134', 'Q138a', 'Q138b', 'Q138c', 'Q138d', 'Q138e', 'Q138f', 'Q138g', 'Q138h', 'Q138i', 'Q138j', 'Q138k', 'Q138l', 'Q138m', 'Q138n', 'Q138o', 'Q139a', 'Q139b', 'Q139c', 'Q139d', 'Q13

In [None]:
# Display the total NaN count in the entire DataFrame
total_nan_count = df.isna().sum().sum()
print(f"Total NaN values in the DataFrame: {total_nan_count}")


Total NaN values in the DataFrame: 0


### 5. Validate that the target encoding did not incur any data leakage.

In [None]:
def validate_target_encoding(df, target_col, cols_encode, folds=5, random_state=42):
    """
    Validate Target Mean Encoding to ensure there is no leakage from the validation fold.
    """
    kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    
    # Loop through each fold
    for train_idx, val_idx in kf.split(df):
        train_data, val_data = df.iloc[train_idx], df.iloc[val_idx]
        
        # Perform target encoding on the training data only
        category_means = {}
        for col in cols_encode:
            category_means[col] = train_data.groupby(col)[target_col].mean()
        
        # Encode the validation data using training data category means
        val_data_encoded = val_data.copy()
        
        # Check encoding for each column
        for col in cols_encode:
            val_data_encoded[col + "_mean_enc"] = val_data[col].map(category_means[col])
        
            # Check if any encoded values are leaking from the validation fold
            leakage_check = val_data_encoded[col + "_mean_enc"].equals(val_data[target_col])
            if leakage_check:
                debug_print(f"Potential data leakage detected for column: {col}")
            else:
                debug_print(f"No leakage detected for {col}")
            
            # Print the encoded values in the validation set to manually inspect
            debug_print(f"Encoded values for validation data (first 5 rows of {col}):")
            debug_print(val_data_encoded[[col, col + "_mean_enc"]].head())
        
    debug_print("Target encoding validation completed.")

# Use this function to validate target encoding
validate_target_encoding(df, target_col='Q_target', cols_encode=encoded_columns)

In [None]:
# Display the total NaN count in the entire DataFrame
total_nan_count = df.isna().sum().sum()
print(f"Total NaN values in the DataFrame: {total_nan_count}")


Total NaN values in the DataFrame: 0


# Export for Model Training

In [None]:
# Export to CSV
df.to_csv('pro_data.csv', index=False)

In [None]:
print(df.dtypes)


Q1                object
Q2                object
Q3                object
Q6                object
Q7                object
                  ...   
Q79_mean_enc     float64
Q88_mean_enc     float64
Q123_mean_enc    float64
Q125_mean_enc    float64
Q135_mean_enc    float64
Length: 261, dtype: object


In [None]:
df['Q1'].head()

0     5
4     4
5     6
7     1
8    10
Name: Q1, dtype: object