## Loading modules and the dataset

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 
import xgboost
import bs4
import re

In [2]:
# Training dataset, read from a comma-separated file
original_df = pd.read_csv("data/train.csv", sep=",")

## Linking the question labels to the question titles

In [None]:
html_file = "data/USCODE22_LLCP_102523.HTML"
out_csv = "data/labels_questions.csv"

# Patterns are compiled once instead of being re-resolved for every cell.
# NOTE(review): the label is cut at the first occurrence of "Section", "SAS"
# or "Question" (or at the end of the text), so a label that itself contains
# one of these words would be truncated -- confirm against the codebook.
LABEL_PATTERN = re.compile(r"Label:\s*(.*?)\s*(?:Section|SAS|Question|$)")
SAS_NAME_PATTERN = re.compile(r"SAS\s*Variable\s*Name:\s*([A-Za-z0-9_]+)")
LABEL_COLUMNS = ["Label", "SAS_Variable_Name"]


def parse_question_block(text):
    """Extract the label and SAS variable name from one block of codebook text.

    Parameters
    ----------
    text : str
        Flattened text of a single table cell.

    Returns
    -------
    dict or None
        ``{"Label": ..., "SAS_Variable_Name": ...}`` where a field that is not
        found is an empty string, or ``None`` when neither field is found (so
        the caller does not record an all-blank row).
    """
    label_match = LABEL_PATTERN.search(text)
    sas_match = SAS_NAME_PATTERN.search(text)

    label = label_match.group(1).strip() if label_match else ""
    sas_name = sas_match.group(1) if sas_match else ""

    if not label and not sas_name:
        return None
    return {"Label": label, "SAS_Variable_Name": sas_name}


# Reading HTML
with open(html_file, encoding="latin-1") as f:
    html = f.read()

soup = bs4.BeautifulSoup(html, "html.parser")

# Extracting blocks of questions: one record per matching table cell
questions = []
for td in soup.find_all("td", class_="l m linecontent"):
    record = parse_question_block(td.get_text(separator=" ", strip=True))
    if record is not None:
        questions.append(record)

# Conversion to df; explicit columns keep the schema stable even when no
# block was extracted, so later column lookups still work.
df = pd.DataFrame(questions, columns=LABEL_COLUMNS)

# Saving labels df
df.to_csv(out_csv, index=False, encoding="utf-8")


In [11]:
# Count how often each label occurs; a count above 1 means the same label
# is attached to more than one SAS variable.
dup_counts = df["Label"].value_counts()
duplicates = dup_counts.loc[dup_counts.gt(1)]

print(f"Unique labels: {len(dup_counts)}")
print(f"Duplicate labels: {len(duplicates)}")

if not duplicates.empty:
    print("\nDuplicates & Occurrences:")
    print(duplicates)

    # Rows whose label is shared, sorted so identical labels sit together
    shares_label = df["Label"].isin(duplicates.index)
    dup_rows = df.loc[shares_label].sort_values("Label").reset_index(drop=True)
    print("\nLines Duplicates:")
    print(dup_rows)
else:
    print("No duplicates")

Unique labels: 317
Duplicate labels: 5

Duplicates & Occurrences:
Label
Are you male or female?              4
Do you live in college housing?      2
Number of Adults in Household        2
Are you 18 years of age or older?    2
Sexual orientation                   2
Name: count, dtype: int64

Lines Duplicates:
                                Label SAS_Variable_Name
0   Are you 18 years of age or older?           LADULT1
1   Are you 18 years of age or older?           CADULT1
2             Are you male or female?          COLGSEX1
3             Are you male or female?          LANDSEX1
4             Are you male or female?          CELLSEX1
5             Are you male or female?          BIRTHSEX
6     Do you live in college housing?          COLGHOUS
7     Do you live in college housing?          CCLGHOUS
8       Number of Adults in Household          NUMADULT
9       Number of Adults in Household           HHADULT
10                 Sexual orientation            SOMALE
11              