# Set-Up

In [1]:
import numpy as np
from kagglehub import KaggleDatasetAdapter, dataset_load
from dotenv import load_dotenv
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import umap
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Suppress UMAP job warning.
warnings.filterwarnings(
    action="ignore",
    message=".*n_jobs value 1 overridden.*"
)

In [2]:
# Load variables from a local .env file into the process environment.
# Presumably these are the Kaggle credentials kagglehub needs — TODO confirm.
# The result is bound to `_` so the cell does not echo it.
_ = load_dotenv()

In [3]:
# Single random seed passed as `random_state` to the stochastic steps below
# (K-Means, PCA, UMAP, train/test split, random forest).
seed = 8675309

# Data-Fetching

In [4]:
# `dataset_load` is annotated as returning Any, but with the PANDAS adapter it
# hands back a DataFrame. The explicit annotation is only there for IDE
# convenience (auto-complete, typo-checking, etc.).
df: pd.DataFrame = dataset_load(
    handle="miroslavsabo/young-people-survey",
    path="responses.csv",
    adapter=KaggleDatasetAdapter.PANDAS,
)

# Exploration

In [5]:
# Summary statistics of the numeric columns as loaded (before any cleaning).
df.describe()

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,...,Shopping centres,Branded clothing,Entertainment spending,Spending on looks,Spending on gadgets,Spending on healthy eating,Age,Height,Weight,Number of siblings
count,1007.0,1008.0,1006.0,1005.0,1005.0,1003.0,1008.0,1007.0,1004.0,1007.0,...,1008.0,1008.0,1007.0,1007.0,1010.0,1008.0,1003.0,990.0,990.0,1004.0
mean,4.731877,3.328373,3.11332,2.288557,2.123383,2.956132,2.761905,3.471698,3.761952,2.36147,...,3.234127,3.050595,3.201589,3.106256,2.870297,3.55754,20.433699,173.514141,66.405051,1.297809
std,0.664049,0.833931,1.170568,1.138916,1.076136,1.25257,1.260845,1.1614,1.184861,1.372995,...,1.323062,1.306321,1.188947,1.205368,1.28497,1.09375,2.82884,10.024505,13.839561,1.013348
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,15.0,62.0,41.0,0.0
25%,5.0,3.0,2.0,1.0,1.0,2.0,2.0,3.0,3.0,1.0,...,2.0,2.0,2.0,2.0,2.0,3.0,19.0,167.0,55.0,1.0
50%,5.0,3.0,3.0,2.0,2.0,3.0,3.0,4.0,4.0,2.0,...,3.0,3.0,3.0,3.0,3.0,4.0,20.0,173.0,64.0,1.0
75%,5.0,4.0,4.0,3.0,3.0,4.0,4.0,4.0,5.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,22.0,180.0,75.0,2.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,30.0,203.0,165.0,10.0


In [6]:
# verbose=True lists every column instead of a truncated summary;
# show_counts=True adds the non-null count per column so gaps are visible.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 150 columns):
 #    Column                          Non-Null Count  Dtype  
---   ------                          --------------  -----  
 0    Music                           1007 non-null   float64
 1    Slow songs or fast songs        1008 non-null   float64
 2    Dance                           1006 non-null   float64
 3    Folk                            1005 non-null   float64
 4    Country                         1005 non-null   float64
 5    Classical music                 1003 non-null   float64
 6    Musical                         1008 non-null   float64
 7    Pop                             1007 non-null   float64
 8    Rock                            1004 non-null   float64
 9    Metal or Hardrock               1007 non-null   float64
 10   Punk                            1002 non-null   float64
 11   Hiphop, Rap                     1006 non-null   float64
 12   Reggae, Ska       

Most of the columns appear to hold integers, and most of those are Likert-scale responses (1-5). The object-typed columns are actually strings that have not been converted to categorical types; that conversion will need to be done manually, because some of them are ordinal and their ordering was probably lost.

# Cleaning

## Simple type conversion

In [7]:
# Let pandas pick nullable extension dtypes: the whole-number float columns
# become Int64 and the text columns become string, with missing values kept
# as <NA>. Note that this rebinds `df`, so the raw frame is no longer available.
df = df.convert_dtypes()

In [8]:
# Same summary as before the conversion; the statistics should be unchanged
# because only the dtypes were altered.
df.describe()

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,...,Shopping centres,Branded clothing,Entertainment spending,Spending on looks,Spending on gadgets,Spending on healthy eating,Age,Height,Weight,Number of siblings
count,1007.0,1008.0,1006.0,1005.0,1005.0,1003.0,1008.0,1007.0,1004.0,1007.0,...,1008.0,1008.0,1007.0,1007.0,1010.0,1008.0,1003.0,990.0,990.0,1004.0
mean,4.731877,3.328373,3.11332,2.288557,2.123383,2.956132,2.761905,3.471698,3.761952,2.36147,...,3.234127,3.050595,3.201589,3.106256,2.870297,3.55754,20.433699,173.514141,66.405051,1.297809
std,0.664049,0.833931,1.170568,1.138916,1.076136,1.25257,1.260845,1.1614,1.184861,1.372995,...,1.323062,1.306321,1.188947,1.205368,1.28497,1.09375,2.82884,10.024505,13.839561,1.013348
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,15.0,62.0,41.0,0.0
25%,5.0,3.0,2.0,1.0,1.0,2.0,2.0,3.0,3.0,1.0,...,2.0,2.0,2.0,2.0,2.0,3.0,19.0,167.0,55.0,1.0
50%,5.0,3.0,3.0,2.0,2.0,3.0,3.0,4.0,4.0,2.0,...,3.0,3.0,3.0,3.0,3.0,4.0,20.0,173.0,64.0,1.0
75%,5.0,4.0,4.0,3.0,3.0,4.0,4.0,4.0,5.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,22.0,180.0,75.0,2.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,30.0,203.0,165.0,10.0


In [9]:
# Re-inspect every column to confirm the new dtypes (Int64 / string) and that
# the non-null counts are the same as before the conversion.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 150 columns):
 #    Column                          Non-Null Count  Dtype 
---   ------                          --------------  ----- 
 0    Music                           1007 non-null   Int64 
 1    Slow songs or fast songs        1008 non-null   Int64 
 2    Dance                           1006 non-null   Int64 
 3    Folk                            1005 non-null   Int64 
 4    Country                         1005 non-null   Int64 
 5    Classical music                 1003 non-null   Int64 
 6    Musical                         1008 non-null   Int64 
 7    Pop                             1007 non-null   Int64 
 8    Rock                            1004 non-null   Int64 
 9    Metal or Hardrock               1007 non-null   Int64 
 10   Punk                            1002 non-null   Int64 
 11   Hiphop, Rap                     1006 non-null   Int64 
 12   Reggae, Ska                     

## Rebuilding categories

The string-typed columns need to be checked for the values they contain. There are only 11 of them, so a hybrid approach is enough: print each column's unique values programmatically, then eyeball the result.

In [10]:
# Print every string column's name followed by its distinct values (indented by
# two spaces), with a blank line between columns. Missing values show up as <NA>.
string_frame = df.select_dtypes(include="string")

sections = []
for column_name in string_frame.columns:
    value_lines = "\n".join(
        f"  {value}" for value in string_frame[column_name].unique()
    )
    sections.append(f"{column_name}\n" + value_lines)

print("\n\n".join(sections))

Smoking
  never smoked
  tried smoking
  former smoker
  current smoker
  <NA>

Alcohol
  drink a lot
  social drinker
  never
  <NA>

Punctuality
  i am always on time
  i am often early
  i am often running late
  <NA>

Lying
  never
  sometimes
  only to avoid hurting someone
  everytime it suits me
  <NA>

Internet usage
  few hours a day
  most of the day
  less than an hour a day
  no time at all

Gender
  female
  male
  <NA>

Left - right handed
  right handed
  left handed
  <NA>

Education
  college/bachelor degree
  secondary school
  primary school
  masters degree
  doctorate degree
  currently a primary school pupil
  <NA>

Only child
  no
  yes
  <NA>

Village - town
  village
  city
  <NA>

House - block of flats
  block of flats
  house/bungalow
  <NA>


The columns appear to be very clean. The only issue is that most of these columns are ordinal and that ordering information was lost, so it has to be defined manually.

In [11]:
# Declarative description of every string column:
#   column name -> (category labels, whether the labels are ordered).
# For ordered columns the labels run from the lowest to the highest level, so
# the resulting integer codes (0, 1, 2, ...) can be used as an ordinal scale.
category_levels = {
    "Smoking": (
        ["never smoked", "tried smoking", "former smoker", "current smoker"],
        True,
    ),
    "Alcohol": (
        ["never", "social drinker", "drink a lot"],
        True,
    ),
    # Least to most punctual: late < on time < early. Placing "often early"
    # between "running late" and "always on time" would make the scale
    # non-monotone.
    # NOTE(review): believed to match the questionnaire's answer order (reversed)
    # — confirm against the dataset's column description file.
    "Punctuality": (
        ["i am often running late", "i am always on time", "i am often early"],
        True,
    ),
    # Increasing willingness to lie: lying only to spare someone's feelings is
    # a narrower admission than lying "sometimes".
    # NOTE(review): believed to match the questionnaire's answer order —
    # confirm against the dataset's column description file.
    "Lying": (
        [
            "never",
            "only to avoid hurting someone",
            "sometimes",
            "everytime it suits me",
        ],
        True,
    ),
    "Internet usage": (
        [
            "no time at all",
            "less than an hour a day",
            "few hours a day",
            "most of the day",
        ],
        True,
    ),
    "Gender": (["female", "male"], False),
    "Left - right handed": (["left handed", "right handed"], False),
    "Education": (
        [
            "currently a primary school pupil",
            "primary school",
            "secondary school",
            "college/bachelor degree",
            "masters degree",
            "doctorate degree",
        ],
        True,
    ),
    "Only child": (["no", "yes"], False),
    "Village - town": (["village", "city"], False),
    "House - block of flats": (["block of flats", "house/bungalow"], False),
}

for column, (levels, is_ordered) in category_levels.items():
    # astype() silently converts any label that is not listed in `levels` to
    # NA, so a typo above would quietly destroy data. Fail loudly instead.
    unexpected = set(df[column].dropna().unique()) - set(levels)
    if unexpected:
        raise ValueError(
            f"Column {column!r} contains labels with no category: {sorted(unexpected)}"
        )

    df[column] = df[column].astype(
        CategoricalDtype(categories=levels, ordered=is_ordered)
    )

In [12]:
# Count / unique / top / freq for the categorical columns only, to confirm that
# every string column was converted and that no labels were dropped.
df.select_dtypes("category").describe()

Unnamed: 0,Smoking,Alcohol,Punctuality,Lying,Internet usage,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
count,1002,1005,1008,1008,1010,1004,1007,1009,1008,1006,1006
unique,4,3,3,4,4,2,2,6,2,2,2
top,tried smoking,social drinker,i am always on time,sometimes,few hours a day,female,right handed,secondary school,no,city,block of flats
freq,430,659,399,549,744,593,906,621,754,707,595


In [13]:
# Dtype and non-null count of each categorical column; the counts show how many
# values still need to be filled before clustering.
df.select_dtypes("category").info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Smoking                 1002 non-null   category
 1   Alcohol                 1005 non-null   category
 2   Punctuality             1008 non-null   category
 3   Lying                   1008 non-null   category
 4   Internet usage          1010 non-null   category
 5   Gender                  1004 non-null   category
 6   Left - right handed     1007 non-null   category
 7   Education               1009 non-null   category
 8   Only child              1008 non-null   category
 9   Village - town          1006 non-null   category
 10  House - block of flats  1006 non-null   category
dtypes: category(11)
memory usage: 12.7 KB


## Filling NA for clustering

### Splitting column types for different fill methods

In [14]:
# Group the columns by how they must be filled and encoded later on.

# Integer columns that are non-Likert (true measurements / counts).
integer_cols = ["Age", "Height", "Weight", "Number of siblings"]

# Integer columns that are Likert: every remaining numeric column.
# Index.difference returns the labels sorted, so this order is deterministic.
likert_cols = list(df.select_dtypes("number").columns.difference(integer_cols))

# All categorical columns, in frame order.
category_cols = [c for c in df.columns if isinstance(df[c].dtype, CategoricalDtype)]

# Categorical columns whose categories carry an order.
ordinal_cols = [c for c in category_cols if df[c].dtype.ordered]

# Categorical columns without an order. Built with a list comprehension rather
# than a set difference: set iteration order for strings changes between
# interpreter sessions (hash randomisation), and this list later determines the
# order of the one-hot columns, which in turn affects the seeded models.
nominal_cols = [c for c in category_cols if c not in ordinal_cols]

### Fill based on column type

In [15]:
# TODO: consider revisiting this to fill based on gender instead of the
# overall column statistics.


def _median_category(col):
    """Return the category label sitting at the (rounded) median code of `col`.

    Missing entries have code -1; they are masked out so they do not drag the
    median down. The median is rounded with the built-in round().
    """
    observed_codes = col.cat.codes.replace(-1, np.nan)
    return col.cat.categories[round(observed_codes.median())]


# Non-Likert integers: cast to float first so that a fractional (interpolated)
# median can be stored, then fill the gaps with the column median.
integer_medians = df[integer_cols].median()
df[integer_cols] = df[integer_cols].astype(float).fillna(integer_medians)

# Likert integers: same treatment, again ending up as float columns.
# TODO: consider probabilistic median rounding if normal rounding proves to be
# an issue.
likert_medians = df[likert_cols].median()
df[likert_cols] = df[likert_cols].astype(float).fillna(likert_medians)

# Ordinals: treat the category codes numerically and fill with the median
# response of each column.
ordinal_fill_values = df[ordinal_cols].apply(_median_category)
df[ordinal_cols] = df[ordinal_cols].fillna(ordinal_fill_values)

# Nominals: fill with the most frequent label of each column.
nominal_fill_values = df[nominal_cols].mode().iloc[0]
df[nominal_cols] = df[nominal_cols].fillna(nominal_fill_values)

### Check for NA

In [16]:
# Total number of missing cells in the whole frame (sum per column, then sum of
# those); .item() unwraps the NumPy scalar into a plain Python int for display.
# This must be 0 before the frame is handed to the clustering code.
df.isna().sum().sum().item()

0

## Clustering

### Numeric Re-Encoding & Transformation

In [17]:
std_scaler = StandardScaler()
pca_likert = PCA(n_components=0.90)

# Stage 1 - ordinals: replace each ordered categorical with its integer codes.
ordinal_codes = {col: df[col].cat.codes for col in ordinal_cols}
df_encoded = df.assign(**ordinal_codes)

# Stage 2 - scaling: standardise every non-nominal column. The scaled matrix is
# transposed so that each row lines up with one column name.
scalable = df_encoded.drop(columns=nominal_cols)
scaled_values = std_scaler.fit_transform(scalable)
df_scaled = df_encoded.assign(**dict(zip(scalable.columns, scaled_values.T)))

# Stage 3 - PCA: swap the Likert columns for the principal components that
# together explain 90% of their variance.
likert_components = pd.DataFrame(
    pca_likert.fit_transform(df_scaled[likert_cols]),
    index=df_scaled.index,
).add_prefix("Likert_PC_")
df_reduced = pd.concat(
    [df_scaled.drop(columns=likert_cols), likert_components],
    axis=1,
)

# Stage 4 - nominals: one-hot encode as 0/1 integer columns.
df_trans = pd.get_dummies(
    df_reduced,
    columns=nominal_cols,
    prefix=nominal_cols,
    dtype=int,
)

### K-Means

#### Elbow Method for K

In [18]:
k_range = range(2, 25)


def _inertia_for(n_clusters):
    """Fit a seeded K-Means with `n_clusters` on df_trans and return its inertia."""
    model = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10)
    return model.fit(df_trans).inertia_


# One inertia value per candidate cluster count.
inertia = [_inertia_for(k) for k in k_range]

fig = px.line(
    x=k_range,
    y=inertia,
    title="Inertia vs. Number of Clusters for Elbow Method",
    labels={"x": "Clusters", "y": "Inertia"},
)
# Drop the left/right/bottom margins so the plot fills the cell width.
fig.update_layout(margin={"l": 0, "r": 0, "b": 0})
fig.show()

The elbow method shows no reasonable cut-off for number of clusters.

#### Alternate Silhouette Scores

In [19]:
k_range = range(2, 25)


def _silhouette_for(n_clusters):
    """Cluster df_trans with a seeded K-Means and return the silhouette score."""
    cluster_labels = KMeans(
        n_clusters=n_clusters,
        random_state=seed,
        n_init=10,
    ).fit_predict(df_trans)
    return silhouette_score(X=df_trans, labels=cluster_labels)


# One silhouette score per candidate cluster count.
sil_scores = [_silhouette_for(k) for k in k_range]

fig = px.line(
    x=k_range,
    y=sil_scores,
    title="Silhouette Scores vs. Number of Clusters",
    labels={"x": "Clusters", "y": "Silhouette Score"},
)
# Drop the left/right/bottom margins so the plot fills the cell width.
fig.update_layout(margin={"l": 0, "r": 0, "b": 0})
fig.show()

Very low scores, which implies no clusters. Likely due to very high dimensionality.

### PCA

In [20]:
# Project the transformed frame onto its first three principal components.
pca_3d = PCA(n_components=3, random_state=seed)
df_pca_3d = pd.DataFrame(
    pca_3d.fit_transform(df_trans),
    columns=["PC1", "PC2", "PC3"],
)

fig = px.scatter_3d(
    df_pca_3d,
    x="PC1",
    y="PC2",
    z="PC3",
    title="PCA 3D Projection",
    opacity=0.8,
)
# Small markers keep the individual points distinguishable.
fig.update_traces(marker={"size": 2})
fig.update_layout(margin={"l": 0, "r": 0, "b": 0})
fig.show()

PCA fails to distinguish clusters. Either there truly are no clusters, or the relationships are very non-linear.

### UMAP

In [21]:
# Seeded 2-D UMAP embedding of the transformed frame.
umap_2d_reducer = umap.UMAP(
    n_components=2,
    n_neighbors=30,
    min_dist=0.05,
    random_state=seed,
)
umap_2d_embedding = umap_2d_reducer.fit_transform(df_trans)

fig = px.scatter(
    x=umap_2d_embedding[:, 0],
    y=umap_2d_embedding[:, 1],
    opacity=0.5,
    title="UMAP 2D Projection",
)
fig.update_traces(marker={"size": 5})
fig.update_layout(margin={"l": 0, "r": 0, "b": 0})
fig.show()

A blob. No clusters.

In [22]:
# Seeded 3-D UMAP embedding, using a larger neighbourhood (50) than the 2-D run.
umap_3d_reducer = umap.UMAP(
    n_components=3,
    n_neighbors=50,
    min_dist=0.05,
    random_state=seed,
)
umap_3d_embedding = umap_3d_reducer.fit_transform(df_trans)

fig = px.scatter_3d(
    x=umap_3d_embedding[:, 0],
    y=umap_3d_embedding[:, 1],
    z=umap_3d_embedding[:, 2],
    opacity=0.6,
    title="UMAP 3D Projection",
)
fig.update_traces(marker={"size": 2})
fig.update_layout(margin={"l": 0, "r": 0, "b": 0})
fig.show()

Another blob. No clusters.

In [23]:
# Real-vs-shuffled test: stack the real rows on top of a synthetic copy in
# which every column has been permuted independently. The copy keeps each
# column's marginal distribution but destroys the relationships between
# columns. If a classifier cannot tell the two halves apart, the real data has
# no joint structure it can exploit.

# One seeded generator is shared by all columns so the result is reproducible.
# Its state advances between calls, so each column gets a *different*
# permutation; reusing the same seed per column would shuffle every column
# identically and leave the rows intact.
shuffle_rng = np.random.default_rng(seed)

df_noise_mix = pd.concat(
    [
        df_trans.assign(is_real=True),
        df_trans
            .apply(lambda c: shuffle_rng.permutation(c.to_numpy()))
            .assign(is_real=False),
    ],
    ignore_index=True,
)

# Seeded 70/30 split, kept in a dict keyed by the conventional names.
noise_mix_split = dict(zip(
    ("X_train", "X_test", "y_train", "y_test"),
    train_test_split(
        df_noise_mix.drop(columns="is_real"),
        df_noise_mix["is_real"],
        test_size=0.3,
        random_state=seed,
    )
))

noise_mix_classifier = RandomForestClassifier(n_estimators=100, random_state=seed)\
    .fit(
        X=noise_mix_split["X_train"],
        y=noise_mix_split["y_train"],
    )

# Held-out accuracy; about 0.5 means real and shuffled rows are indistinguishable.
accuracy_score(
    y_true=noise_mix_split["y_test"],
    y_pred=noise_mix_classifier.predict(noise_mix_split["X_test"]),
)

0.45874587458745875

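Accuracy near 50% on a balanced (50/50) mix of real rows and rows whose columns were shuffled independently means the classifier cannot tell real data from noise with the same marginals. That implies there is little joint structure between the features, and therefore effectively no clustering. Time for a new dataset.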