In [35]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA


In [36]:
# Load the raw survey export. Despite the .csv extension the file is tab-separated.
df_raw = pd.read_csv("data-final.csv", sep="\t")

# Preview the first five rows as the cell's rich output.
df_raw.head(5)

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,dateload,screenw,screenh,introelapse,testelapse,endelapse,IPC,country,lat_appx_lots_of_err,long_appx_lots_of_err
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,...,2016-03-03 02:01:01,768.0,1024.0,9.0,234.0,6,1,GB,51.5448,0.1991
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,...,2016-03-03 02:01:20,1360.0,768.0,12.0,179.0,11,1,MY,3.1698,101.706
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,...,2016-03-03 02:01:56,1366.0,768.0,3.0,186.0,7,1,GB,54.9119,-1.3833
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,...,2016-03-03 02:02:02,1920.0,1200.0,186.0,219.0,7,1,GB,51.75,-1.25
4,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,3.0,4.0,...,2016-03-03 02:02:57,1366.0,768.0,8.0,315.0,17,2,KE,1.0,38.0


In [37]:
# Row-quality threshold: keep rows with >= this many valid answers in each trait.
MIN_VALID_PER_TRAIT = 8

# Cluster counts compared during silhouette-based k selection.
K_CANDIDATES = (3, 4, 5)

# Maximum number of rows sampled for the silhouette computation (speed/robustness).
MAX_N_FOR_SIL = 4_000


In [38]:
def cols_with_prefix(columns, prefix):
    """Return the column names of the form ``prefix`` + integer index (e.g., EXT1..EXT10).

    Parameters
    ----------
    columns : iterable
        Column labels to scan (e.g. ``df.columns``). Labels that are not
        strings are skipped instead of raising ``TypeError``.
    prefix : str
        Literal prefix to look for; it is regex-escaped, so characters such
        as ``.`` are matched verbatim.

    Returns
    -------
    list
        Matching labels, in the order they appear in ``columns``.
    """
    # fullmatch (rather than match + "$") also rejects labels that end with a
    # stray newline, which "$" would otherwise tolerate.
    pattern = re.compile(rf"{re.escape(prefix)}\d+")
    return [
        label
        for label in columns
        if isinstance(label, str) and pattern.fullmatch(label)
    ]

def clamp_valid_1_to_5(df):
    """Convert every column to numeric and blank out (NaN) answers outside 1..5.

    Values that cannot be parsed as numbers become NaN as well. Despite the
    name, out-of-range values are not clipped to the bounds; they are
    replaced by NaN. A new DataFrame is returned.
    """
    numeric = df.apply(pd.to_numeric, errors="coerce")
    # NaN compares False on both sides, so unparsable cells stay NaN.
    in_range = numeric.ge(1) & numeric.le(5)
    return numeric.where(in_range)

In [39]:
# Detect item columns for each Big Five trait
ext_cols = cols_with_prefix(df_raw.columns, "EXT")  # Extraversion
est_cols = cols_with_prefix(df_raw.columns, "EST")  # Emotional Stability (→ N)
agr_cols = cols_with_prefix(df_raw.columns, "AGR")  # Agreeableness
csn_cols = cols_with_prefix(df_raw.columns, "CSN")  # Conscientiousness
opn_cols = cols_with_prefix(df_raw.columns, "OPN")  # Openness

question_cols = ext_cols + est_cols + agr_cols + csn_cols + opn_cols
if len(question_cols) == 0:
    raise ValueError("No Big Five item columns detected. Check your input file.")

# A trait with fewer than MIN_VALID_PER_TRAIT item columns can never satisfy the
# per-trait row-quality gate, so every row would be dropped and the failure
# would only surface later as an obscure empty-data error. Fail here instead,
# naming the offending traits.
_short_traits = {
    prefix: len(found)
    for prefix, found in (
        ("EXT", ext_cols),
        ("EST", est_cols),
        ("AGR", agr_cols),
        ("CSN", csn_cols),
        ("OPN", opn_cols),
    )
    if len(found) < MIN_VALID_PER_TRAIT
}
if _short_traits:
    raise ValueError(
        f"Too few item columns for trait(s) {_short_traits}; "
        f"each trait needs at least {MIN_VALID_PER_TRAIT}. Check your input file."
    )

In [40]:
# ---------------- 2) Clean items ----------------
# Restrict to the questionnaire columns, coerce them to numbers, and replace
# any answer that is not within 1..5 by NaN.
df_items = clamp_valid_1_to_5(df_raw[question_cols].copy())

# Row-level quality gate: a row survives only if every trait has at least
# MIN_VALID_PER_TRAIT non-missing answers.
mask = np.ones(len(df_items), dtype=bool)
for cols in (ext_cols, est_cols, agr_cols, csn_cols, opn_cols):
    cnt = df_items[cols].count(axis=1)  # non-NaN answers per row for this trait
    mask &= (cnt >= MIN_VALID_PER_TRAIT).to_numpy()

df_items_clean = df_items.loc[mask].reset_index(drop=True)


In [41]:
# ---------------- 3) Compute OCEAN (original 1..5) ----------------
# Each trait score is the row-wise mean of its item columns (NaN answers are
# skipped by mean). N is derived from the EST mean as 6 - mean(EST), which
# assumes the answers are on the original 1..5 scale.
# NOTE(review): items are averaged as-is; if the questionnaire contains
# reverse-keyed items they are not re-scored here — confirm against the codebook.
E, EST, A, C, O = (
    df_items_clean[group].mean(axis=1)
    for group in (ext_cols, est_cols, agr_cols, csn_cols, opn_cols)
)
N = 6 - EST

df_ocean = pd.DataFrame({"O": O, "C": C, "E": E, "A": A, "N": N})
df_ocean = df_ocean.dropna().reset_index(drop=True)


In [42]:
# ---------------- 4) Min-Max scaling (0..1) ----------------
# Fit the scaler on the five trait scores, then rescale each column to 0..1
# using the minimum and maximum observed in df_ocean.
scaler = MinMaxScaler()
scaler.fit(df_ocean[list("OCEAN")])
X_scaled = scaler.transform(df_ocean[list("OCEAN")])
df_scaled = pd.DataFrame(X_scaled, columns=list("OCEAN"))

In [43]:
# ---------------- 5) Pick k via silhouette (sample) ----------------
# Score on at most MAX_N_FOR_SIL rows; the subsample is drawn without
# replacement from a generator seeded with 42.
if len(df_scaled) <= MAX_N_FOR_SIL:
    X_sil = df_scaled.to_numpy()
else:
    rng = np.random.default_rng(42)
    idx = rng.choice(len(df_scaled), size=MAX_N_FOR_SIL, replace=False)
    X_sil = df_scaled.to_numpy()[idx]

# Keep the candidate with the strictly highest silhouette; on a tie the
# earlier candidate in K_CANDIDATES wins.
best_k = None
best_score = -1.0
for k in K_CANDIDATES:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = km.fit_predict(X_sil)
    sc = silhouette_score(X_sil, labels)
    if sc > best_score:
        best_k = k
        best_score = sc

print(f"[k selection] best_k={best_k}, silhouette(sample)={best_score:.4f}")

[k selection] best_k=3, silhouette(sample)=0.1801


In [49]:
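# # Silhouette Score Visualization (K selection) -- disabled: the full-data run of this cell was interrupted (KeyboardInterrupt); k is selected on a sample instead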

# # assuming df_scaled is your MinMax-scaled OCEAN DataFrame
# X = df_scaled.values

# scores = {}
# for k in range(2, 11):   # try from 2 to 10 clusters
#     km = KMeans(n_clusters=k, n_init=10, random_state=42)
#     labels = km.fit_predict(X)
#     score = silhouette_score(X, labels)
#     scores[k] = score
#     print(f"k={k}, silhouette={score:.3f}")

# # visualize the result
# plt.figure(figsize=(8,5))
# plt.plot(list(scores.keys()), list(scores.values()), marker='o')
# plt.title("Silhouette Score by K")
# plt.xlabel("Number of Clusters (k)")
# plt.ylabel("Silhouette Score")
# plt.xticks(np.arange(2, 11, 1))
# plt.grid(True)
# plt.tight_layout()
# plt.show()

# best_k = max(scores, key=scores.get)
# print(f"\n Best k = {best_k}  (Silhouette Score = {scores[best_k]:.3f})")


KeyboardInterrupt: 

In [44]:
# ---------------- 6) Final K-Means on full data ----------------
# Fit once on every scaled row with the selected best_k; labels_ holds the
# cluster index assigned to each row.
kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42)
kmeans.fit(df_scaled.to_numpy())
clusters = kmeans.labels_

# Map the cluster centers from the 0..1 scaled space back to the unscaled
# trait-score scale by inverting the Min-Max transform.
centers_scaled = kmeans.cluster_centers_
centers_raw = scaler.inverse_transform(centers_scaled)
centers_df = pd.DataFrame(centers_raw, columns=list("OCEAN")).assign(
    cluster=range(best_k)
)