In [None]:
import pandas as pd

# Load the per-user feature table, then print its (rows, columns) shape
# followed by the first five rows as a quick sanity check.
df = pd.read_csv("user_features.csv")
print(df.shape, df.head(), sep="\n")

In [None]:
# Baseline feature matrix taken from the raw (untransformed) numeric columns,
# with any missing entries replaced by 0.
# Note: a later cell rebuilds X from log-transformed, clipped features.
raw_feature_cols = [
    'total_impressions', 'total_clicks', 'total_nonclicks',
    'ctr', 'pvalue_level', 'shopping_level',
]
X = df[raw_feature_cols].fillna(0)

In [None]:
import numpy as np
import pandas as pd

# Log-transform the raw counts with log(1 + x), which is defined for zero counts.
df['log_impr'] = np.log1p(df['total_impressions'])
df['log_click'] = np.log1p(df['total_clicks'])

# Winsorize to suppress extreme outliers: each column is clamped to its own
# [1st percentile, 99th percentile] range, i.e. both tails are clipped.
# Note: this overwrites the columns in place, so later reads of df['ctr'] see
# the clipped values rather than the values loaded from the CSV.
for col in ('log_impr', 'log_click', 'ctr'):
    lower, upper = df[col].quantile([0.01, 0.99])
    df[col] = df[col].clip(lower, upper)

# Final model inputs; remaining missing values are filled with 0.
features = ['log_impr', 'log_click', 'ctr', 'pvalue_level', 'shopping_level', 'age_level']
X = df[features].fillna(0)


In [None]:
from sklearn.preprocessing import RobustScaler
# Fit the robust scaler on X, then apply the fitted transform to the same data.
scaler = RobustScaler().fit(X)
X_scaled = scaler.transform(X)



In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit KMeans once for every k from 2 to 10 and record the
# inertia (within-cluster sum of squared distances) of each fit.
K_range = range(2, 11)
sse = []
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    sse.append(kmeans.inertia_)

# Plot inertia against k; the point where the curve flattens is the candidate k.
fig, ax = plt.subplots()
ax.plot(K_range, sse, marker='o')
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("SSE (inertia)")
ax.set_title("Elbow Method for Optimal k")
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Fit the final segmentation with K = 5
# (you may also experiment with K = 3, 5, or 6 to compare results).
#
# n_init=10 matches the setting used for the elbow curve, so the final model is
# selected from the same number of random initializations that produced the
# inertia values k was chosen from. n_init='auto' would run only a single
# k-means++ initialization and additionally requires scikit-learn >= 1.2.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)

# Store each row's cluster label on the frame so later cells can group and color by it.
df['cluster'] = kmeans.fit_predict(X_scaled)

print(df[['user_id','cluster']].head())


In [None]:
# Number of rows assigned to each cluster, largest cluster first.
cluster_sizes = df['cluster'].value_counts()
print(cluster_sizes)


In [None]:
# Per-cluster mean of the headline metrics, used to characterize each segment.
# Note: 'ctr' was clipped in place earlier, so its mean is over the clipped values.
profile_cols = ['total_impressions', 'total_clicks', 'ctr', 'pvalue_level', 'shopping_level']
cluster_profile = df.groupby('cluster')[profile_cols].mean()
print(cluster_profile)


In [None]:
import matplotlib.pyplot as plt

# Raw impression volume against CTR, one point per row, colored by cluster label.
fig, ax = plt.subplots()
ax.scatter(df['total_impressions'], df['ctr'], c=df['cluster'], cmap='viridis', alpha=0.5)
ax.set(xlabel="Total Impressions", ylabel="CTR", title="User Segmentation by KMeans")
plt.show()


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the scaled features onto their first two principal components.
# random_state is pinned (as in the 3D PCA cells) because, with the default
# svd_solver='auto', scikit-learn may select the randomized SVD solver, which
# would otherwise let the projection vary from run to run.
pca = PCA(n_components=2, random_state=42)
Z = pca.fit_transform(X_scaled)

# One point per row in PC space, colored by its KMeans cluster label.
plt.scatter(Z[:,0], Z[:,1], c=df['cluster'], cmap='viridis', s=5, alpha=0.5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Clusters visualized in PCA space")
plt.show()


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Reduce the scaled features to three principal components for a 3D view.
pca = PCA(n_components=3, random_state=42)
Z = pca.fit_transform(X_scaled)
pc1, pc2, pc3 = Z[:, 0], Z[:, 1], Z[:, 2]

# Static 3D scatter: one point per row, colored by cluster label.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
sc = ax.scatter(pc1, pc2, pc3, c=df['cluster'], cmap='viridis', s=5, alpha=0.6)
ax.set(
    xlabel="PC1",
    ylabel="PC2",
    zlabel="PC3",
    title="Clusters visualized in 3D PCA space",
)

# The colorbar maps the viridis colors back to cluster labels.
fig.colorbar(sc, ax=ax, label="Cluster")
plt.show()


In [None]:
# Sanity check: the PCA output and the label column should have the same row count.
pca_shape, label_shape = Z.shape, df['cluster'].shape
print(pca_shape, label_shape)


In [None]:
import plotly.io as pio
# Use Plotly's "notebook" renderer for figures shown from here on.
# Alternatives noted by the author: "colab" / "notebook_connected".
# Note: a later cell assigns "colab" to the same setting, overriding this value.
pio.renderers.default = "notebook"


In [None]:


import plotly.io as pio
import plotly.express as px
from sklearn.decomposition import PCA

# Render Plotly figures with the Colab renderer.
pio.renderers.default = "colab"

# Reduce the scaled features to three principal components.
pca = PCA(n_components=3, random_state=42)
Z = pca.fit_transform(X_scaled)
pc1, pc2, pc3 = Z[:, 0], Z[:, 1], Z[:, 2]

# Cast labels to strings so Plotly treats clusters as discrete categories
# (one legend entry per cluster) instead of a continuous color scale.
cluster_labels = df['cluster'].astype(str)

# Interactive 3D scatter of the principal components.
fig = px.scatter_3d(
    x=pc1,
    y=pc2,
    z=pc3,
    color=cluster_labels,
    opacity=0.6,
    title="3D PCA User Clusters",
)
fig.show()


In [None]:
!pip install umap-learn


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import umap.umap_ as umap

# ========== random sampling ==========
# UMAP is fitted on a random subset of rows (presumably to keep runtime manageable).
# The requested size is capped at the number of available rows because
# rng.choice(..., replace=False) raises ValueError when asked for more samples
# than the population contains.
sample_size = min(20000, len(X_scaled))
rng = np.random.default_rng(seed=42)   # fixed seed so the same rows are drawn on every run
idx = rng.choice(len(X_scaled), size=sample_size, replace=False)

X_sample = X_scaled[idx]
# Positional lookup keeps the labels aligned with the rows picked from X_scaled.
y_sample = df['cluster'].iloc[idx]

# ========== UMAP dimensionality reduction ==========
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='euclidean',
    random_state=42
)

Z_umap = reducer.fit_transform(X_sample)

# ========== Visualization ==========
plt.figure(figsize=(8,6))
plt.scatter(Z_umap[:,0], Z_umap[:,1], c=y_sample, cmap='viridis', s=5, alpha=0.5)
# The title reports the actual sample size; for the default 20000 rows it reads "20k".
plt.title(f"Clusters visualized by UMAP (sampled {sample_size / 1000:g}k users)")
plt.xlabel("UMAP1")
plt.ylabel("UMAP2")
plt.show()
