# Task 5 

Assign:
- engagement score to each user. 
Consider the engagement score as the Euclidean distance between the user data point & the less engaged cluster (use the first clustering for this) (Euclidean Distance)


In [3]:
# Load the data
from src.utils import fetch_data_from_db_table

df = fetch_data_from_db_table('clean_data')

In [5]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance

# Select only the columns with experience metrics
experience_metrics = ['Dur. (ms)', 'Avg RTT DL (ms)', 'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)']
df_experience = df[experience_metrics]

# Handle missing values if any
df_experience = df_experience.dropna()

# Scale the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_experience)

# Perform k-means clustering
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(df_scaled)
# Convert df_scaled back to a DataFrame
df_scaled = pd.DataFrame(df_scaled, columns=df_experience.columns)

# Add the cluster labels to the original DataFrame
df_experience['cluster'] = kmeans.labels_

# Identify the less engaged cluster
# This could be the cluster with the lowest average 'Dur. (ms)', 'Avg RTT DL (ms)', 'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', and 'Avg Bearer TP UL (kbps)'
# For example, if cluster 0 is the less engaged cluster
less_engaged_cluster = 0

# Get the centroid of the less engaged cluster
less_engaged_centroid = kmeans.cluster_centers_[less_engaged_cluster]

# Calculate the Euclidean distance between each user data point and the centroid of the less engaged cluster
df_experience['engagement_score'] = df_scaled.apply(lambda x: distance.euclidean(x, less_engaged_centroid), axis=1)

# Print the DataFrame with the engagement scores
print(df_experience)

           Dur. (ms)  Avg RTT DL (ms)  Avg RTT UL (ms)  \
0       7.060759e-03        -0.188482    -2.279256e-01   
1       7.060759e-03         0.731606    -2.279256e-01   
2       7.060759e-03         0.000000    -4.744955e-16   
3       7.060759e-03         0.000000    -4.744955e-16   
4       7.060759e-03         0.000000    -4.744955e-16   
...              ...              ...              ...   
149996 -9.072880e-02        -0.756899    -2.142365e+00   
149997  2.062895e-01        -1.109733    -9.685318e-01   
149998  2.107973e-01        -0.139113    -6.322041e-02   
149999  2.053184e-01        -0.453938    -2.279256e-01   
150000  2.815807e-15         0.000000    -4.744955e-16   

        Avg Bearer TP DL (kbps)  Avg Bearer TP UL (kbps)  cluster  \
0                     -0.960585                -0.733343        0   
1                     -1.068281                -0.979640        0   
2                     -1.345392                -1.458541        0   
3                     -0.76