In [None]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Add the local `script` directory to the import path, then open the database connection.
# NOTE(review): `dbconn` is a project module not visible here; presumably
# db_connection_psycopg() returns an open psycopg connection -- confirm in script/dbconn.py.
sys.path.append('script')
from script import dbconn
pgconn = dbconn.db_connection_psycopg()

In [None]:
# Fetch the `xdr_data` table from the PostgreSQL database and store it in raw_df.
# NOTE(review): assumes db_read_table_psycopg returns a pandas DataFrame -- later
# cells call DataFrame methods on raw_df; confirm against script/dbconn.py.
raw_df = dbconn.db_read_table_psycopg(pgconn,'xdr_data')

In [None]:
# Print the column dtypes and non-null counts (info must be called, not just referenced).
raw_df.info()

In [None]:
# Summary statistics of the numeric columns (describe must be called to produce the table).
raw_df.describe()

In [None]:
# Show the column labels of the raw table.
raw_df.columns

In [None]:
# Ten most frequent handset types across all xDR records.
handset_counts = raw_df['Handset Type'].value_counts()
top_10_handsets = handset_counts.iloc[:10]
print(top_10_handsets)

In [None]:
# Three most frequent handset manufacturers across all xDR records.
manufacturer_counts = raw_df['Handset Manufacturer'].value_counts()
top_3_manufacturers = manufacturer_counts.iloc[:3]
print(top_3_manufacturers)

In [None]:
# Five most frequent handset types for each of the three most frequent manufacturers.
top_3_manufacturers = raw_df['Handset Manufacturer'].value_counts().iloc[:3].index

for manufacturer in top_3_manufacturers:
    # Rows that belong to the current manufacturer.
    is_manufacturer = raw_df['Handset Manufacturer'] == manufacturer
    handset_types = raw_df.loc[is_manufacturer, 'Handset Type']
    top_5_handsets = handset_types.value_counts().iloc[:5]
    print(f"Top 5 handsets for {manufacturer}:")
    print(top_5_handsets, end="\n\n")

## Task 2.1

In [None]:
# Number of xDR sessions per user: non-null 'Bearer Id' values counted per MSISDN.
user_sessions = (
    raw_df.groupby('MSISDN/Number')['Bearer Id']
    .count()
    .rename('Number of xDR Sessions')
    .reset_index()
)
print(user_sessions)

In [None]:
# Total session duration per user: 'Dur. (ms)' summed per MSISDN.
user_session_duration = (
    raw_df.groupby('MSISDN/Number')['Dur. (ms)']
    .sum()
    .rename('Session Duration')
    .reset_index()
)
print(user_session_duration)

In [None]:
# Total download (DL) and upload (UL) bytes per user.
volume_columns = ['Total DL (Bytes)', 'Total UL (Bytes)']
user_data = (
    raw_df.groupby('MSISDN/Number')[volume_columns]
    .sum()
    .reset_index()
    .rename(columns={
        'Total DL (Bytes)': 'Total DL Data',
        'Total UL (Bytes)': 'Total UL Data',
    })
)
print(user_data)

In [None]:
# Total data volume (UL + DL bytes) per user.
per_user_volume = raw_df.groupby('MSISDN/Number')[['Total UL (Bytes)', 'Total DL (Bytes)']].sum()
total_volume = per_user_volume['Total UL (Bytes)'] + per_user_volume['Total DL (Bytes)']
user_session_data = total_volume.rename('Total Data Volume').reset_index()
print(user_session_data)

## Task 2.2

In [None]:
# Print column dtypes and non-null counts before cleaning.
raw_df.info()

In [None]:
# Missing-value count per column.
raw_df.isnull().sum()

In [None]:
# Summary statistics of the numeric columns (describe must be called to produce the table).
raw_df.describe()

In [None]:
#percent of missing data

def percent_missing(df):
    """Print the percentage of missing cells in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The result is printed, rounded to two decimals.
    """
    # DataFrame.size is rows * columns; it replaces np.product, which NumPy 2.0 removed.
    total_cells = df.size

    # Missing values summed over every column.
    total_missing = int(df.isnull().sum().sum())

    # An empty frame has no cells, so report 0 % instead of dividing by zero.
    percentage_missing = (total_missing / total_cells) * 100 if total_cells else 0.0

    print("The dataset contains", round(percentage_missing, 2), "%", "missing values.")

# Report the overall share of missing cells in the raw table.
percent_missing(raw_df)

In [None]:
#Identify and replace outliers and missing values with column mean

# A value is treated as an outlier when its |z-score| exceeds this threshold.
Z_THRESHOLD = 3

# Only numeric columns have a mean; text/datetime columns are handled separately.
# NOTE(review): identifier-like numeric columns (e.g. 'Bearer Id', 'MSISDN/Number')
# are imputed and capped as well -- confirm this is intended.
num_columns = raw_df.select_dtypes(include=[np.number]).columns

# Replace missing values with column mean (restricted to numeric columns so that
# DataFrame.mean does not raise on non-numeric data).
raw_df[num_columns] = raw_df[num_columns].fillna(raw_df[num_columns].mean())

# Identify and replace outliers with column mean
for col in num_columns:
    col_mean = raw_df[col].mean()
    col_std = raw_df[col].std()
    # A constant or all-NaN column has no usable z-score; leave it untouched.
    if not col_std > 0:
        continue
    z_scores = (raw_df[col] - col_mean) / col_std
    outliers = z_scores.abs() > Z_THRESHOLD
    # Series.mask returns a new column (upcasting if needed) and avoids chained assignment,
    # which is silently ineffective under pandas Copy-on-Write.
    raw_df[col] = raw_df[col].mask(outliers, col_mean)

# Verify missing values and outliers have been treated
missing_values_after_treatment = raw_df.isnull().sum()
print("Missing Values After Treatment:\n", missing_values_after_treatment)

In [None]:
# Percentage of missing values in each column.
missing_percent = raw_df.isnull().sum().div(len(raw_df)).mul(100)

# Columns where more than 30% of the values are missing are removed.
columns_to_drop = missing_percent.index[missing_percent > 30]
df_clean = raw_df.drop(columns=columns_to_drop)

# Report the resulting shape.
print("Shape of cleaned DataFrame:", df_clean.shape)

In [None]:
# Remaining missing-value count per column.
missing_values = raw_df.isnull().sum()
print(missing_values)

In [None]:
#Solving The rest of missing values
def fix_missing_ffill(df, col):
    """Forward-fill missing values of ``df[col]`` in place and return the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that owns the column; ``df[col]`` is overwritten with the filled values.
    col : str
        Label of the column to fill.

    Returns
    -------
    pandas.Series
        The filled column. Leading missing values stay missing because there is
        no earlier value to propagate.
    """
    # Series.ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated.
    df[col] = df[col].ffill()
    return df[col]

# Forward-fill the columns that the mean imputation could not handle.
for column_name in ('Start', 'End', 'Last Location Name'):
    raw_df[column_name] = fix_missing_ffill(raw_df, column_name)

missing_values = raw_df.isnull().sum()
print(missing_values)

In [None]:

# Basic metrics: most are rows of describe(); the mode is the first row of DataFrame.mode().
metrics = raw_df.describe()
mean, median = metrics.loc['mean'], metrics.loc['50%']
mode = raw_df.mode().iloc[0]
minimum, maximum = metrics.loc['min'], metrics.loc['max']
std_deviation = metrics.loc['std']

# Print every metric under its heading.
metric_report = (
    ("Mean:\n", mean),
    ("\nMedian:\n", median),
    ("\nMode:\n", mode),
    ("\nMinimum:\n", minimum),
    ("\nMaximum:\n", maximum),
    ("\nStandard Deviation:\n", std_deviation),
)
for heading, values in metric_report:
    print(heading, values)


In [None]:
# Non-graphical univariate analysis: dispersion parameters of every quantitative variable.

# Keep only the numeric columns.
quantitative_vars = raw_df.select_dtypes(include=[np.number])

# One row per statistic, one column per variable.
summary_functions = ['mean', 'median', 'std', 'min', 'max', 'var']
dispersion_parameters = quantitative_vars.agg(summary_functions)

print("Dispersion Parameters:\n", dispersion_parameters)

In [None]:
#a Graphical Univariate Analysis for each variable

# Count plots are limited to the most frequent values so that columns with very
# many distinct values (timestamps, location names, ...) stay readable and fast.
MAX_CATEGORIES = 20

# Select variables in the dataset
variables = raw_df.columns

for variable in variables:
    column = raw_df[variable]
    fig, ax = plt.subplots(figsize=(8, 6))
    # is_numeric_dtype covers every integer/float width, not only int64/float64;
    # booleans are numeric in pandas but are better shown as counts.
    if pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
        # For numeric variables (continuous or discrete)
        sns.histplot(data=raw_df, x=variable, kde=True, ax=ax)
        ax.set_title(f'Distribution of {variable}')
        ax.set_xlabel(variable)
        ax.set_ylabel('Frequency')
    else:
        # For categorical variables: plot only the most frequent levels
        top_levels = column.value_counts().index[:MAX_CATEGORIES]
        sns.countplot(data=raw_df, x=variable, order=top_levels, ax=ax)
        ax.set_title(f'Count of {variable} (top {len(top_levels)} values)')
        ax.set_xlabel(variable)
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', rotation=90)
    plt.show()
    # Release the figure; one figure per column otherwise accumulates in memory.
    plt.close(fig)

In [None]:
# Correlation analysis between the per-application download volumes.
variables = [
    'Social Media DL (Bytes)',
    'Google DL (Bytes)',
    'Email DL (Bytes)',
    'Youtube DL (Bytes)',
    'Netflix DL (Bytes)',
    'Gaming DL (Bytes)',
    'Other DL (Bytes)',
]

# Restrict the table to the selected variables and correlate them pairwise.
subset_df = raw_df.loc[:, variables]
correlation_matrix = subset_df.corr()

print("Correlation Matrix:", correlation_matrix, sep="\n")

In [None]:
#Dimensionality Reduction

# StandardScaler and PCA come from the notebook's top import cell.

# Per-application download volumes used as PCA input
variables = ['Social Media DL (Bytes)', 'Google DL (Bytes)', 'Email DL (Bytes)',
             'Youtube DL (Bytes)', 'Netflix DL (Bytes)', 'Gaming DL (Bytes)',
             'Other DL (Bytes)']

# Subset the DataFrame with the selected variables
subset_df = raw_df[variables]

# Standardize the data so that every variable contributes with unit variance
scaler = StandardScaler()
scaled_data = scaler.fit_transform(subset_df)

# Perform PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)

# Keep the original row index so the components can be joined back onto raw_df
pc_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'], index=subset_df.index)

# Explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

# Interpretation of the results
# NOTE(review): the last two sentences are fixed text, not derived from the fitted
# loadings -- check pca.components_ before relying on them.
print("Interpretation of PCA results:")
print("- The first principal component (PC1) explains", round(explained_variance_ratio[0] * 100, 2), "% of the variance in the data.")
print("- The second principal component (PC2) explains", round(explained_variance_ratio[1] * 100, 2), "% of the variance in the data.")
print("- PC1 captures the most significant patterns and trends in the data, such as overall data usage level.")
print("- PC2 captures additional variation that is orthogonal to PC1 and represents specific usage patterns or differences between the applications.")

## Task 3 - User Engagement Analysis

In [None]:
# Top 10 customers per engagement metric.

engagement_metrics = [
    'Social Media DL (Bytes)',
    'Google DL (Bytes)',
    'Email DL (Bytes)',
    'Youtube DL (Bytes)',
    'Netflix DL (Bytes)',
    'Gaming DL (Bytes)',
    'Other DL (Bytes)',
]

# Per-customer totals of every metric, plus their row-wise sum as 'Total Engagement'.
grouped_df = (
    raw_df.groupby('MSISDN/Number')[engagement_metrics]
    .sum()
    .assign(**{'Total Engagement': lambda totals: totals.sum(axis=1)})
)

# For each metric, show the ten customers with the largest total.
for metric in engagement_metrics:
    top_10_customers = grouped_df.nlargest(10, metric)
    print(f"Top 10 customers for {metric}:", top_10_customers, "", sep="\n")

In [None]:
#Normalize each engagement metric and run a k-means (k=3) to classify customers in three groups of engagement.
# MinMaxScaler and KMeans come from the notebook's top import cell.

# Engagement metrics
engagement_metrics = ['Social Media DL (Bytes)', 'Google DL (Bytes)', 'Email DL (Bytes)',
                      'Youtube DL (Bytes)', 'Netflix DL (Bytes)', 'Gaming DL (Bytes)',
                      'Other DL (Bytes)']

# Select the engagement metrics from the DataFrame
# NOTE(review): this clusters individual rows of raw_df, not per-customer
# aggregates -- confirm that row-level clustering is intended.
engagement_data = raw_df[engagement_metrics]

# Normalize the engagement data using Min-Max scaling
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(engagement_data)

# Perform k-means clustering with k=3; the fixed seed keeps the cluster labels
# stable between runs, matching the later clustering cells.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(normalized_data)

# Assign the cluster labels to the original DataFrame
raw_df['Engagement Cluster'] = kmeans.labels_

# Print the number of rows in each engagement cluster
print(raw_df['Engagement Cluster'].value_counts())

# Mean of every (non-normalized) metric per cluster
cluster_stats = raw_df.groupby('Engagement Cluster')[engagement_metrics].mean()
print(cluster_stats)

In [None]:
#the minimum, maximum, average & total non-normalized metrics for each cluster
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

# Engagement metrics
engagement_metrics = ['Social Media DL (Bytes)', 'Google DL (Bytes)', 'Email DL (Bytes)',
                      'Youtube DL (Bytes)', 'Netflix DL (Bytes)', 'Gaming DL (Bytes)',
                      'Other DL (Bytes)']

# Statistics reported per cluster
cluster_statistics = ['min', 'max', 'mean', 'sum']

# Select the engagement metrics from the DataFrame
engagement_data = raw_df[engagement_metrics]

# Normalize the engagement data using Min-Max scaling
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(engagement_data)

# Perform k-means clustering with k=3 (run once; the fit is deterministic for a fixed seed)
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(normalized_data)

# Assign the cluster labels to the original DataFrame
raw_df['Engagement Cluster'] = kmeans.labels_

# Compute non-normalized metrics for each cluster.
# Columns form a (metric, statistic) MultiIndex.
cluster_stats = raw_df.groupby('Engagement Cluster')[engagement_metrics].agg(cluster_statistics)

# Visualize the results: one panel per statistic, because min and sum differ by
# orders of magnitude and a single 28-bar chart cannot be labelled correctly.
fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharex=True)
for ax, statistic in zip(axes.ravel(), cluster_statistics):
    cluster_stats.xs(statistic, axis=1, level=1).plot(kind='bar', ax=ax)
    ax.set_title(f'{statistic.capitalize()} per Engagement Cluster')
    ax.set_xlabel('Engagement Cluster')
    ax.set_ylabel('Bytes')
fig.suptitle('Metrics for Each Engagement Cluster')
fig.tight_layout()
plt.show()

# Print the cluster statistics
print(cluster_stats)

In [None]:
# Aggregate total traffic per application for each user and list the ten most
# engaged users of every application.

application_columns = [
    'Social Media DL (Bytes)',
    'Google DL (Bytes)',
    'Email DL (Bytes)',
    'Youtube DL (Bytes)',
    'Netflix DL (Bytes)',
    'Gaming DL (Bytes)',
    'Other DL (Bytes)',
]

# Total traffic per application for each user
user_traffic = raw_df.groupby('MSISDN/Number')[application_columns].sum()

# One column per application holding the identifiers of its ten heaviest users
top_10_users_per_app = pd.DataFrame({
    column: user_traffic.nlargest(10, column).index
    for column in application_columns
})

print("Top 10 most engaged users per application:")
print(top_10_users_per_app)

In [None]:
# Plot the three applications with the largest total download traffic.

application_columns = [
    'Social Media DL (Bytes)',
    'Google DL (Bytes)',
    'Email DL (Bytes)',
    'Youtube DL (Bytes)',
    'Netflix DL (Bytes)',
    'Gaming DL (Bytes)',
    'Other DL (Bytes)',
]

# Total traffic per application over all rows, then the three largest totals
total_traffic = raw_df[application_columns].sum()
top_3_applications = total_traffic.nlargest(3)

# Bar chart of the three totals
fig, ax = plt.subplots()
ax.bar(top_3_applications.index, top_3_applications.values)
ax.set_xlabel('Application')
ax.set_ylabel('Total Traffic')
ax.set_title('Top 3 Most Used Applications')
plt.show()

In [None]:
# Elbow method: fit k-means for k = 1..10 on the normalized engagement metrics
# and plot the inertia of every fit to choose k.

engagement_metrics = [
    'Social Media DL (Bytes)',
    'Google DL (Bytes)',
    'Email DL (Bytes)',
    'Youtube DL (Bytes)',
    'Netflix DL (Bytes)',
    'Gaming DL (Bytes)',
    'Other DL (Bytes)',
]

# Min-Max scale the selected metrics
engagement_data = raw_df[engagement_metrics]
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(engagement_data)

# Inertia (within-cluster sum of squared distances) for every candidate k
k_values = range(1, 11)
inertia_values = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(normalized_data)
    inertia_values.append(kmeans.inertia_)

# Elbow curve
fig, ax = plt.subplots()
ax.plot(k_values, inertia_values, marker='o')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Curve')
plt.show()


## Task 4 - Experience Analytics

In [None]:
# Missing-value count per column before the experience analysis.
raw_df.isnull().sum()

In [None]:
# Mean TCP downlink retransmission volume per customer.
retransmission_by_customer = raw_df.groupby('MSISDN/Number')['TCP DL Retrans. Vol (Bytes)']
customer_avg_retransmission = retransmission_by_customer.agg('mean')

print(customer_avg_retransmission)

In [None]:
# Mean downlink round-trip time per customer.
rtt_by_customer = raw_df.groupby('MSISDN/Number')['Avg RTT DL (ms)']
customer_avg_rtt = rtt_by_customer.agg('mean')

print(customer_avg_rtt)

In [None]:
# Handset type per customer: the first non-null value seen for each MSISDN.
handset_by_customer = raw_df.groupby('MSISDN/Number')['Handset Type']
customer_handset_type = handset_by_customer.first()

print(customer_handset_type)

In [None]:
# Mean downlink bearer throughput per customer.
throughput_by_customer = raw_df.groupby('MSISDN/Number')['Avg Bearer TP DL (kbps)']
customer_avg_throughput = throughput_by_customer.agg('mean')

print(customer_avg_throughput)

In [None]:
# Ten largest, ten smallest and ten most frequent TCP DL retransmission values.
tcp_retransmission = raw_df['TCP DL Retrans. Vol (Bytes)']

top_tcp_values = tcp_retransmission.nlargest(10)
print("Top 10 TCP Values:", top_tcp_values, sep="\n")

bottom_tcp_values = tcp_retransmission.nsmallest(10)
print("\nBottom 10 TCP Values:", bottom_tcp_values, sep="\n")

most_frequent_tcp_values = tcp_retransmission.value_counts().head(10)
print("\nMost Frequent TCP Values:", most_frequent_tcp_values, sep="\n")

In [None]:
# Ten largest, ten smallest and ten most frequent downlink RTT values.
rtt_downlink = raw_df['Avg RTT DL (ms)']

top_rtt_values = rtt_downlink.nlargest(10)
print("Top 10 RTT Values:", top_rtt_values, sep="\n")

bottom_rtt_values = rtt_downlink.nsmallest(10)
print("\nBottom 10 RTT Values:", bottom_rtt_values, sep="\n")

most_frequent_rtt_values = rtt_downlink.value_counts().head(10)
print("\nMost Frequent RTT Values:", most_frequent_rtt_values, sep="\n")

In [None]:
# Ten largest, ten smallest and ten most frequent downlink throughput values.
throughput_downlink = raw_df['Avg Bearer TP DL (kbps)']

top_throughput_values = throughput_downlink.nlargest(10)
print("Top 10 Throughput Values:", top_throughput_values, sep="\n")

bottom_throughput_values = throughput_downlink.nsmallest(10)
print("\nBottom 10 Throughput Values:", bottom_throughput_values, sep="\n")

most_frequent_throughput_values = throughput_downlink.value_counts().head(10)
print("\nMost Frequent Throughput Values:", most_frequent_throughput_values, sep="\n")

In [None]:
# Average downlink throughput per handset type.

# Mean throughput of all rows sharing a handset type
throughput_by_handset = raw_df.groupby('Handset Type')['Avg Bearer TP DL (kbps)']
grouped_df = throughput_by_handset.agg('mean')

print("Distribution of Average Throughput per Handset Type:", grouped_df, sep="\n")

In [None]:
# Average TCP downlink retransmission volume per handset type.
retransmission_by_handset = raw_df.groupby('Handset Type')['TCP DL Retrans. Vol (Bytes)']
grouped_df = retransmission_by_handset.agg('mean')

print("Average TCP Retransmission View per Handset Type:", grouped_df, sep="\n")

In [None]:
#A k-means clustering (where k = 3) to segment users into groups
# StandardScaler and KMeans come from the notebook's top import cell.

# Select the relevant experience metrics for clustering
selected_columns = ['Avg RTT DL (ms)', 'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
                    'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)']

# Create a new DataFrame with the selected columns
metrics_df = raw_df[selected_columns]

# Perform feature scaling
scaler = StandardScaler()
scaled_metrics = scaler.fit_transform(metrics_df)

# Set the number of clusters (k)
k = 3

# Apply k-means clustering
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(scaled_metrics)

# Get the cluster labels assigned to each data point
cluster_labels = kmeans.labels_

# Add the cluster labels to the original DataFrame
raw_df['Cluster'] = cluster_labels

# Describe each cluster from its actual mean metrics. K-means label numbers are
# arbitrary, so fixed texts such as "cluster 0 = high RTT" cannot be trusted.
cluster_profile = raw_df.groupby('Cluster')[selected_columns].mean()
cluster_descriptions = {
    cluster: ", ".join(f"mean {metric} = {value:,.2f}" for metric, value in profile.items())
    for cluster, profile in cluster_profile.iterrows()
}

# Print the count of rows in each cluster
print("Cluster Counts:")
print(raw_df['Cluster'].value_counts())

# Print the description of each cluster
print("\nCluster Descriptions:")
for cluster, description in cluster_descriptions.items():
    print(f"Cluster {cluster}: {description}")

## Task 5 - Satisfaction Analysis

In [None]:
#Engagement score to each user.
# `distance` (scipy.spatial) comes from the notebook's top import cell.

# Columns that describe engagement
engagement_features = ['Dur. (ms)', 'Activity Duration DL (ms)', 'Activity Duration UL (ms)']

# Select the relevant columns for calculating the engagement score
engagement_df = raw_df[['Bearer Id'] + engagement_features]

# Placeholder reference points for the less engaged cluster
# TODO: replace these made-up values with the centroid of the least engaged cluster.
less_engaged_cluster_df = pd.DataFrame({
    'Dur. (ms)': [100, 200, 300],
    'Activity Duration DL (ms)': [150, 250, 350],
    'Activity Duration UL (ms)': [120, 220, 320]
})

# Euclidean distance from every row to every reference point, computed in one
# vectorised call instead of nested row loops; shape is (rows, reference points).
reference_distances = distance.cdist(
    engagement_df[engagement_features].to_numpy(dtype=float),
    less_engaged_cluster_df[engagement_features].to_numpy(dtype=float),
)

# The engagement score is the distance to the nearest reference point
engagement_scores = reference_distances.min(axis=1)

# Add the engagement scores to the raw_df DataFrame
raw_df['Engagement Score'] = engagement_scores

# Show a sample instead of printing one line per row
print(raw_df[['Bearer Id', 'Engagement Score']].head(10))


In [None]:
#experience score to each user
# `distance` (scipy.spatial) comes from the notebook's top import cell.

# Columns used to calculate the experience score
experience_features = ['Avg RTT DL (ms)', 'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
                       'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)', 'Email DL (Bytes)', 'Email UL (Bytes)',
                       'Youtube DL (Bytes)', 'Youtube UL (Bytes)', 'Netflix DL (Bytes)', 'Netflix UL (Bytes)',
                       'Gaming DL (Bytes)', 'Gaming UL (Bytes)', 'Other DL (Bytes)']

# Independent copy, so adding a column below does not write into a slice of raw_df
experience_df = raw_df[['Bearer Id'] + experience_features].copy()

# Placeholder reference points for the worst experience cluster
# TODO: replace these made-up values with the centroid of the worst experience cluster.
worst_experience_cluster_df = pd.DataFrame({
    'Avg RTT DL (ms)': [100, 200, 300],
    'Avg RTT UL (ms)': [150, 250, 350],
    'Avg Bearer TP DL (kbps)': [50, 100, 150],
    'Avg Bearer TP UL (kbps)': [80, 120, 160],
    'TCP DL Retrans. Vol (Bytes)': [100000, 200000, 300000],
    'TCP UL Retrans. Vol (Bytes)': [120000, 220000, 320000],
    'Email DL (Bytes)': [500000, 600000, 700000],
    'Email UL (Bytes)': [550000, 650000, 750000],
    'Youtube DL (Bytes)': [200000000, 300000000, 400000000],
    'Youtube UL (Bytes)': [250000000, 350000000, 450000000],
    'Netflix DL (Bytes)': [150000000, 250000000, 350000000],
    'Netflix UL (Bytes)': [180000000, 280000000, 380000000],
    'Gaming DL (Bytes)': [1000000000, 2000000000, 3000000000],
    'Gaming UL (Bytes)': [1200000000, 2200000000, 3200000000],
    'Other DL (Bytes)': [800000000, 900000000, 1000000000]
})

# Euclidean distance from every row to every reference point (vectorised).
# Both sides are selected by column name so the feature order always matches.
# NOTE(review): features are not scaled, so byte-valued columns dominate the distance.
reference_distances = distance.cdist(
    experience_df[experience_features].to_numpy(dtype=float),
    worst_experience_cluster_df[experience_features].to_numpy(dtype=float),
)

# The experience score is the distance to the nearest reference point
experience_scores = reference_distances.min(axis=1)

# Store the score on both frames: later cells read raw_df['Experience Score']
experience_df['Experience Score'] = experience_scores
raw_df['Experience Score'] = experience_scores

# Print the experience scores
print(experience_df[['Bearer Id', 'Experience Score']])

In [None]:
# The top 10 satisfied customers
# `distance` (scipy.spatial) comes from the notebook's top import cell.

# Columns used for the experience part of the satisfaction score
satisfaction_features = ['Avg RTT DL (ms)', 'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
                         'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)', 'Email DL (Bytes)', 'Email UL (Bytes)',
                         'Youtube DL (Bytes)', 'Youtube UL (Bytes)', 'Netflix DL (Bytes)', 'Netflix UL (Bytes)',
                         'Gaming DL (Bytes)', 'Gaming UL (Bytes)', 'Other DL (Bytes)']

# Independent copy, so adding a column below does not write into a slice of raw_df
satisfaction_df = raw_df[['Bearer Id'] + satisfaction_features].copy()

# Placeholder reference points for the worst experience cluster
# TODO: replace these made-up values with the centroid of the worst experience cluster.
worst_experience_cluster_df = pd.DataFrame({
    'Avg RTT DL (ms)': [100, 200, 300],
    'Avg RTT UL (ms)': [150, 250, 350],
    'Avg Bearer TP DL (kbps)': [50, 100, 150],
    'Avg Bearer TP UL (kbps)': [80, 120, 160],
    'TCP DL Retrans. Vol (Bytes)': [100000, 200000, 300000],
    'TCP UL Retrans. Vol (Bytes)': [120000, 220000, 320000],
    'Email DL (Bytes)': [500000, 600000, 700000],
    'Email UL (Bytes)': [550000, 650000, 750000],
    'Youtube DL (Bytes)': [200000000, 300000000, 400000000],
    'Youtube UL (Bytes)': [250000000, 350000000, 450000000],
    'Netflix DL (Bytes)': [150000000, 250000000, 350000000],
    'Netflix UL (Bytes)': [180000000, 280000000, 380000000],
    'Gaming DL (Bytes)': [1000000000, 2000000000, 3000000000],
    'Gaming UL (Bytes)': [1200000000, 2200000000, 3200000000],
    'Other DL (Bytes)': [800000000, 900000000, 1000000000]
})

# Experience score: distance to the nearest worst-experience reference point (vectorised)
experience_score = distance.cdist(
    satisfaction_df[satisfaction_features].to_numpy(dtype=float),
    worst_experience_cluster_df[satisfaction_features].to_numpy(dtype=float),
).min(axis=1)

# Engagement score: the value stored on raw_df by the engagement-score cell
# (replaces the former constant 0 placeholder).
# NOTE(review): the two scores are on different scales (ms vs bytes) -- consider
# normalising them before averaging.
engagement_score = raw_df['Engagement Score'].to_numpy(dtype=float)

# Satisfaction score is the average of experience and engagement scores
satisfaction_scores = (experience_score + engagement_score) / 2

# Store the score on both frames: the regression cell needs it next to the features
satisfaction_df['Satisfaction Score'] = satisfaction_scores
raw_df['Satisfaction Score'] = satisfaction_scores

# Sort by satisfaction score, highest first
sorted_df = satisfaction_df.sort_values(by='Satisfaction Score', ascending=False)

# Get the top 10 satisfied customers
# NOTE(review): rows are identified by 'Bearer Id' (one row of raw_df each), not by customer.
top_10_satisfied_customers = sorted_df.head(10)

# Print the top 10 satisfied customers
print(top_10_satisfied_customers)


In [None]:
#regression model to predict the satisfaction score of a customer.
# train_test_split, RandomForestRegressor and mean_squared_error come from the
# notebook's top import cell.

# Input variables of the model
feature_columns = ['Avg RTT DL (ms)', 'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
                   'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)', 'Email DL (Bytes)', 'Email UL (Bytes)',
                   'Youtube DL (Bytes)', 'Youtube UL (Bytes)', 'Netflix DL (Bytes)', 'Netflix UL (Bytes)',
                   'Gaming DL (Bytes)', 'Gaming UL (Bytes)', 'Other DL (Bytes)']

# Take features and target from satisfaction_df (built in the satisfaction-score
# cell), which is the frame that holds the 'Satisfaction Score' column.
regression_df = satisfaction_df[feature_columns + ['Satisfaction Score']]

# Split the data into features (input variables) and target variable (satisfaction score)
features = regression_df[feature_columns]
target = regression_df['Satisfaction Score']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model using mean squared error (MSE)
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

# Example: predict the satisfaction score for a new customer (illustrative values)
new_customer = pd.DataFrame({
    'Avg RTT DL (ms)': [100],
    'Avg RTT UL (ms)': [150],
    'Avg Bearer TP DL (kbps)': [50],
    'Avg Bearer TP UL (kbps)': [80],
    'TCP DL Retrans. Vol (Bytes)': [100000],
    'TCP UL Retrans. Vol (Bytes)': [120000],
    'Email DL (Bytes)': [500000],
    'Email UL (Bytes)': [550000],
    'Youtube DL (Bytes)': [200000000],
    'Youtube UL (Bytes)': [250000000],
    'Netflix DL (Bytes)': [150000000],
    'Netflix UL (Bytes)': [180000000],
    'Gaming DL (Bytes)': [1000000000],
    'Gaming UL (Bytes)': [1200000000],
    'Other DL (Bytes)': [800000000]
})[feature_columns]  # same column order as the training features

predicted_score = model.predict(new_customer)
print(f"Predicted Satisfaction Score: {predicted_score}")

In [None]:
# k-means (k=2) on the engagement & the experience score

# Features used for clustering
score_columns = ['Engagement Score', 'Experience Score']

# Build a standalone frame from the two score columns. The experience score is
# read from experience_df, where the experience-score cell stores it; a fresh
# frame also avoids writing the cluster labels into a slice of raw_df.
engagement_experience_df = pd.DataFrame({
    'Engagement Score': raw_df['Engagement Score'],
    'Experience Score': experience_df['Experience Score'],
})

# Perform k-means clustering with k=2
# NOTE(review): the scores are not scaled, so the larger-valued one dominates the clustering.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(engagement_experience_df[score_columns])

# Get the cluster labels assigned by the k-means algorithm
cluster_labels = kmeans.labels_

# Add the cluster labels as a new column in the DataFrame
engagement_experience_df['Cluster'] = cluster_labels

# Visualize the clusters
plt.scatter(engagement_experience_df['Engagement Score'], engagement_experience_df['Experience Score'], c=cluster_labels)
plt.xlabel('Engagement Score')
plt.ylabel('Experience Score')
plt.title('K-means Clustering (k=2)')
plt.show()

In [None]:
#average satisfaction & experience score per cluster.

# Features used for clustering
score_columns = ['Engagement Score', 'Experience Score']

# Collect the three scores in one standalone frame. Each is read from the frame
# where its own cell stores it, so this cell does not need them on raw_df.
engagement_experience_df = pd.DataFrame({
    'Engagement Score': raw_df['Engagement Score'],
    'Experience Score': experience_df['Experience Score'],
    'Satisfaction Score': satisfaction_df['Satisfaction Score'],
})

# Perform k-means clustering with k=2 on the engagement and experience scores only
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(engagement_experience_df[score_columns])

# Get the cluster labels assigned by the k-means algorithm
cluster_labels = kmeans.labels_

# Add the cluster labels as a new column in the DataFrame
engagement_experience_df['Cluster'] = cluster_labels

# Average engagement, experience and satisfaction score per cluster
cluster_scores = engagement_experience_df.groupby('Cluster').mean()

print(cluster_scores)