In [None]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

In [None]:
# Connection settings for the PostgreSQL database that holds the `xdr_data` table.
database_name = 'processed_telecom'
table_name = 'xdr_data'

connection_params = {
    "host": "localhost",
    "user": "postgres",
    # Prefer the PGPASSWORD environment variable so the secret does not have to
    # live in the notebook. NOTE(review): the fallback is the original local
    # development password, kept only so existing setups keep working -- drop
    # it once PGPASSWORD is configured everywhere.
    "password": os.environ.get("PGPASSWORD", "00000000"),
    "port": "5432",
    "database": database_name,
}

# User and password are URL-escaped so characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
engine = create_engine(
    "postgresql+psycopg2://"
    f"{quote_plus(connection_params['user'])}:{quote_plus(connection_params['password'])}"
    f"@{connection_params['host']}:{connection_params['port']}/{connection_params['database']}"
)

# pd.read_sql accepts a str or a SQLAlchemy Selectable (select or text object).
# The table name comes from the constant above rather than being repeated here.
sql_query = f'SELECT * FROM {table_name}'

# Load the whole table into memory as a DataFrame.
df = pd.read_sql(sql_query, con=engine)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts, memory usage).
df.info()

Aggregate information per customer


In [None]:
# Cast the 'Handset Type' column to str in place on the frame.
handset_col = 'Handset Type'
df[handset_col] = df[handset_col].astype(str)

In [None]:
# Build one row per 'MSISDN/Number': the numeric metrics are averaged and the
# handset is reduced to its first modal value.
def _most_common(values):
    """Return the first modal value of a Series."""
    return values.mode()[0]


per_customer_spec = {
    'TCP DL Retrans. Vol (Bytes)': 'mean',
    'Avg RTT DL (ms)': 'mean',
    'Handset Type': _most_common,
    'Avg Bearer TP DL (kbps)': 'mean',
}

customer_groups = df.groupby('MSISDN/Number')
aggregated_df = customer_groups.agg(per_customer_spec).reset_index()
print(aggregated_df.head())


In [None]:
# Compute the 10 largest, 10 smallest and 10 most frequent values of
# 'TCP DL Retrans. Vol (Bytes)' across all rows of df.
tcp_col = 'TCP DL Retrans. Vol (Bytes)'
top_tcp_values = df[tcp_col].nlargest(10)
bottom_tcp_values = df[tcp_col].nsmallest(10)
most_frequent_tcp_values = df[tcp_col].value_counts().head(10)

# Show all three summaries; previously only the top values were printed and the
# other two results were computed but never displayed.
for label, values in (
    ("Top", top_tcp_values),
    ("Bottom", bottom_tcp_values),
    ("Most frequent", most_frequent_tcp_values),
):
    print(f"{label} TCP values:")
    print(values)

In [None]:
# Compute the 10 largest, 10 smallest and 10 most frequent values of
# 'Avg RTT DL (ms)' across all rows of df.
rtt_col = 'Avg RTT DL (ms)'
top_rtt_values = df[rtt_col].nlargest(10)
bottom_rtt_values = df[rtt_col].nsmallest(10)
most_frequent_rtt_values = df[rtt_col].value_counts().head(10)

# Show all three summaries; previously only the top values were printed and the
# other two results were computed but never displayed.
for label, values in (
    ("Top", top_rtt_values),
    ("Bottom", bottom_rtt_values),
    ("Most frequent", most_frequent_rtt_values),
):
    print(f"{label} RTT values:")
    print(values)

In [None]:
# Compute the 10 largest, 10 smallest and 10 most frequent values of
# 'Avg Bearer TP DL (kbps)' across all rows of df.
throughput_col = 'Avg Bearer TP DL (kbps)'
top_throughput_values = df[throughput_col].nlargest(10)
bottom_throughput_values = df[throughput_col].nsmallest(10)
most_frequent_throughput_values = df[throughput_col].value_counts().head(10)

# Show all three summaries; previously only the top values were printed and the
# other two results were computed but never displayed.
for label, values in (
    ("Top", top_throughput_values),
    ("Bottom", bottom_throughput_values),
    ("Most frequent", most_frequent_throughput_values),
):
    print(f"{label} Throughput values:")
    print(values)

In [None]:
# Mean of 'Avg Bearer TP DL (kbps)' for every distinct 'Handset Type'.
handset_groups = df.groupby('Handset Type')
throughput_distribution = handset_groups['Avg Bearer TP DL (kbps)'].agg('mean')
print(throughput_distribution)

In [None]:
# Mean of 'TCP DL Retrans. Vol (Bytes)' for every distinct 'Handset Type'.
handset_groups = df.groupby('Handset Type')
tcp_retransmission_per_handset = handset_groups['TCP DL Retrans. Vol (Bytes)'].agg('mean')
print(tcp_retransmission_per_handset)

In [None]:
# Perform a first k-means clustering on the three metrics in their raw
# (unscaled) units.
raw_feature_cols = ['Avg RTT DL (ms)', 'Avg Bearer TP DL (kbps)', 'TCP DL Retrans. Vol (Bytes)']

# KMeans rejects NaN input, so missing values are replaced by the column mean
# in a local copy; df's own metric columns are left untouched here.
raw_features = df[raw_feature_cols]
raw_features = raw_features.fillna(raw_features.mean())

# A fixed random_state makes the cluster labels reproducible across runs.
kmeans = KMeans(n_clusters=3, random_state=42)  # Choose an appropriate value of k
df['cluster'] = kmeans.fit_predict(raw_features)


In [None]:
# Impute, scale and cluster the three metrics, then describe each cluster.
feature_cols = ['Avg RTT DL (ms)', 'Avg Bearer TP DL (kbps)', 'TCP DL Retrans. Vol (Bytes)']

# Impute missing values with the per-column mean. The result is assigned back:
# calling fillna(inplace=True) on a `df[[...]]` selection only modifies a
# temporary copy and leaves df unchanged. The means are taken over the feature
# columns only, because df.mean() over the whole frame fails on non-numeric
# columns in recent pandas versions.
df[feature_cols] = df[feature_cols].fillna(df[feature_cols].mean())

# Scaling the data so each metric has zero mean and unit variance.
scaler = StandardScaler()
X = scaler.fit_transform(df[feature_cols])

# K-means clustering (fixed seed for reproducible labels)
kmeans = KMeans(n_clusters=3, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Description of clusters: per-cluster mean of every numeric column.
# numeric_only=True skips text columns such as 'Handset Type', which would
# otherwise make the mean raise.
cluster_descriptions = df.groupby('cluster').mean(numeric_only=True)

# Convert all values to strings (element-wise; replaces the deprecated applymap)
cluster_descriptions = cluster_descriptions.apply(lambda column: column.map(str))

print(cluster_descriptions)

In [None]:
# Show the dtype of each of the three metric columns.
metric_cols = ['Avg RTT DL (ms)', 'Avg Bearer TP DL (kbps)', 'TCP DL Retrans. Vol (Bytes)']
print(df[metric_cols].dtypes)
