In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [29]:
# Load the raw 5G Quality-of-Service measurements from the project's data folder.
df = pd.read_csv("../data/raw/Quality of Service 5G.csv")

# Preview 5 random rows; the fixed seed keeps the preview identical across re-runs.
print(df.sample(5, random_state=42))

# Column dtypes and non-null counts. DataFrame.info() writes its report to stdout
# itself and returns None, so it is called directly: wrapping it in print() would
# append a stray "None" line to the output.
df.info()
print("-" * 100)

# Summary statistics as computed by DataFrame.describe().
print(df.describe())

          Timestamp   User_ID Application_Type Signal_Strength Latency  \
196  9/3/2023 10:03  User_197       Video_Call         -62 dBm   28 ms   
32   9/3/2023 10:00   User_33     Web_Browsing         -88 dBm   27 ms   
20   9/3/2023 10:00   User_21       Video_Call         -77 dBm   31 ms   
373  9/3/2023 10:06  User_374  Video_Streaming        -110 dBm   55 ms   
150  9/3/2023 10:02  User_151  Video_Streaming         -82 dBm   30 ms   

    Required_Bandwidth Allocated_Bandwidth Resource_Allocation  
196          12.0 Mbps           13.3 Mbps                 75%  
32            1.3 Mbps            1.3 Mbps                 70%  
20             11 Mbps             13 Mbps                 75%  
373           1.3 Mbps            1.7 Mbps                 70%  
150           4.0 Mbps            4.4 Mbps                 70%  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------ 

In [30]:
# Make labels human-readable: swap every underscore in the column names for a space
df.columns = [name.replace('_', ' ') for name in df.columns]

# Same substitution for the values stored in the application-type column
app_col = 'Application Type'
df[app_col] = df[app_col].str.replace('_', ' ')

In [31]:
# Discard the columns treated as redundant for this analysis
df = df.drop(["Timestamp", "User ID"], axis="columns")

# Each measurement column is text with a unit suffix; strip the suffix and cast
# the remainder to float
for column, unit in (
    ('Signal Strength', ' dBm'),
    ('Latency', ' ms'),
    ('Resource Allocation', '%'),
):
    df[column] = df[column].str.replace(unit, '').astype(float)

def parse_bandwidth(bandwidth_str):
    """Convert a bandwidth reading to a float expressed in Mbps.

    Accepted inputs:
      * strings ending in a unit -- "12.0 Mbps", "500 Kbps", "1 Gbps". The unit
        is matched case-insensitively and the space before it is optional;
        surrounding whitespace is ignored.
      * unit-less numeric strings such as "3.5", taken to already be in Mbps.
      * non-string values (e.g. NaN for a missing cell, or an already-numeric
        reading), which are passed through ``float()`` unchanged.

    Parameters
    ----------
    bandwidth_str : str or number
        The raw cell value.

    Returns
    -------
    float
        The bandwidth in Mbps (Kbps values are divided by 1000, Gbps values
        are multiplied by 1000).

    Raises
    ------
    ValueError
        If the numeric part of the string cannot be parsed as a float.
    """
    # Missing values arrive as float NaN and numeric cells as int/float; a
    # substring test on those would raise TypeError, so convert them directly.
    if not isinstance(bandwidth_str, str):
        return float(bandwidth_str)

    text = bandwidth_str.strip()
    unit = text[-4:].lower()
    # All supported units are four characters long, so the number is whatever
    # precedes them; float() tolerates the optional separating whitespace.
    if unit == 'mbps':
        return float(text[:-4])
    if unit == 'kbps':
        return float(text[:-4]) / 1000
    if unit == 'gbps':
        return float(text[:-4]) * 1000
    # No recognised unit: treat the whole string as a number in Mbps.
    return float(text)
    
# Run parse_bandwidth over both bandwidth columns, replacing the text readings
# with the numeric values it returns
for bandwidth_col in ('Required Bandwidth', 'Allocated Bandwidth'):
    df[bandwidth_col] = df[bandwidth_col].apply(parse_bandwidth)

In [38]:
# Pairwise correlations, restricted to the numeric columns
corr_matrix = df.corr(numeric_only=True)

# Draw the matrix as a Plotly heatmap with each coefficient printed on its tile
fig = px.imshow(
    img=corr_matrix,
    title='Correlation Matrix',
    color_continuous_scale='RdBu',
    text_auto=True,
)
fig.show()

In [32]:
# Exploratory plots

# Row count per application type, one colored bar per type
fig = (
    px.histogram(
        df,
        x='Application Type',
        color='Application Type',
        nbins=30,
        title='Distribution of Application Types',
    )
    .update_layout(yaxis_title='Count', bargap=0.2)
)
fig.show()

In [33]:
# Required vs. allocated bandwidth, one point per row, colored by application type
fig = (
    px.scatter(
        df,
        x='Required Bandwidth',
        y='Allocated Bandwidth',
        color='Application Type',
        title='Required vs Allocated Bandwidth',
    )
    .update_layout(
        xaxis_title='Required Bandwidth (Mbps)',
        yaxis_title='Allocated Bandwidth (Mbps)',
    )
    # Rotate the x tick labels to vertical
    .update_xaxes(tickangle=90)
)
fig.show()

In [37]:
# Signal strength against latency, one point per row, colored by application type
fig = (
    px.scatter(
        df,
        x='Signal Strength',
        y='Latency',
        color='Application Type',
        title='Signal Strength vs Latency',
    )
    .update_layout(
        xaxis_title='Signal Strength (dBm)',
        yaxis_title='Latency (ms)',
    )
)
fig.show()

In [34]:
# Box plot of latency for each application type
fig = (
    px.box(
        df,
        x='Application Type',
        y='Latency',
        color='Application Type',
        title='Latency Distribution by Application Type',
    )
    .update_layout(
        xaxis_title='Application Type',
        yaxis_title='Latency (ms)',
    )
    # Slant the category labels
    .update_xaxes(tickangle=45)
)
fig.show()