In [1]:
import joblib
import pandas as pd
import plotly.express as px

In [2]:
# Load the raw 5G Quality-of-Service dataset
df = pd.read_csv("../data/raw/Quality of Service 5G.csv")

# Preview 5 random rows. The seed is fixed so the same rows are shown on every
# Restart & Run All, and the frame is left as the cell's last expression so the
# notebook renders it as a rich table instead of plain text.
df.sample(5, random_state=42)

          Timestamp   User_ID Application_Type Signal_Strength Latency  \
21   9/3/2023 10:00   User_22    Online_Gaming         -80 dBm   23 ms   
338  9/3/2023 10:05  User_339    Online_Gaming         -48 dBm   40 ms   
175  9/3/2023 10:03  User_176     Web_Browsing         -98 dBm   22 ms   
133  9/3/2023 10:02  User_134        Streaming         -75 dBm   30 ms   
275  9/3/2023 10:04  User_276        Streaming         -95 dBm   46 ms   

    Required_Bandwidth Allocated_Bandwidth Resource_Allocation  
21            2.2 Mbps            2.5 Mbps                 80%  
338           5.6 Mbps            5.9 Mbps                 80%  
175           0.5 Mbps            0.5 Mbps                 70%  
133           5.0 Mbps            5.7 Mbps                 85%  
275           2.8 Mbps            3.3 Mbps                 85%  


In [5]:
# Summary statistics for df.
# NOTE(review): the saved output below carries execution count 5, i.e. it was
# produced after the cleaning cells that follow (it shows space-separated
# column names and numeric statistics). On a fresh top-to-bottom run the
# columns are still raw strings at this point, so the result will differ --
# consider moving this cell below the cleaning cells.
df.describe()

Unnamed: 0,Signal Strength,Latency,Required Bandwidth,Allocated Bandwidth,Resource Allocation
count,400.0,400.0,400.0,400.0,400.0
mean,-80.495,33.825,3.135512,3.50238,74.7125
std,20.701119,21.122139,3.984097,4.460801,8.982291
min,-123.0,0.0,0.0,0.0,50.0
25%,-98.0,21.75,0.4175,0.4175,70.0
50%,-83.0,31.0,1.2,1.35,70.0
75%,-64.0,45.0,4.1,4.425,80.0
max,-40.0,110.0,14.5,15.8,90.0


In [3]:
# Column headers use "_" as a word separator; swap it for a space
df.columns = [header.replace('_', ' ') for header in df.columns]

# Apply the same clean-up to the values of the Application Type column
application_type = df['Application Type']
df['Application Type'] = application_type.str.replace('_', ' ')

In [4]:
# Timestamp and User ID are not used by anything further down this notebook
df = df.drop(columns=["Timestamp", "User ID"])

# Each of these columns stores a number followed by a fixed unit suffix;
# strip the suffix and cast what remains to float.
unit_suffix_by_column = {
    'Signal Strength': ' dBm',
    'Latency': ' ms',
    'Resource Allocation': '%',
}
for column_name, unit_suffix in unit_suffix_by_column.items():
    df[column_name] = df[column_name].str.replace(unit_suffix, '').astype(float)

def parse_bandwidth(bandwidth_str):
    """Convert a bandwidth reading to a float expressed in Mbps.

    Accepted inputs:
      * strings ending in ``Gbps``, ``Mbps``, ``Kbps`` or ``kbps``, with or
        without whitespace around the number (``"2.2 Mbps"``, ``"500Kbps"``);
      * unit-less numeric strings (``"3"``), taken to already be in Mbps;
      * non-string values (int, float, NaN), converted with ``float()`` so
        missing cells pass through as NaN instead of raising ``TypeError``.

    Unit matching is case-sensitive on purpose: ``"MBps"`` (megabytes) is not
    silently treated as ``"Mbps"`` (megabits).

    Returns:
        float: the bandwidth in Mbps.

    Raises:
        ValueError: if the numeric part cannot be parsed or the unit is
            not recognised.
    """
    # Numeric cells (including NaN for missing data) have no unit to strip.
    if not isinstance(bandwidth_str, str):
        return float(bandwidth_str)

    text = bandwidth_str.strip()

    # float() tolerates leftover whitespace between the number and the unit,
    # so slicing off the 4-character suffix is enough.
    if text.endswith('Gbps'):
        return float(text[:-4]) * 1000
    if text.endswith('Mbps'):
        return float(text[:-4])
    if text.endswith(('Kbps', 'kbps')):
        return float(text[:-4]) / 1000

    # No recognised unit: either a bare number or something unparseable,
    # in which case float() raises ValueError.
    return float(text)
    
# Run both bandwidth columns through parse_bandwidth, value by value
for bandwidth_col in ('Required Bandwidth', 'Allocated Bandwidth'):
    df[bandwidth_col] = df[bandwidth_col].apply(parse_bandwidth)

In [None]:
# Pairwise correlation between the numeric columns only
corr_matrix = df.corr(numeric_only=True)

# Visualize the correlation matrix using Plotly.
# Correlations live in [-1, 1]; pinning the colour range to that interval puts
# the neutral midpoint of the diverging RdBu scale at zero correlation and
# keeps colours comparable between runs, instead of stretching the scale to
# whatever min/max happens to be in the data.
fig = px.imshow(
    corr_matrix,
    text_auto=True,
    color_continuous_scale='RdBu',
    zmin=-1,
    zmax=1,
    title='Correlation Matrix'
)

fig.show()

In [None]:
# Exploratory plots start here

# How many rows fall under each application type
fig = px.histogram(
    df,
    x='Application Type',
    color='Application Type',
    nbins=30,
    title='Distribution of Application Types',
).update_layout(bargap=0.2, yaxis_title='Count')
fig.show()

In [None]:
# Required vs allocated bandwidth, coloured by application type
fig = (
    px.scatter(
        df,
        x='Required Bandwidth',
        y='Allocated Bandwidth',
        color='Application Type',
        title='Required vs Allocated Bandwidth',
    )
    .update_layout(
        xaxis_title='Required Bandwidth (Mbps)',
        yaxis_title='Allocated Bandwidth (Mbps)',
    )
    .update_xaxes(tickangle=90)
)
fig.show()

In [None]:
# Signal strength against latency, coloured by application type
axis_titles = dict(
    xaxis_title='Signal Strength (dBm)',
    yaxis_title='Latency (ms)',
)
fig = px.scatter(
    df,
    x='Signal Strength',
    y='Latency',
    color='Application Type',
    title='Signal Strength vs Latency',
)
fig.update_layout(**axis_titles)
fig.show()

In [None]:
# Box plot of latency, one box per application type
fig = (
    px.box(
        df,
        x='Application Type',
        y='Latency',
        color='Application Type',
        title='Latency Distribution by Application Type',
    )
    .update_layout(
        xaxis_title='Application Type',
        yaxis_title='Latency (ms)',
    )
    .update_xaxes(tickangle=45)
)
fig.show()

In [None]:
# Resource Allocation distribution
fig = px.histogram(
    df,
    x='Resource Allocation',
    nbins=30,
    title='Resource Allocation Distribution',
    color_discrete_sequence=['indianred'],
)
fig.update_layout(
    xaxis_title='Resource Allocation (%)',
    yaxis_title='Count',
    bargap=0.2,
)
# Render explicitly, like every other plotting cell in this notebook, rather
# than relying on the cell's trailing expression being echoed by Jupyter.
fig.show()

In [None]:
# Box plot of resource allocation, one box per application type
box_options = dict(
    x='Application Type',
    y='Resource Allocation',
    color='Application Type',
    title='Resource Allocation by Application Type',
)
fig = px.box(df, **box_options)
fig.update_layout(
    xaxis_title='Application Type',
    yaxis_title='Resource Allocation (%)',
)
fig.update_xaxes(tickangle=45)
fig.show()

In [None]:
# Resource Allocation vs Latency, coloured by application type
fig = (
    px.scatter(
        df,
        x='Resource Allocation',
        y='Latency',
        color='Application Type',
        title='Resource Allocation vs Latency',
    )
    .update_layout(
        xaxis_title='Resource Allocation (%)',
        yaxis_title='Latency (ms)',
    )
)
fig.show()

In [13]:
# Load the previously saved scaler (joblib is imported in the notebook's
# top import cell).
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts produced by this project, never files from an untrusted
# source.
scaler = joblib.load('../models/scaler.pkl')

In [14]:
# Inspect the per-feature maxima stored on the loaded scaler (presumably a
# fitted scikit-learn MinMaxScaler -- confirm). The saved output matches the
# "max" row of the raw df.describe() output earlier in this notebook.
scaler.data_max_

array([-40. , 110. ,  14.5,  15.8,  90. ])

In [15]:
# Inspect the per-feature minima stored on the loaded scaler (presumably a
# fitted scikit-learn MinMaxScaler -- confirm). The saved output matches the
# "min" row of the raw df.describe() output earlier in this notebook.
scaler.data_min_

array([-123.,    0.,    0.,    0.,   50.])

In [16]:
# Load the processed training data and summarise its columns.
# NOTE(review): the saved output reports 504 rows, versus 400 in the raw
# df.describe() output earlier -- presumably the data was resampled or
# augmented upstream; confirm.
train_df = pd.read_csv("../data/processed/train_data.csv")
train_df.describe()

Unnamed: 0,Application Type,Signal Strength,Latency,Required Bandwidth,Allocated Bandwidth,Resource Allocation
count,504.0,504.0,504.0,504.0,504.0,504.0
mean,4.055556,0.469664,0.454527,0.17087,0.17443,0.505456
std,2.952387,0.219526,0.271395,0.257842,0.264806,0.324654
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.325301,0.254545,0.005517,0.006329,0.25
50%,4.0,0.385542,0.409091,0.041379,0.037975,0.5
75%,6.0,0.60241,0.538636,0.248276,0.253165,0.75
max,10.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Split the columns once: "Application*" columns are left out of the inverse
# transform, everything else is passed back through the scaler.
application_cols = [name for name in train_df.columns if name.startswith("Application")]
numeric_cols = [name for name in train_df.columns if name not in application_cols]

# Undo the scaling; the result is handled as a plain array here
original_array = scaler.inverse_transform(train_df[numeric_cols])

# Wrap it back into a DataFrame with the matching column names
original_df = pd.DataFrame(original_array, columns=numeric_cols)

# Re-attach the Application* columns next to the restored values
final_df = pd.concat([original_df, train_df[application_cols]], axis=1)

final_df.describe()

Unnamed: 0,Signal Strength,Latency,Required Bandwidth,Allocated Bandwidth,Resource Allocation,Application Type
count,504.0,504.0,504.0,504.0,504.0,504.0
mean,-84.017857,49.998016,2.477619,2.755992,70.218254,4.055556
std,18.220695,29.853419,3.738704,4.183927,12.986152,2.952387
min,-123.0,0.0,0.0,0.0,50.0,0.0
25%,-96.0,28.0,0.08,0.1,60.0,1.0
50%,-91.0,45.0,0.6,0.6,70.0,4.0
75%,-73.0,59.25,3.6,4.0,80.0,6.0
max,-40.0,110.0,14.5,15.8,90.0,10.0
