# Data Exploration & Visualization

## Importing Libraries

In [None]:
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import numpy as np

## Reading in Data

In [None]:
# Folder holding the raw dataset CSV files (absolute path on the author's machine).
DATASETS_FOLDER_PATH = "C:/Users/Killian/Desktop/FYP-Multistage-Throughput-Predictor/Datasets/Raw/"
raw_data_path = f"{DATASETS_FOLDER_PATH}all_4G_data.csv"

# Load the combined 4G CSV; index_col=None keeps pandas' default integer index.
raw_data = pd.read_csv(filepath_or_buffer=raw_data_path, index_col=None)

## Formatting Data

In [None]:
# Cells holding a bare "-" are replaced with NaN so they count as missing values
# (and so the numeric columns can be cast to float further down).
raw_data = raw_data.replace({"-":np.nan})

In [None]:
# Show the dtype pandas inferred for each column, before any explicit casting.
raw_data.dtypes

In [None]:
# Timestamps are stored as strings such as "2018.01.31_12.30.59"; parse them
# with an explicit format so pandas does not have to guess the layout.
timestamp_format = "%Y.%m.%d_%H.%M.%S"
raw_data["Timestamp"] = pd.to_datetime(raw_data["Timestamp"], format=timestamp_format)

In [None]:
# Columns that must be numeric for the later histograms and imputation steps.
float_columns = [
    "RSRQ",
    "SNR",
    "CQI",
    "RSSI",
    "NRxRSRP",
    "NRxRSRQ",
    "ServingCell_Distance",
    "ServingCell_Lat",
    "ServingCell_Lon",
]
raw_data[float_columns] = raw_data[float_columns].astype(float)

In [None]:
# Re-display the dtypes to confirm the Timestamp and float conversions above took effect.
raw_data.dtypes

# Dataset Understanding

While the dataset contains many potentially valuable features, it is important to note how the dataset was constructed.
Most of the columns were collected directly from the device using the G-NetTrack app for Android.
The following columns are widely reported by the G-NetTrack app, so their measurements are consistently available.
These include:
 - Longitude
 - Latitude
 - Speed
 - Operatorname
 - NetworkMode
 - RSRP
 - RSRQ (VERY FEW DEVICES SUPPORT REPORTING THIS VALUE)
 - SNR
 - CQI
 - DL_bitrate
 - UL_bitrate
 - State
 - NRxRSRP (Many missing values)
 - NRxRSRQ (Many missing values)

NRxRSRP and NRxRSRQ may not satisfy the missing-at-random assumption: NaN may be reported precisely because there are no other serving towers nearby.

## Checking For Missing Values

In [None]:
# missingno bar chart: one bar per column summarising how complete that column is.
msno.bar(raw_data)

In [None]:
# missingno nullity matrix: shows where in the row order each column has gaps.
msno.matrix(raw_data)

In [None]:
# Order rows by RSSI so rows with missing RSSI end up together (sort_values
# places NaN last by default); gaps that co-occur in other columns then line up
# in the nullity matrix. sort_values already returns a new frame, so the
# explicit full-frame copy the cell used to make was redundant.
rssi_ordered_raw_data = raw_data.sort_values("RSSI")
msno.matrix(rssi_ordered_raw_data)

- There seems to be a correlation between missing values for SNR, CQI, RSSI, ServingCell_Lon, ServingCell_Lat and ServingCell_Distance.
- When the above features are missing, it is likely that a nearby tower exists, as seen from the consistent values for NRxRSRP and NRxRSRQ.
- It seems that these features stop being reported at the edge of a serving cell's range, where the mobile device is likely swapping serving cells.

## Examining values when servingcell is possibly changing

In [None]:
# edge_values = raw_data[raw_data["RSSI"].isna()]
# mean_dl = round(edge_values["DL_bitrate"].mean(), 2)
# var_dl = round(edge_values["DL_bitrate"].var(), 2)
# sd_dl = round(edge_values["DL_bitrate"].std(), 2)
# print("Possible transition phase mean:",mean_dl)
# print("Standard dev:", sd_dl)
# print("Variance:", var_dl, "\n---------------\n")
# global_mean = round(raw_data["DL_bitrate"].mean(), 2)
# global_var = round(raw_data["DL_bitrate"].var(), 2)
# global_std = round(raw_data["DL_bitrate"].std(), 2)
# print("Global mean:", global_mean)
# print("Global std:", global_std)
# print("Global variance:", global_var)

# print(edge_values["CQI"].max())

# Impute missing SNR with the column minimum (the worst observed value).
# Assign the result back to the column: calling fillna(..., inplace=True) on
# raw_data["SNR"] is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves raw_data unchanged once Copy-on-Write is enabled.
raw_data["SNR"] = raw_data["SNR"].fillna(raw_data["SNR"].min())
print("In SNR place")
# Sanity checks: remaining NaN count (expected 0) and the minimum SNR.
print(raw_data["SNR"].isnull().sum())
print(raw_data["SNR"].min())

In [None]:
# Summary statistics of download bitrate (divided by 1024) for "train" rows only.
is_train = raw_data["movement_type"] == "train"
mb = raw_data.loc[is_train, "DL_bitrate"] / 1024

global_mean = round(mb.mean(), 2)
global_std = round(mb.std(), 2)
max_valu = round(mb.max(), 2)
med = round(mb.median(), 2)

# Same labels and order as before: mean, std, median, max, then a blank line.
for label, value in (("mean", global_mean), ("std", global_std), ("med", med), ("max", max_valu)):
    print(label, value)
print("")


## Takeaways for Imputation

- SNR, CQI and RSSI are only missing when the serving cell is being changed. In these situations the minimum (worst-quality) value should be imputed.
- Serving cell distance should have the maximum distance imputed. However, this feature is rarely reported by the device anyway and should probably be excluded.
- NRxRSRP and NRxRSRQ are far more likely to be missing when the mobile device has a solid connection to its serving cell. As such, it possibly makes sense to impute them based solely on data where RSSI is not NaN.

## Distribution of Columns

In [None]:
# One histogram per numeric column, drawn on a single 20x20-inch figure.
raw_data.hist(figsize=(20,20))

In [None]:
# 2x2 grid showing one randomly chosen session per movement type.
fig, ((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2, ncols=2)
fig.tight_layout(pad=6.0)
axes = [ax1,ax2,ax3,ax4]
movement_types = raw_data["movement_type"].unique()
accum = 0
# At most one panel per axis: stop once either the movement types or the four axes run out.
while accum < len(movement_types) and accum < len(axes):
    i = movement_types[accum]
    ax = axes[accum]
    movement = raw_data.loc[raw_data["movement_type"] == i]
    # NOTE: sample() is unseeded, so a different session is drawn on every run.
    random_trace = movement["session"].sample().item()
    trace_mbps = movement.loc[movement["session"] == random_trace, "DL_bitrate"] / 1024
    ax.plot(trace_mbps)
    ax.set(
        ylabel="Bitrate (Mbps)",
        xlabel="Time (seconds)",
        title=f"Sample from {i.capitalize()}",
    )
    ax.set_xticks([])
    accum += 1


# train

In [None]:
# Histogram of download bitrate divided by 1024 (labelled Mbps). Only values
# inside `range` are binned; anything above 100000 raw units is ignored by hist.
fig, ax = plt.subplots()
ax.hist(raw_data["DL_bitrate"]/1024, bins=200, range=(0, 100000/1024))
# Label typo fixed: "Oberservations" -> "Observations".
ax.set_ylabel("Sample Observations")
ax.set_xlabel("Bitrate (Mbps)")
ax.set_title("Histogram of DL_throughput")

In [None]:
# Group the rows by session so each trace can be processed on its own.
traces = raw_data.groupby("session")
for name, trace in traces:
    # TODO: the per-trace analysis was never written. The loop originally had no
    # body, which is a syntax error; `pass` keeps the cell runnable under Run All.
    pass

In [None]:
# Distribution of RSRQ across all rows (NaN entries were introduced earlier for "-").
fig, ax = plt.subplots()
ax.hist(raw_data["RSRQ"], bins=50)
# Label typo fixed: "Oberservations" -> "Observations".
ax.set_ylabel("Sample Observations")
ax.set_xlabel("RSRQ")
ax.set_title("Histogram of RSRQ")

In [None]:
# Distribution of NRxRSRP across all rows.
fig, ax = plt.subplots()
ax.hist(raw_data["NRxRSRP"], bins=50)
# Label typo fixed: "Oberservations" -> "Observations".
ax.set_ylabel("Sample Observations")
ax.set_xlabel("NRxRSRP")
ax.set_title("Histogram of NRxRSRP")

In [None]:
# Number of rows recorded for each session, in groupby order (sorted session keys).
groups = raw_data.groupby("session")
counts = [len(group) for _, group in groups]

mean, median = np.mean(counts), np.median(counts)
# Despite the name, this is the POSITION within `counts` of the longest session
# (np.argmax returns an index), not the length itself.
max_length = np.argmax(counts)

In [None]:
# Line plot of rows per session, with the mean drawn as a dashed red line and
# added to the y-axis ticks.
fig, ax = plt.subplots()
# x positions are derived from the data; the previous hard-coded range(135)
# raised a shape mismatch whenever the number of sessions was not exactly 135.
ax.plot(range(len(counts)), counts)
ax.set_ylabel("Trace Length in Seconds")
ax.set_xlabel("Trace")
ax.set_ylim(0, 14000)
ax.axhline(mean, linestyle='--', color='r')
# Find where the mean belongs among the existing (ascending) ticks ...
y_ticks = ax.get_yticks()
accum = 0
for i in y_ticks:
    if i > mean:
        break
    accum += 1
# ... and splice it in so the dashed line gets its own labelled tick.
y_ticks = list(y_ticks[:accum]) + [mean] + list(y_ticks[accum:])
ax.set_yticks(y_ticks)
ax.set_title("Mean={} Mins, Median={} Mins".format(round(mean/60, 2),round(median/60, 2)))


In [None]:
# Display the argmax position of the longest session computed above.
max_length

In [None]:
# Movement type recorded for every row of session 116.
raw_data.loc[raw_data["session"] == 116, "movement_type"]

In [None]:
# Print the CellID column in ascending order to eyeball the range of cell identifiers.
print(raw_data["CellID"].sort_values())

## Time Series Analysis

### Static Instances

In [None]:
# Keep only the rows recorded while stationary. Filter first and copy the
# (smaller) result, rather than copying the whole frame and then filtering it.
static_data = raw_data[raw_data["movement_type"] == "static"].copy()

In [None]:
# Print the session id of every static row, sorted, to see which sessions are static.
print(static_data["session"].sort_values())

Graph for just one session

In [None]:
# Scatter of download throughput over time for static session 100.
session_rows = static_data.loc[static_data["session"] == 100]
x_points = session_rows["Timestamp"]
y_points = session_rows["DL_bitrate"]

fig, ax = plt.subplots()
ax.scatter(x_points, y_points)
ax.set(xlabel="Time", ylabel="Download Throughput in kbps")
plt.show()

In [None]:
# One scatter figure per session id in 100..114, drawn from the static subset.
for i in range(100,115):
    # Filter once per session instead of once per plotted column.
    session_rows = static_data[static_data["session"] == i]
    x_points = session_rows["Timestamp"]
    y_points = session_rows["DL_bitrate"]
    fig, ax = plt.subplots()
    ax.scatter(x_points, y_points)
    ax.set_xlabel("Time")
    # Unit label corrected from "bps": the rest of this notebook treats
    # DL_bitrate as kbps (it is divided by 1024 wherever Mbps is plotted).
    ax.set_ylabel("Download Throughput in kbps")
    ax.set_title(str(i))
    plt.show()

### Some Observations
- There is a large variance in the download speeds between sessions.
- Assuming the workload was the same in each instance, this could be due to sessions taking place in different locations or at different times (e.g. peak usage times might cause slower LTE speeds for a single user).
- There is no clear overall pattern between sessions.

## Attempting to Identify trends Associated with Particular Cell Towers or Locations

In [None]:
# Group rows by serving-cell coordinates; each distinct (lat, lon) pair is used
# below as one cell tower. Rows with NaN in either key are left out by groupby's
# default dropna=True.
cell_tower_groups = raw_data.copy().groupby(["ServingCell_Lat", "ServingCell_Lon"])

In [None]:
# Number of distinct (ServingCell_Lat, ServingCell_Lon) groups.
print(len(cell_tower_groups))

In [None]:
# Tag every row with an integer id for its serving-cell tower group.
# The original loop never incremented `i`, so every row got cell_tower == 0;
# enumerate() gives each (lat, lon) group its own id.
labelled_groups = []
for i, (name, group) in enumerate(cell_tower_groups):
    # assign() returns a new frame, so the group handed out by groupby is not mutated.
    labelled_groups.append(group.assign(cell_tower=i))

# A single concat avoids re-copying the accumulated frame on every iteration.
# pd.concat rejects an empty list, so fall back to an empty frame as before.
tower_grouped_data = pd.concat(labelled_groups) if labelled_groups else pd.DataFrame()

In [None]:
# Count of distinct session ids among rows that have serving-cell coordinates
# (NaN, if present, is counted once, matching pd.unique).
print(tower_grouped_data["session"].nunique(dropna=False))

In [None]:
# Scatter download throughput over time for every session served by a tower.
# The tower count is derived from the data rather than a hard-coded 132.
for i in range(tower_grouped_data["cell_tower"].nunique()):
    tower = tower_grouped_data[(tower_grouped_data["cell_tower"]==i)]
    for session in pd.unique(tower["session"].sort_values()):
        # Filter once per session instead of once per plotted column.
        session_rows = tower[tower["session"] == session]
        x_points = session_rows["Timestamp"]
        y_points = session_rows["DL_bitrate"]
        fig, ax = plt.subplots()
        ax.scatter(x_points, y_points)
        ax.set_xlabel("Time")
        # Unit label corrected from "bps": the rest of this notebook treats
        # DL_bitrate as kbps (it is divided by 1024 wherever Mbps is plotted).
        ax.set_ylabel("Download Throughput in kbps")
        # The title is the summed DL_bitrate for the session.
        ax.set_title(y_points.sum())
        plt.show()
    # Only the first tower is plotted; remove this break to plot all of them.
    break

## Feature Correlation Analysis

In [None]:
import configparser
import sys
import tensorflow as tf
import pandas as pd
from keras.callbacks import ModelCheckpoint, TensorBoard

# Read the project's .env file (INI format) to locate the project modules.
config = configparser.ConfigParser()
# Raw string: the original literal contained an unescaped "\D", which is an
# invalid escape sequence. The resulting path is unchanged.
env_file_path = r'C:\Users\Killian\Desktop\FYP-Multistage-Throughput-Predictor\.env'
# config.read() silently skips files it cannot open and returns the list of
# files it did read; fail here with a clear message instead of a later KeyError.
if not config.read(env_file_path):
    raise FileNotFoundError("Could not read config file: {}".format(env_file_path))
print(config.sections())

module_path = config['global']['MODULE_PATH']
project_path = config['global']['PROJECT_PATH']
# Make the project's own packages importable from this notebook.
sys.path.append(module_path)

from models.simple_LSTM import SimpleLSTM