In [None]:
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import numpy as np

## Reading In Data

In [None]:
# Folder holding the raw datasets, and the CSV file read from it.
DATASETS_FOLDER_PATH = "C:/Users/Killian/Desktop/FYP-Multistage-Throughput-Predictor/Datasets/Raw/"
raw_data_path = f"{DATASETS_FOLDER_PATH}all_4G_data.csv"

# index_col=None: no CSV column is used as the index (default integer index).
raw_data = pd.read_csv(
    raw_data_path,
    index_col=None,
)

## Formatting Data

In [None]:
# Treat the "-" placeholder as a missing value everywhere in the frame.
missing_value_markers = {"-": np.nan}
raw_data = raw_data.replace(missing_value_markers)

In [None]:
# Columns that are cast to float below.
float_columns = [
    "RSRQ", "SNR", "CQI", "RSSI", "NRxRSRP", "NRxRSRQ",
    "ServingCell_Distance", "ServingCell_Lat", "ServingCell_Lon",
]

# Timestamps follow the pattern YYYY.MM.DD_HH.MM.SS; parse them into datetimes.
raw_data["Timestamp"] = pd.to_datetime(
    raw_data["Timestamp"],
    format="%Y.%m.%d_%H.%M.%S",
)
raw_data[float_columns] = raw_data[float_columns].astype(float)

In [None]:
# Show the dtype of every column in the frame.
raw_data.dtypes

## Feature Explanation

While the dataset contains many potentially valuable features, it is important to note how the dataset was constructed.
Most of the columns were collected directly from the device using the G-NetTrack app for Android.
The following columns are widely reported by the G-NetTrack app, so their measurements are consistently available.
These include:
 - Longitude
 - Latitude
 - Speed
 - Operatorname
 - NetworkMode
 - RSRP
 - RSRQ
 - SNR
 - CQI
 - DL_bitrate
 - UL_bitrate
 - State
 - NRxRSRP (Many missing values)
 - NRxRSRQ (Many missing values)

NRxRSRP and NRxRSRQ may not satisfy the missing-at-random assumption: a NaN may be reported precisely because there are no other serving towers nearby.

## Exploring Missing Values

In [None]:
# Per-column bar chart of data completeness, leaving out "movement_type" and "session".
columns_for_bar = raw_data.drop(columns=["movement_type", "session"])
msno.bar(columns_for_bar)

In [None]:
# Fraction of missing entries per column: the mean of the boolean NaN mask.
missing_mask = raw_data.isna()
nan_prop = missing_mask.mean()
print(nan_prop)

In [None]:
# Sort a copy of the data by RSSI so the nullity matrix shows how the
# missing entries are laid out along increasing RSSI.
rssi_ordered_raw_data = raw_data.copy().sort_values(by="RSSI")
rssi_matrix_input = rssi_ordered_raw_data.drop(columns=["session", "movement_type"])
msno.matrix(rssi_matrix_input)

In [None]:
# Nullity heatmap over every column except "session" and "movement_type".
heatmap_input = raw_data.drop(columns=["session", "movement_type"])
msno.heatmap(heatmap_input)

## Distribution of Features

In [None]:
# Histograms of the frame's columns, drawn on a single 20x20 inch figure.
raw_data.hist(figsize=(20,20))

## Checking for Normality

In [None]:
from scipy.stats import shapiro
from statsmodels.graphics.gofplots import qqplot

In [None]:
def check_for_normality(dataframe=pd.DataFrame()):
    """Run a Shapiro-Wilk normality test on each numeric feature and print the verdict.

    Missing values are dropped per feature before testing, because NaNs
    propagate into the test and make its statistic and p-value meaningless.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain every column listed in ``numeric_features`` below.

    Returns
    -------
    None
        Results are printed, one section per feature.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from ``dataframe``.

    Notes
    -----
    The SciPy documentation warns that for more than 5000 samples the
    Shapiro-Wilk p-value may not be accurate.
    """
    numeric_features = ["SNR", "CQI", "RSSI", "NRxRSRP",
                        "NRxRSRQ", "RSRQ", "RSRP", "DL_bitrate", "UL_bitrate"]
    for feature in numeric_features:
        print("Column", feature)
        # Only the observed values are tested; NaNs would poison the result.
        values = dataframe[feature].dropna().to_numpy()
        if values.size < 3:
            # Shapiro-Wilk needs at least three observations.
            print('{} has fewer than 3 non-missing values, test skipped'.format(feature), "\n===============\n")
            continue
        stat, p = shapiro(values)
        print('Statistics=%.3f, p=%.3f' % (stat, p))
        # 5% significance level: H0 is "the sample is drawn from a normal distribution".
        if p > 0.05:
            print('{} looks Gaussian (fail to reject H0)'.format(feature), "\n===============\n")
        else:
            print('{} does not look Gaussian (reject H0)'.format(feature), "\n===============\n")


In [None]:
# Run the normality check on the raw (not yet imputed) data.
check_for_normality(raw_data)

NaN values prevent this function from working properly; therefore, the missing values must be imputed first.

## Checking for Outliers

In [None]:
def check_for_outliers(dataframe=pd.DataFrame()):
    """Print the IQR-rule outliers of each numeric feature.

    A value counts as an outlier when it lies more than 1.5 times the
    interquartile range below the first quartile or above the third quartile.
    For every feature the number of outliers and the outlying values
    themselves are printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain every column listed in ``numeric_features`` below.

    Returns
    -------
    None
    """
    numeric_features = [
        "SNR", "CQI", "RSSI", "NRxRSRP", "NRxRSRQ",
        "RSRQ", "RSRP", "DL_bitrate", "UL_bitrate",
    ]
    for feature in numeric_features:
        print("Column", feature)
        column = dataframe[feature]
        lower_quartile, upper_quartile = (column.quantile(q) for q in (0.25, 0.75))
        spread = upper_quartile - lower_quartile
        lower_fence = lower_quartile - 1.5*spread
        upper_fence = upper_quartile + 1.5*spread
        # NaN compares False on both sides, so missing values are never flagged.
        is_outlier = (column < lower_fence) | (column > upper_fence)
        df_outliers = column[is_outlier]
        print("No of outliers:", len(df_outliers))
        print(df_outliers, "\n===============\n\n")


## Imputing the Missing Values

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

In [None]:
# Load the processed dataset (presumably the imputed one, given this section; confirm).
# Every backslash in the Windows path is doubled so the literal contains no
# invalid escape sequence (a bare "\D" triggers a warning on recent Python versions).
complete_data = pd.read_csv("C:\\Users\\Killian\\Desktop\\FYP-Multistage-Throughput-Predictor\\Datasets\\unaveraged_processed_network_data.csv", index_col=None)

## Checking for Normality in Imputed Data

In [None]:
# Repeat the normality check on the dataset loaded into complete_data.
check_for_normality(complete_data)

## Checking for Outliers in Imputed Data

In [None]:
# Number of rows in complete_data, for comparison with the outlier counts.
len(complete_data)

In [None]:
# List the IQR-rule outliers of each numeric feature in complete_data.
check_for_outliers(complete_data)