# Data Exploration and Preprocessing

In this notebook, we will load the dataset, explore it, and perform necessary preprocessing tasks.

In [1]:
# Import necessary libraries
import pandas as pd

from src.utils import fetch_data_from_db
from src.preprocessing import preprocess_data
from src.utils import save_data_to_db

## Load the Data

We will load the data from postgres database into a pandas DataFrame.

In [2]:
query = "SELECT * FROM xdr_data"
df = fetch_data_from_db(query)

## Data Overview

In [3]:
# Print the first few rows of the DataFrame
df.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,4/4/2019 12:01,770.0,4/25/2019 14:35,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,1.311448e+19,4/9/2019 13:04,235.0,4/25/2019 8:15,606.0,1365104.0,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,1.311448e+19,4/9/2019 17:42,1.0,4/25/2019 11:58,652.0,1361762.0,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,395630.0,410692588.0,4215763.0,27883638.0,279807335.0
3,1.311448e+19,4/10/2019 0:31,486.0,4/25/2019 7:36,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
4,1.311448e+19,4/12/2019 20:10,565.0,4/25/2019 10:40,954.0,1089009.0,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,15259380.0,18962873.0,17124581.0,415218.0,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


In [4]:
# Print the shape of the DataFrame
df.shape

(150001, 55)

In [5]:
# Check the DataFrame's information
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 55 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 149010 non-null  float64
 1   Start                                     150000 non-null  object 
 2   Start ms                                  150000 non-null  float64
 3   End                                       150000 non-null  object 
 4   End ms                                    150000 non-null  float64
 5   Dur. (ms)                                 150000 non-null  float64
 6   IMSI                                      149431 non-null  float64
 7   MSISDN/Number                             148935 non-null  float64
 8   IMEI                                      149429 non-null  float64
 9   Last Location Name                        148848 non-null  object 
 10  Avg RTT DL (ms)     

In [6]:
# Describe the DataFrame
df.describe()

Unnamed: 0,Bearer Id,Start ms,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Avg RTT DL (ms),Avg RTT UL (ms),Avg Bearer TP DL (kbps),...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
count,149010.0,150000.0,150000.0,150000.0,149431.0,148935.0,149429.0,122172.0,122189.0,150000.0,...,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150000.0,150000.0
mean,1.013887e+19,499.1882,498.80088,104608.6,208201600000000.0,41882820000.0,48474550000000.0,109.795706,17.662883,13300.045927,...,11634070.0,11009410.0,11626850.0,11001750.0,422044700.0,8288398.0,421100500.0,8264799.0,41121210.0,454643400.0
std,2.893173e+18,288.611834,288.097653,81037.62,21488090000.0,2447443000000.0,22416370000000.0,619.782739,84.793524,23971.878541,...,6710569.0,6345423.0,6725218.0,6359490.0,243967500.0,4782700.0,243205000.0,4769004.0,11276390.0,244142900.0
min,6.917538e+18,0.0,0.0,7142.0,204047100000000.0,33601000000.0,440015200000.0,0.0,0.0,0.0,...,53.0,105.0,42.0,35.0,2516.0,59.0,3290.0,148.0,2866892.0,7114041.0
25%,7.349883e+18,250.0,251.0,57440.5,208201400000000.0,33651300000.0,35460710000000.0,32.0,2.0,43.0,...,5833501.0,5517965.0,5777156.0,5475981.0,210473300.0,4128476.0,210186900.0,4145943.0,33222010.0,243106800.0
50%,7.349883e+18,499.0,500.0,86399.0,208201500000000.0,33663710000.0,35722010000000.0,45.0,5.0,63.0,...,11616020.0,11013450.0,11642220.0,10996380.0,423408100.0,8291208.0,421803000.0,8267071.0,41143310.0,455841100.0
75%,1.304243e+19,749.0,750.0,132430.2,208201800000000.0,33683490000.0,86119700000000.0,70.0,15.0,19710.75,...,17448520.0,16515560.0,17470480.0,16507270.0,633174200.0,12431620.0,631691800.0,12384150.0,49034240.0,665705500.0
max,1.318654e+19,999.0,999.0,1859336.0,214074300000000.0,882397100000000.0,99001200000000.0,96923.0,7120.0,378160.0,...,23259100.0,22011960.0,23259190.0,22011960.0,843441900.0,16558790.0,843442500.0,16558820.0,78331310.0,902969600.0


In [7]:
# Check for missing values
df.isnull().sum()

Bearer Id                                      991
Start                                            1
Start ms                                         1
End                                              1
End ms                                           1
Dur. (ms)                                        1
IMSI                                           570
MSISDN/Number                                 1066
IMEI                                           572
Last Location Name                            1153
Avg RTT DL (ms)                              27829
Avg RTT UL (ms)                              27812
Avg Bearer TP DL (kbps)                          1
Avg Bearer TP UL (kbps)                          1
TCP DL Retrans. Vol (Bytes)                  88146
TCP UL Retrans. Vol (Bytes)                  96649
DL TP < 50 Kbps (%)                            754
50 Kbps < DL TP < 250 Kbps (%)                 754
250 Kbps < DL TP < 1 Mbps (%)                  754
DL TP > 1 Mbps (%)             

In [8]:
# Check for duplicates
df.duplicated().sum()

0

## User Overview analysis

An overview of the user's behavior on the applications


### Top 10 handsets used by the customers

In [9]:
# Identify the top 10 handsets used by the customers
top_10_handsets = df['Handset Type'].value_counts().head(10)
print(top_10_handsets)

Huawei B528S-23A                19752
Apple iPhone 6S (A1688)          9419
Apple iPhone 6 (A1586)           9023
undefined                        8987
Apple iPhone 7 (A1778)           6326
Apple iPhone Se (A1723)          5187
Apple iPhone 8 (A1905)           4993
Apple iPhone Xr (A2105)          4568
Samsung Galaxy S8 (Sm-G950F)     4520
Apple iPhone X (A1901)           3813
Name: Handset Type, dtype: int64


### Top 3 handset manufacturers

In [10]:
# Identify the top 3 handset manufacturers
top_3_manufacturers = df['Handset Manufacturer'].value_counts().head(3)
print(top_3_manufacturers)

Apple      59565
Samsung    40839
Huawei     34423
Name: Handset Manufacturer, dtype: int64


In [11]:
# Identify the top 5 handsets per top 3 handset manufacturer
top_5_handsets_per_manufacturer = df[df['Handset Manufacturer'].isin(top_3_manufacturers.index)].groupby('Handset Manufacturer')['Handset Type'].value_counts().groupby(level=0).head(5)
print(top_5_handsets_per_manufacturer)

Handset Manufacturer  Handset Type                  
Apple                 Apple iPhone 6S (A1688)            9419
                      Apple iPhone 6 (A1586)             9023
                      Apple iPhone 7 (A1778)             6326
                      Apple iPhone Se (A1723)            5187
                      Apple iPhone 8 (A1905)             4993
Huawei                Huawei B528S-23A                  19752
                      Huawei E5180                       2079
                      Huawei P20 Lite Huawei Nova 3E     2021
                      Huawei P20                         1480
                      Huawei Y6 2018                      997
Samsung               Samsung Galaxy S8 (Sm-G950F)       4520
                      Samsung Galaxy A5 Sm-A520F         3724
                      Samsung Galaxy J5 (Sm-J530)        3696
                      Samsung Galaxy J3 (Sm-J330)        3484
                      Samsung Galaxy S7 (Sm-G930X)       3199
Name: Handset Typ

In [12]:
categorical_features = ['Handset Manufacturer', 'Handset Type']

## Data Preprocessing

We will preprocess the data by performing the following tasks:
- Handle missing values
- Encode categorical variables
- Scale numerical variables

In [13]:
# Define the columns to be used in the preprocessing
timestamp_cols = ['End', 'Start']
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Perform data preprocessing
clean_data = preprocess_data(df, timestamp_cols, numeric_features, None)

In [14]:
# Convert the transformed data back to a DataFrame
clean_data_df = pd.DataFrame(clean_data)


In [15]:
# compare the first few rows of the original and transformed data
df.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Total UL (Bytes),Total DL (Bytes),End_hour,End_day_of_week,End_day_of_month,End_month,Start_hour,Start_day_of_week,Start_day_of_month,Start_month
0,1.311448e+19,2019-04-04 12:01:00,770.0,2019-04-25 14:35:00,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,36749741.0,308879636.0,14.0,3.0,25.0,4.0,12.0,3.0,4.0,4.0
1,1.311448e+19,2019-04-09 13:04:00,235.0,2019-04-25 08:15:00,606.0,1365104.0,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,53800391.0,653384965.0,8.0,3.0,25.0,4.0,13.0,1.0,9.0,4.0
2,1.311448e+19,2019-04-09 17:42:00,1.0,2019-04-25 11:58:00,652.0,1361762.0,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,27883638.0,279807335.0,11.0,3.0,25.0,4.0,17.0,1.0,9.0,4.0
3,1.311448e+19,2019-04-10 00:31:00,486.0,2019-04-25 07:36:00,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,43324218.0,846028530.0,7.0,3.0,25.0,4.0,0.0,2.0,10.0,4.0
4,1.311448e+19,2019-04-12 20:10:00,565.0,2019-04-25 10:40:00,954.0,1089009.0,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,38542814.0,569138589.0,10.0,3.0,25.0,4.0,20.0,4.0,12.0,4.0


In [16]:

# Display the first few rows of the DataFrame
clean_data_df.head()

Unnamed: 0,Bearer Id,Start ms,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Avg RTT DL (ms),Avg RTT UL (ms),Avg Bearer TP DL (kbps),...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.031911,0.938332,0.566475,21.213047,-0.008932,-0.00337,-0.578959,-0.1212064,-0.165464,-0.553863,...,0.628941,-1.340826,-0.509712,-0.211575,-0.59009,1.266183,-1.025295,0.115243,-0.387668,-0.597047
1,1.031911,-0.915382,0.372095,15.554551,0.012569,-0.003363,-0.566766,-0.08008661,-0.165464,-0.554155,...,1.28355,1.27688,0.997973,0.978915,0.76529,-1.488221,0.435041,1.423855,1.124409,0.814043
2,1.031911,-1.726165,0.531765,15.513311,-0.061789,-0.003331,-0.589672,-2.5406440000000003e-17,0.0,-0.554572,...,1.205802,0.58155,0.88636,-0.76081,-0.788879,-1.65028,-0.042795,-0.849035,-1.173927,-0.716127
3,1.031911,-0.045696,-1.137819,15.016588,-0.011065,-0.003335,-0.586315,-2.5406440000000003e-17,0.0,-0.552987,...,1.45354,0.652005,0.352078,-1.557333,1.547316,0.535541,1.348412,0.950408,0.195366,1.603109
4,1.031911,0.22803,1.580027,12.147531,-0.011108,-0.003355,-0.584063,-2.5406440000000003e-17,0.0,-0.554572,...,0.54024,1.253421,0.817482,-1.664689,0.433102,-0.994964,0.532922,1.183799,-0.228656,0.468971


## check the cleaned data

In [21]:
# Check the DataFrame's information
clean_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 50 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 150001 non-null  float64
 1   Start ms                                  150001 non-null  float64
 2   End ms                                    150001 non-null  float64
 3   Dur. (ms)                                 150001 non-null  float64
 4   IMSI                                      150001 non-null  float64
 5   MSISDN/Number                             150001 non-null  float64
 6   IMEI                                      150001 non-null  float64
 7   Avg RTT DL (ms)                           150001 non-null  float64
 8   Avg RTT UL (ms)                           150001 non-null  float64
 9   Avg Bearer TP DL (kbps)                   150001 non-null  float64
 10  Avg Bearer TP UL (kb

In [22]:
# Describe the DataFrame
clean_data_df.describe()

Unnamed: 0,Bearer Id,Start ms,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Avg RTT DL (ms),Avg RTT UL (ms),Avg Bearer TP DL (kbps),...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
count,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,...,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0
mean,7.489071e-17,1.620027e-17,-2.2926690000000002e-17,1.212652e-17,-1.090168e-13,-2.131614e-18,1.085702e-16,-1.932663e-17,1.33818e-17,2.804257e-17,...,-1.233257e-16,-4.4669160000000004e-17,8.095988000000001e-17,9.113834e-17,2.842152e-18,-1.33818e-16,-8.313295e-17,2.385039e-16,8.730144000000001e-17,-4.3437560000000005e-17
std,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,...,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003
min,-1.117127,-1.729629,-1.731372,-1.20274,-193.7101,-0.003395965,-2.146937,-0.1962948,-0.2307978,-0.5548224,...,-1.733692,-1.735005,-1.728843,-1.729975,-1.729917,-1.732989,-1.731456,-1.732999,-3.392449,-1.833076
25%,-0.9671942,-0.8634083,-0.8601338,-0.5820368,-0.01111507,-0.003375254,-0.5815777,-0.1337212,-0.1915973,-0.5530286,...,-0.8643963,-0.8654211,-0.8698179,-0.868905,-0.8672145,-0.8697883,-0.8672287,-0.8636752,-0.7005107,-0.8664503
50%,-0.9671942,-0.0006520913,0.004162228,-0.2247065,-0.004349883,-0.003370215,-0.5699658,-0.09975262,-0.13933,-0.5521943,...,-0.002690318,0.0006361875,0.002284734,-0.0008445391,0.005588474,0.0005875131,0.002888362,0.0004763224,0.001954159,0.004904869
75%,1.006922,0.865569,0.8719293,0.3433174,0.006152595,-0.003361632,1.682577,-2.5406440000000003e-17,0.0,0.2673965,...,0.8664638,0.8677388,0.8689154,0.8657191,0.8654029,0.8662973,0.865903,0.8637784,0.7017394,0.8645066
max,1.0569,1.73179,1.736225,21.65339,273.8202,361.8103,2.258324,173.0845,92.805,15.22043,...,1.732352,1.733941,1.729665,1.731308,1.727274,1.729237,1.736573,1.739157,3.299847,1.836339


In [23]:
# Check for missing values
clean_data_df.isnull().sum()

Bearer Id                                   0
Start ms                                    0
End ms                                      0
Dur. (ms)                                   0
IMSI                                        0
MSISDN/Number                               0
IMEI                                        0
Avg RTT DL (ms)                             0
Avg RTT UL (ms)                             0
Avg Bearer TP DL (kbps)                     0
Avg Bearer TP UL (kbps)                     0
TCP DL Retrans. Vol (Bytes)                 0
TCP UL Retrans. Vol (Bytes)                 0
DL TP < 50 Kbps (%)                         0
50 Kbps < DL TP < 250 Kbps (%)              0
250 Kbps < DL TP < 1 Mbps (%)               0
DL TP > 1 Mbps (%)                          0
UL TP < 10 Kbps (%)                         0
10 Kbps < UL TP < 50 Kbps (%)               0
50 Kbps < UL TP < 300 Kbps (%)              0
UL TP > 300 Kbps (%)                        0
HTTP DL (Bytes)                   

In [24]:
# Check for duplicates
clean_data_df.duplicated().sum()

0

In [25]:
# Check the shape of the DataFrame
clean_data_df.shape

(150001, 50)

### Save the Cleaned Data to Database

In [17]:
save_data_to_db(clean_data_df, 'clean_data')