# Técnicas de Limpeza e Tratamento de Valores Ausentes para Análise de Dados.

## Pacotes Python usados no projeto

In [1]:
!pip install -q -U watermark

In [2]:
# imports
import math
import sys, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import warnings
# Silence every warning category from this point on so cell outputs stay uncluttered.
warnings.filterwarnings('ignore')

In [5]:
# (Re)load the watermark extension and print the notebook author.
%reload_ext watermark
%watermark -a "Leonardo da Silva Neves"

Author: Leonardo da Silva Neves



## Carregando os dados
https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

In [3]:
# Labels that may represent missing values in the raw file
lista_labels_valores_ausentes = [
    "n/a",
    "na",
    "undefined",
]

In [4]:
# Load the dataset that contains missing values.
# The na_values parameter makes pandas read every string in the given list as NaN;
# it is useful when the placeholders used for missing data are known beforehand.
dataset_dsa = pd.read_csv(
    "dataset.csv",
    na_values=lista_labels_valores_ausentes,
)

In [8]:
# Shape of the dataset as a (rows, columns) tuple
dataset_dsa.shape

(150001, 55)

In [10]:
# Sample of the first rows (some columns are shown truncated)
dataset_dsa.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,4/4/2019 12:01,770.0,4/25/2019 14:35,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,1.311448e+19,4/9/2019 13:04,235.0,4/25/2019 8:15,606.0,1365104.0,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,1.311448e+19,4/9/2019 17:42,1.0,4/25/2019 11:58,652.0,1361762.0,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,395630.0,410692588.0,4215763.0,27883638.0,279807335.0
3,1.311448e+19,4/10/2019 0:31,486.0,4/25/2019 7:36,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
4,1.311448e+19,4/12/2019 20:10,565.0,4/25/2019 10:40,954.0,1089009.0,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,15259380.0,18962873.0,17124581.0,415218.0,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


In [5]:
# Raise the pandas "display.max_columns" option to increase the number of columns shown when a dataframe is printed.
pd.set_option('display.max_columns', 100)
# If the dataframe has more than 100 columns, the excess is truncated in the display.

In [13]:
# Sample of the last rows of the dataset
dataset_dsa.tail()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,Avg RTT DL (ms),Avg RTT UL (ms),Avg Bearer TP DL (kbps),Avg Bearer TP UL (kbps),TCP DL Retrans. Vol (Bytes),TCP UL Retrans. Vol (Bytes),DL TP < 50 Kbps (%),50 Kbps < DL TP < 250 Kbps (%),250 Kbps < DL TP < 1 Mbps (%),DL TP > 1 Mbps (%),UL TP < 10 Kbps (%),10 Kbps < UL TP < 50 Kbps (%),50 Kbps < UL TP < 300 Kbps (%),UL TP > 300 Kbps (%),HTTP DL (Bytes),HTTP UL (Bytes),Activity Duration DL (ms),Activity Duration UL (ms),Dur. (ms).1,Handset Manufacturer,Handset Type,Nb of sec with 125000B < Vol DL,Nb of sec with 1250B < Vol UL < 6250B,Nb of sec with 31250B < Vol DL < 125000B,Nb of sec with 37500B < Vol UL,Nb of sec with 6250B < Vol DL < 31250B,Nb of sec with 6250B < Vol UL < 37500B,Nb of sec with Vol DL < 6250B,Nb of sec with Vol UL < 1250B,Social Media DL (Bytes),Social Media UL (Bytes),Google DL (Bytes),Google UL (Bytes),Email DL (Bytes),Email UL (Bytes),Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
149996,7.277826e+18,4/29/2019 7:28,451.0,4/30/2019 6:02,214.0,81230.0,208202200000000.0,33650690000.0,35483110000000.0,D20434A,32.0,0.0,52.0,65.0,,,100.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,,,42376.0,41915.0,81230763.0,Apple,Apple iPhone 8 Plus (A1897),,,,,,,223.0,229.0,3464974.0,52091.0,9967603.0,2817311.0,57639.0,633237.0,16191670.0,11763430.0,17883700.0,19678160.0,526609700.0,9197207.0,3264510.0,13487420.0,57628851.0,574175259.0
149997,7.349883e+18,4/29/2019 7:28,483.0,4/30/2019 10:41,187.0,97970.0,208201900000000.0,33663450000.0,35660510000000.0,D10223C,27.0,2.0,23.0,54.0,,,100.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,,,17264.0,16759.0,97970704.0,Apple,Apple iPhone Se (A1723),,,,,,,105.0,102.0,2344568.0,7613.0,2229420.0,2185941.0,1954414.0,167304.0,13877230.0,8288284.0,19350150.0,21293150.0,626893100.0,4735033.0,712180400.0,2457758.0,39135081.0,666648844.0
149998,1.311448e+19,4/29/2019 7:28,283.0,4/30/2019 10:46,810.0,98249.0,208201700000000.0,33621890000.0,35721210000000.0,T51102A,43.0,6.0,43.0,47.0,,,100.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,,,25003.0,28640.0,98249527.0,Apple,Apple iPhone Xs (A2097),,,,,,,104.0,108.0,1245845.0,14394.0,3850890.0,2734579.0,1525734.0,532543.0,22660510.0,1855903.0,9963942.0,5065760.0,553539500.0,13394320.0,121100900.0,11314730.0,34912224.0,592786405.0
149999,1.311448e+19,4/29/2019 7:28,696.0,4/30/2019 10:40,327.0,97910.0,208202100000000.0,33619620000.0,86186200000000.0,L88342B,37.0,5.0,34.0,37.0,,,100.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,,,13405.0,34088.0,97910631.0,Huawei,Huawei Fig-Lx1,,,,,,,43.0,82.0,801547.0,21562.0,4189773.0,3567494.0,2228270.0,622644.0,8817106.0,8305402.0,3322253.0,13172590.0,352537000.0,2529475.0,814713100.0,1406930.0,29626096.0,371895920.0
150000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1795322.0,32928.43438,5750753.0,2056542.0,1791729.0,467373.44194,11634070.0,11009410.0,11626850.0,11001750.0,422044700.0,8288398.0,421100500.0,8264799.0,,


In [6]:
# Load the data dictionary from the Excel file
dicionario = pd.read_excel('dicionario.xlsx')

In [15]:
# Shape of the data dictionary as a (rows, columns) tuple
dicionario.shape

(56, 2)

In [17]:
# Sample of the data dictionary
dicionario.head(15)
# Some descriptions in the Description column are truncated because of the text length.

Unnamed: 0,Fields,Description
0,bearer id,xDr session identifier
1,Dur. (ms),Total Duration of the xDR (in ms)
2,Start,Start time of the xDR (first frame timestamp)
3,Start ms,Milliseconds offset of start time for the xDR ...
4,End,End time of the xDR (last frame timestamp)
5,End ms,Milliseconds offset of end time of the xDR (la...
6,Dur. (s),Total Duration of the xDR (in s)
7,IMSI,International Mobile Subscriber Identity
8,MSISDN/Number,MS International PSTN/ISDN Number of mobile - ...
9,IMEI,International Mobile Equipment Identity


In [7]:
# Set a large value for the maximum column display width.
pd.set_option('display.max_colwidth', 100)

In [19]:
# Sample of the data dictionary
dicionario.head(10)

Unnamed: 0,Fields,Description
0,bearer id,xDr session identifier
1,Dur. (ms),Total Duration of the xDR (in ms)
2,Start,Start time of the xDR (first frame timestamp)
3,Start ms,Milliseconds offset of start time for the xDR (first frame timestamp)
4,End,End time of the xDR (last frame timestamp)
5,End ms,Milliseconds offset of end time of the xDR (last frame timestamp)
6,Dur. (s),Total Duration of the xDR (in s)
7,IMSI,International Mobile Subscriber Identity
8,MSISDN/Number,MS International PSTN/ISDN Number of mobile - customer number
9,IMEI,International Mobile Equipment Identity


## Análise Exploratória

In [8]:
# Summary of the dataframe: index range, column names, non-null counts and dtypes
dataset_dsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 55 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 149010 non-null  float64
 1   Start                                     150000 non-null  object 
 2   Start ms                                  150000 non-null  float64
 3   End                                       150000 non-null  object 
 4   End ms                                    150000 non-null  float64
 5   Dur. (ms)                                 150000 non-null  float64
 6   IMSI                                      149431 non-null  float64
 7   MSISDN/Number                             148935 non-null  float64
 8   IMEI                                      149429 non-null  float64
 9   Last Location Name                        148848 non-null  object 
 10  Avg RTT DL (ms)     

In [9]:
# Descriptive statistics
dataset_dsa.describe()

Unnamed: 0,Bearer Id,Start ms,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Avg RTT DL (ms),Avg RTT UL (ms),Avg Bearer TP DL (kbps),Avg Bearer TP UL (kbps),TCP DL Retrans. Vol (Bytes),TCP UL Retrans. Vol (Bytes),DL TP < 50 Kbps (%),50 Kbps < DL TP < 250 Kbps (%),250 Kbps < DL TP < 1 Mbps (%),DL TP > 1 Mbps (%),UL TP < 10 Kbps (%),10 Kbps < UL TP < 50 Kbps (%),50 Kbps < UL TP < 300 Kbps (%),UL TP > 300 Kbps (%),HTTP DL (Bytes),HTTP UL (Bytes),Activity Duration DL (ms),Activity Duration UL (ms),Dur. (ms).1,Nb of sec with 125000B < Vol DL,Nb of sec with 1250B < Vol UL < 6250B,Nb of sec with 31250B < Vol DL < 125000B,Nb of sec with 37500B < Vol UL,Nb of sec with 6250B < Vol DL < 31250B,Nb of sec with 6250B < Vol UL < 37500B,Nb of sec with Vol DL < 6250B,Nb of sec with Vol UL < 1250B,Social Media DL (Bytes),Social Media UL (Bytes),Google DL (Bytes),Google UL (Bytes),Email DL (Bytes),Email UL (Bytes),Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
count,149010.0,150000.0,150000.0,150000.0,149431.0,148935.0,149429.0,122172.0,122189.0,150000.0,150000.0,61855.0,53352.0,149247.0,149247.0,149247.0,149247.0,149209.0,149209.0,149209.0,149209.0,68527.0,68191.0,150000.0,150000.0,150000.0,52463.0,57107.0,56415.0,19747.0,61684.0,38158.0,149246.0,149208.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150000.0,150000.0
mean,1.013887e+19,499.1882,498.80088,104608.6,208201600000000.0,41882820000.0,48474550000000.0,109.795706,17.662883,13300.045927,1770.428647,20809910.0,759658.7,92.844754,3.069355,1.717341,1.609654,98.530142,0.776749,0.147987,0.078923,114471000.0,3242301.0,1829177.0,1408880.0,104609100.0,989.699998,340.434395,810.837401,149.257052,965.464756,141.304812,3719.787552,4022.083454,1795322.0,32928.43438,5750753.0,2056542.0,1791729.0,467373.44194,11634070.0,11009410.0,11626850.0,11001750.0,422044700.0,8288398.0,421100500.0,8264799.0,41121210.0,454643400.0
std,2.893173e+18,288.611834,288.097653,81037.62,21488090000.0,2447443000000.0,22416370000000.0,619.782739,84.793524,23971.878541,4625.3555,182566500.0,26453050.0,13.038031,6.215233,4.159538,4.82889,4.634285,3.225176,1.624523,1.295396,963194600.0,19570640.0,5696395.0,4643231.0,81037610.0,2546.52444,1445.365032,1842.162008,1219.112287,1946.387608,993.349688,9171.60901,10160.324314,1035482.0,19006.178256,3309097.0,1189917.0,1035840.0,269969.307031,6710569.0,6345423.0,6725218.0,6359490.0,243967500.0,4782700.0,243205000.0,4769004.0,11276390.0,244142900.0
min,6.917538e+18,0.0,0.0,7142.0,204047100000000.0,33601000000.0,440015200000.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,40.0,0.0,0.0,7142988.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,0.0,207.0,3.0,14.0,2.0,53.0,105.0,42.0,35.0,2516.0,59.0,3290.0,148.0,2866892.0,7114041.0
25%,7.349883e+18,250.0,251.0,57440.5,208201400000000.0,33651300000.0,35460710000000.0,32.0,2.0,43.0,47.0,35651.5,4694.75,91.0,0.0,0.0,0.0,99.0,0.0,0.0,0.0,112403.5,24322.0,14877.75,21539.75,57440790.0,20.0,10.0,26.0,2.0,39.0,3.0,87.0,106.0,899148.0,16448.0,2882393.0,1024279.0,892793.0,233383.0,5833501.0,5517965.0,5777156.0,5475981.0,210473300.0,4128476.0,210186900.0,4145943.0,33222010.0,243106800.0
50%,7.349883e+18,499.0,500.0,86399.0,208201500000000.0,33663710000.0,35722010000000.0,45.0,5.0,63.0,63.0,568730.0,20949.5,100.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,1941949.0,229733.0,39304.5,46793.5,86399980.0,128.0,52.0,164.0,8.0,288.0,8.0,203.0,217.0,1794369.0,32920.0,5765829.0,2054573.0,1793505.0,466250.0,11616020.0,11013450.0,11642220.0,10996380.0,423408100.0,8291208.0,421803000.0,8267071.0,41143310.0,455841100.0
75%,1.304243e+19,749.0,750.0,132430.2,208201800000000.0,33683490000.0,86119700000000.0,70.0,15.0,19710.75,1120.0,3768308.0,84020.25,100.0,4.0,1.0,0.0,100.0,0.0,0.0,0.0,25042900.0,1542827.0,679609.5,599095.2,132430800.0,693.5,203.0,757.0,35.0,1092.0,31.0,2650.0,2451.0,2694938.0,49334.0,8623552.0,3088454.0,2689327.0,700440.0,17448520.0,16515560.0,17470480.0,16507270.0,633174200.0,12431620.0,631691800.0,12384150.0,49034240.0,665705500.0
max,1.318654e+19,999.0,999.0,1859336.0,214074300000000.0,882397100000000.0,99001200000000.0,96923.0,7120.0,378160.0,58613.0,4294426000.0,2908226000.0,100.0,93.0,100.0,94.0,100.0,98.0,100.0,96.0,72530640000.0,1491890000.0,136536500.0,144911300.0,1859336000.0,81476.0,85412.0,58525.0,50553.0,66913.0,49565.0,604061.0,604122.0,3586064.0,65870.0,11462830.0,4121357.0,3586146.0,936418.0,23259100.0,22011960.0,23259190.0,22011960.0,843441900.0,16558790.0,843442500.0,16558820.0,78331310.0,902969600.0


Não faz sentido calcular estatísticas descritivas para as variáveis Bearer Id, IMSI, MSISDN/Number e IMEI. Embora essas colunas sejam
do tipo numérico, seus valores servem como identificadores. Mas o método describe() calcula as estatísticas de todas as colunas numéricas. Essas estatísticas estão sendo calculadas antes que os dados sejam limpos. Portanto, pode haver mudanças depois que os valores ausentes e outliers forem tratados.

In [10]:
# Shape of the dataset as a (rows, columns) tuple
dataset_dsa.shape

(150001, 55)

In [11]:
# Shape of the data dictionary as a (rows, columns) tuple
dicionario.shape

(56, 2)

Existem 150001 linhas e 55 colunas no dataframe. No entanto, temos 56 colunas com seus nomes e descrições no dicionário.
Isso significa que há uma coluna descrita, mas não incluída no dataframe.

In [12]:
# Place the dataset's column names next to the field names from the data dictionary.
# The two Series are aligned on their positional index, so a shorter one is padded with NaN.
# `keys` names the resulting columns at creation time, so any later cell sees the
# descriptive labels regardless of the order in which the cells are executed.
df_compara_colunas = pd.concat(
    [pd.Series(dataset_dsa.columns.tolist()), dicionario['Fields']],
    axis=1,
    keys=['Coluna do Dataset', 'Coluna do Dicionário'],
)

In [17]:
# Column labels of the comparison dataframe
df_compara_colunas.columns

Index(['Coluna do Dataset', 'Coluna do Dicionário'], dtype='object')

In [16]:
# Make sure the columns carry descriptive names: the labels 0 and 'Fields', when present,
# are mapped to the final names (labels that are not found are ignored by rename).
# Reassigning the result instead of using inplace=True keeps the cell safe to re-run.
df_compara_colunas = df_compara_colunas.rename(
    columns={0: 'Coluna do Dataset', 'Fields': 'Coluna do Dicionário'}
)

In [18]:
# First rows of the side-by-side comparison of column names
df_compara_colunas.head()

Unnamed: 0,Coluna do Dataset,Coluna do Dicionário
0,Bearer Id,bearer id
1,Start,Dur. (ms)
2,Start ms,Start
3,End,Start ms
4,End ms,End
