# Análise Exploratória

### Bibliotecas necessárias + opções de visualização do Pandas

In [102]:
import pandas as pd
import numpy as np
import json
from urllib.request import urlopen

# Let pandas render up to 100 columns and 100 rows before truncating output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Fixed seed reused by the sampling cells so their output is reproducible.
notrandomseed = 484


### Recebendo os dados e abrindo em um DataFrame

In [103]:
data_url = "https://github.com/sthemonica/alura-voz/blob/main/Dados/Telco-Customer-Churn.json?raw=true"

# Download and parse the JSON payload. The context manager closes the HTTP
# response even if reading or parsing raises, instead of leaving it open.
with urlopen(data_url) as response:
    data_json = json.load(response)

# Flatten the nested records into a flat table; nested keys are joined with
# '_' (e.g. account_Charges_Monthly).
df = pd.json_normalize(data_json, max_level=2, sep='_')
df.head()

Unnamed: 0,customerID,Churn,customer_gender,customer_SeniorCitizen,customer_Partner,customer_Dependents,customer_tenure,phone_PhoneService,phone_MultipleLines,internet_InternetService,internet_OnlineSecurity,internet_OnlineBackup,internet_DeviceProtection,internet_TechSupport,internet_StreamingTV,internet_StreamingMovies,account_Contract,account_PaperlessBilling,account_PaymentMethod,account_Charges_Monthly,account_Charges_Total
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,No,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,No,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,No,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4


### Analisando superficialmente as informações sobre o DataFrame

In [104]:
# Print a summary of df: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7267 entries, 0 to 7266
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerID                 7267 non-null   object 
 1   Churn                      7267 non-null   object 
 2   customer_gender            7267 non-null   object 
 3   customer_SeniorCitizen     7267 non-null   int64  
 4   customer_Partner           7267 non-null   object 
 5   customer_Dependents        7267 non-null   object 
 6   customer_tenure            7267 non-null   int64  
 7   phone_PhoneService         7267 non-null   object 
 8   phone_MultipleLines        7267 non-null   object 
 9   internet_InternetService   7267 non-null   object 
 10  internet_OnlineSecurity    7267 non-null   object 
 11  internet_OnlineBackup      7267 non-null   object 
 12  internet_DeviceProtection  7267 non-null   object 
 13  internet_TechSupport       7267 non-null   objec

In [105]:
# Checklist frame: one row per column of df, plus a 'check' column that
# starts out as '-' for every row.
df_check = pd.DataFrame({'coluna': df.columns}).assign(check='-')
df_check

Unnamed: 0,coluna,check
0,customerID,-
1,Churn,-
2,customer_gender,-
3,customer_SeniorCitizen,-
4,customer_Partner,-
5,customer_Dependents,-
6,customer_tenure,-
7,phone_PhoneService,-
8,phone_MultipleLines,-
9,internet_InternetService,-


Verificando linhas duplicadas

In [106]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

0

Verificando quantos valores únicos existem em cada coluna (7267 é o número máximo possível)

In [107]:
# Distinct-value count per column of df, assigned by position: df_check rows
# were created from df.columns, so the order matches.
df_check['unique'] = df.nunique().to_numpy()

Verificando quantidades de nulos

In [108]:
# Null count per column of df, assigned by position (same column order as
# the rows of df_check).
df_check['isnull'] = df.isna().sum().to_numpy()

Verificando espaços em branco e valores faltantes (nan).

In [109]:
# Count "blank" entries per column of df and attach them to df_check.
colunas_object = df.select_dtypes('object').columns

# Text columns: strip surrounding whitespace and count values left empty.
temp_series1 = df[colunas_object].apply(lambda x: x.str.strip().isin(['']).sum())

# Numeric columns cannot hold empty strings, so NaN is counted instead.
temp_series2 = df.select_dtypes(['float', 'integer']).isnull().sum()

# Single Series indexed by column name; its name becomes the merged column.
temp_series = pd.concat([temp_series1, temp_series2])
temp_series.name = 'blank'

# Drop any 'blank' column left by a previous execution of this cell, so that
# re-running it does not produce suffixed 'blank_x' / 'blank_y' duplicates.
df_check = (
    df_check
    .drop(columns='blank', errors='ignore')
    .merge(temp_series, how='left', left_on='coluna', right_index=True)
)

In [110]:
# Display the checklist with the unique / isnull / blank counts added above.
df_check

Unnamed: 0,coluna,check,unique,isnull,blank
0,customerID,-,7267,0,0
1,Churn,-,3,0,224
2,customer_gender,-,2,0,0
3,customer_SeniorCitizen,-,2,0,0
4,customer_Partner,-,2,0,0
5,customer_Dependents,-,2,0,0
6,customer_tenure,-,73,0,0
7,phone_PhoneService,-,2,0,0
8,phone_MultipleLines,-,3,0,0
9,internet_InternetService,-,3,0,0


### Analisando coluna: customerID

In [None]:
# Reproducible peek at 10 customer IDs, seeded with the notebook-wide seed.
df['customerID'].sample(n=10, random_state=notrandomseed)

1630    2320-JRSDE
5777    7893-IXHRQ
3281    4573-JKNAE
967     1357-BIJKI
2505    3500-NSDOA
4466    6124-ACRHJ
1309    1891-FZYSA
3368    4678-DVQEO
5944    8111-SLLHI
2998    4163-HFTUK
Name: customerID, dtype: object

### Analisando coluna: Churn

In [None]:
# Reproducible peek at 15 Churn values. Uses the shared notrandomseed constant
# (484) rather than a repeated literal, so all sampling cells stay in sync if
# the seed is ever changed.
df['Churn'].sample(15, random_state=notrandomseed)

1630    Yes
5777    Yes
3281     No
967      No
2505     No
4466     No
1309    Yes
3368    Yes
5944    Yes
2998     No
3249       
4162     No
7186     No
5293    Yes
7031       
Name: Churn, dtype: object