# 01_eda

## Importando bibliotecas

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio


## Carregando os dados

In [2]:
# Path to the raw Telco churn CSV, relative to the notebook's working directory
file_path = '../data/raw/telco_churn.csv'
# Parse the file into a DataFrame using pandas' default read options
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Quick look at the data: first rows, dimensions and per-column dtypes
display(df.head(n=5))
print("\nShape: " + str(df.shape))
print("\nTipos de dados:\n " + str(df.dtypes))

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes



Shape: (7043, 21)

Tipos de dados:
 customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


## Processamento

In [4]:
# TotalCharges is loaded as text (object dtype); parse it as a number, letting
# any value that cannot be parsed (such as a blank string) become NaN instead
# of raising an error
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric, errors='coerce')

In [5]:
# Count missing values per column and report only the columns that have any
df_nulls = df.isna().sum()
print("\nValores ausentes:\n", df_nulls.loc[df_nulls.gt(0)])

# Drop, in place, every row that has a missing value in any column
df.dropna(axis=0, how='any', inplace=True)


Valores ausentes:
 TotalCharges    11
dtype: int64


In [6]:
# Summary statistics for every column, numeric and non-numeric alike
display(
    df.describe(include='all')
)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
count,7032,7032,7032.0,7032,7032,7032.0,7032,7032,7032,7032,...,7032,7032,7032,7032,7032,7032,7032,7032.0,7032.0,7032
unique,7032,2,,2,2,,2,3,3,3,...,3,3,3,3,3,2,4,,,2
top,7590-VHVEG,Male,,No,No,,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,,,No
freq,1,3549,,3639,4933,,6352,3385,3096,3497,...,3094,3472,2809,2781,3875,4168,2365,,,5163
mean,,,0.1624,,,32.421786,,,,,...,,,,,,,,64.798208,2283.300441,
std,,,0.368844,,,24.54526,,,,,...,,,,,,,,30.085974,2266.771362,
min,,,0.0,,,1.0,,,,,...,,,,,,,,18.25,18.8,
25%,,,0.0,,,9.0,,,,,...,,,,,,,,35.5875,401.45,
50%,,,0.0,,,29.0,,,,,...,,,,,,,,70.35,1397.475,
75%,,,0.0,,,55.0,,,,,...,,,,,,,,89.8625,3794.7375,


In [7]:
# Interactive grouped bar chart: customer count per contract type, split by churn.
# NOTE(review): 'No' is drawn in royalblue here but in seagreen in the later
# figures of this notebook -- confirm whether the palettes are meant to match.
fig = px.histogram(
    data_frame=df,
    x='Contract',
    color='Churn',
    barmode='group',
    title='Churn por Tipo de Contrato',
    labels={'Contract': 'Tipo de Contrato', 'Churn': 'Churn'},
    color_discrete_map={'Yes': 'crimson', 'No': 'royalblue'},
    width=700,
    height=450,
)

# Axis titles, centred figure title and legend heading
fig.update_xaxes(title_text='Tipo de Contrato')
fig.update_yaxes(title_text='Contagem')
fig.update_layout(title_x=0.5, legend_title='Churn')

# Export a static PNG at 2x resolution for the report
pio.write_image(fig, "../reports/figures/churn_por_contrato.png", scale=2)

# Render the interactive figure inline
fig.show()


In [8]:
# Monthly charges of the customers who churned and of those who did not
charges_churn_yes = df.loc[df['Churn'] == 'Yes', 'MonthlyCharges']
charges_churn_no = df.loc[df['Churn'] == 'No', 'MonthlyCharges']

# Two semi-transparent histograms drawn on top of each other so the shapes of
# the two distributions can be compared (no density curve is fitted here)
fig = go.Figure(
    data=[
        go.Histogram(
            x=charges_churn_yes,
            name='Churn = Yes',
            opacity=0.6,
            marker=dict(color='crimson'),
            nbinsx=40,
        ),
        go.Histogram(
            x=charges_churn_no,
            name='Churn = No',
            opacity=0.6,
            marker=dict(color='seagreen'),
            nbinsx=40,
        ),
    ]
)

# 'overlay' keeps both traces on the same bars' positions instead of stacking them
fig.update_layout(
    barmode='overlay',
    title=dict(text='Distribuição de MonthlyCharges por Churn', x=0.5),
    legend_title='Churn',
    width=800,
    height=450,
)
fig.update_xaxes(title_text='Monthly Charges')
fig.update_yaxes(title_text='Contagem')

# Export a static PNG at 2x resolution for the report
pio.write_image(fig, "../reports/figures/distribuicao_charges_churn.png", scale=2)

# Render the interactive figure inline
fig.show()

In [9]:
# Box plot of tenure for each churn group
fig = px.box(
    data_frame=df,
    x='Churn',
    y='tenure',
    color='Churn',
    color_discrete_map={'Yes': 'crimson', 'No': 'seagreen'},
    title='Tempo de Contrato (tenure) por Churn',
)

# Size, axis titles and centred figure title; the legend is hidden because the
# x axis already names the two groups
fig.update_layout(
    width=800,
    height=450,
    xaxis_title='Churn',
    yaxis_title='Tempo de Contrato (meses)',
    title_x=0.5,
    showlegend=False,
)

# Export a static PNG at 2x resolution for the report
pio.write_image(fig, "../reports/figures/tenure_churn_boxplot.png", scale=2)

# Render the interactive figure inline
fig.show()

In [10]:
# Encode the churn label as 0/1 so it takes part in the numeric correlation.
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# mapping it through the Yes/No dictionary again would turn every value into NaN.
# The plotting cells above compare Churn against 'Yes'/'No', so they must be
# executed before this cell.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


# Pairwise correlation between the numeric columns only
corr = df.corr(numeric_only=True)

# Long-form (Var1, Var2, Correlation) version of the matrix.
# It is not consumed by the heatmap below; kept in case a later cell reads it.
corr_long = corr.reset_index().melt(id_vars='index')
corr_long.columns = ['Var1', 'Var2', 'Correlation']

# Heatmap of the correlation matrix. zmin/zmax pin the colour range to the full
# [-1, 1] interval so the neutral colour of the diverging RdBu scale sits at a
# correlation of exactly 0 instead of at the midpoint of the observed values.
fig = px.imshow(
    corr.values,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='RdBu',
    zmin=-1,
    zmax=1,
    text_auto=".2f",
    aspect="auto",
    title="Correlação entre Variáveis Numéricas",
    width=800,
    height=700
)

# Centre the figure title
fig.update_layout(title_x=0.5)

# Export a static PNG at 2x resolution for the report
pio.write_image(fig, "../reports/figures/correlation_matrix_churn.png", scale=2)

# Render the interactive figure inline
fig.show()
