# 1. Preparación de los datos

In [1]:
import pandas as pd

In [21]:
df = pd.read_csv('../datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(len(df))
df.head().T

7043


Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [9]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [22]:
total_charges = pd.to_numeric(df.TotalCharges, errors='coerce')
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')
df.TotalCharges = df.TotalCharges.fillna(0)
df[total_charges.isnull()][['customerID', 'TotalCharges', 'Churn']]

Unnamed: 0,customerID,TotalCharges,Churn
488,4472-LVYGI,0.0,No
753,3115-CZMZD,0.0,No
936,5709-LVOEQ,0.0,No
1082,4367-NUYAO,0.0,No
1340,1371-DWPAZ,0.0,No
3331,7644-OMVMY,0.0,No
3826,3213-VVOLG,0.0,No
4380,2520-SGTTA,0.0,No
5218,2923-ARZLG,0.0,No
6670,4075-WKNIU,0.0,No


In [24]:
replacer = lambda str: str.lower().str.replace(' ', '_')
df.columns = replacer(df.columns.str)
#hace lo mismo que la linea de arriba pero para el interior de las columnas y no para solo los titulos de las columnas
for col in list(df.dtypes[df.dtypes == 'object'].index):
    df[col] = replacer(df[col].str)
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [25]:
df.churn = (df.churn == 'yes').astype(int) # convierte la columna churn a 0 y 1 y le dice que yes es el true, el resto es false

In [26]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']
df[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [27]:
# Separacion de los datos
from sklearn.model_selection import train_test_split

#Dividimos en entrenamiento y test.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1) #es igual que poenr train_size=0.8

#Dividimos a su vez el conjunto df_train_full en entrenamiento y validación

df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=1) # normalmente se pone x_train, x_val, x_test

#Guarda las etiquetas de los ejemplos en una variable
y_train = df_train.churn.values
y_val = df_val.churn.values

# elimina la columna de etiquetas del conjunto de datos
del df_train['churn']
del df_val['churn']

In [30]:
print(len(df_train))
print(len(y_train))

3774
3774


# 2. Análisis de imporatncia de propiedades

In [31]:
global_mean = df_train_full.churn.mean()
round(global_mean, 3)

np.float64(0.27)

In [None]:
# Calcula la media de churn para cada género es decir lo que nos da es el porcentaje de clientes que se van de un genero, con respecto
#a todos los clientes de ese genero. es decir solo compara mujeres con mujeres y hombres con hombres
female_mean = df_train_full[df_train_full.gender == 'female'].churn.mean()
print(round(female_mean, 3))
male_mean = df_train_full[df_train_full.gender == 'male'].churn.mean()
print(round(male_mean, 3))

0.277
0.263


In [34]:
partner_yes = df_train_full[df_train_full.partner == 'yes'].churn.mean()
print(round(partner_yes, 3))
partner_no = df_train_full[df_train_full.partner == 'no'].churn.mean()
print(round(partner_no, 3))

0.205
0.33


In [None]:
from sklearn.metrics import mutual_info_score

calculate_mi = lambda col: mutual_info_score(col, df_train_full.churn)

#Con categorical es una lista con los nombres de las columnas categoricas, 
# aplicamos la funcion calculate_mi a cada una de las columnas categoricas
#y nos devuelve una serie con los valores de mi para cada columna, este mi
# nos dice la relacion que tiene cada variable con la variable objetivo churn

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [None]:
#tenure indica los meses que lleva el cliente con la compañia
print(df_train_full[numerical].corrwith(df_train_full.churn))

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64


In [None]:
print(round(df_train_full[df_train_full.tenure <= 2].churn.mean(), 3))
print(round(df_train_full[(df_train_full.tenure > 3) & 
                          (df_train_full.tenure <= 12)].churn.mean(), 3))
print(round(df_train_full[df_train_full.tenure > 12].churn.mean(), 3))

print(round(df_train_full[df_train_full.monthlycharges < 20].churn.mean(), 3))
print(round(df_train_full[(df_train_full.monthlycharges > 21) & 
                          (df_train_full.monthlycharges <= 50)].churn.mean(), 3))
print(round(df_train_full[df_train_full.monthlycharges > 50].churn.mean(), 3))

0.595
0.391
0.176
0.088
0.223
0.325


# 3. Ingeniería de Propiedades

In [None]:
#orient='records' hace que cada fila del dataframe se convierta en un diccionario
train_dict = df_train[categorical + numerical].to_dict(orient='records')
dict(sorted(train_dict[0].items()))

{'contract': 'month-to-month',
 'dependents': 'no',
 'deviceprotection': 'no',
 'gender': 'male',
 'internetservice': 'fiber_optic',
 'monthlycharges': 85.55,
 'multiplelines': 'no',
 'onlinebackup': 'yes',
 'onlinesecurity': 'no',
 'paperlessbilling': 'yes',
 'partner': 'no',
 'paymentmethod': 'electronic_check',
 'phoneservice': 'yes',
 'seniorcitizen': 1,
 'streamingmovies': 'yes',
 'streamingtv': 'no',
 'techsupport': 'no',
 'tenure': 5,
 'totalcharges': 408.5}

In [None]:
from sklearn.feature_extraction import DictVectorizer
# crea el objeto DictVectorizer, que convierte listas de diccionarios en matrices numéricas
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)


0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [42]:
X_train = dv.transform(train_dict)

In [43]:
X_train[0]

array([  1.  ,   0.  ,   0.  ,   1.  ,   0.  ,   1.  ,   0.  ,   0.  ,
         0.  ,   1.  ,   0.  ,   1.  ,   0.  ,  85.55,   1.  ,   0.  ,
         0.  ,   0.  ,   0.  ,   1.  ,   1.  ,   0.  ,   0.  ,   0.  ,
         1.  ,   1.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,
         1.  ,   1.  ,   0.  ,   0.  ,   1.  ,   1.  ,   0.  ,   0.  ,
         1.  ,   0.  ,   0.  ,   5.  , 408.5 ])

In [44]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

# 4. Entrenamiento del modelo

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')

model.fit(X_train, y_train)

In [None]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
# Lanzamos predicciones sobre el conjunto de validación
y_pred = model.predict_proba(X_val)
y_pred

array([[0.99142715, 0.00857285],
       [0.79028815, 0.20971185],
       [0.78364575, 0.21635425],
       ...,
       [0.3566433 , 0.6433567 ],
       [0.8105598 , 0.1894402 ],
       [0.8726199 , 0.1273801 ]], shape=(1860, 2))

In [None]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.00857285, 0.20971185, 0.21635425, ..., 0.6433567 , 0.1894402 ,
       0.1273801 ], shape=(1860,))

In [None]:
churn = y_pred >=0.5
churn

array([False, False, False, ...,  True, False, False], shape=(1860,))

In [None]:
y_val

array([0, 0, 0, ..., 1, 0, 0], shape=(1860,))

In [None]:
round((y_val == churn).mean(), 3)

np.float64(0.805)

In [None]:
round((y_val == churn).mean(), 3)

np.float64(0.805)

# 5. Serialización de modelo

In [None]:
import pickle

In [None]:
with open('models/churn-model.pck', 'wb') as f:
    pickle.dump((dv,model), f)