<a href="https://colab.research.google.com/github/jojoconverteo/Evaneos/blob/main/Evaneos_Churn_(clustering).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![](https://www.offremedia.com/sites/default/files/vignette/article/converteo-logo.png)

# **Churn : Analyse non supervisée**


**Contexte** : On cherche à savoir si un client mobile est à risque de désengagement


-------------------------------

**Dictionnaire :**

Churn (Cible) :
-  1 if customer cancelled service, 0 if not

AccountWeeks : 
- number of weeks customer has had active account


DataPlan : 
- 1 if customer has data plan, 0 if not

DataUsage : 
 - gigabytes of monthly data usage


CustServCalls : 
- number of calls into customer service


DayMins :
- average daytime minutes per month


DayCalls : 
- average number of daytime calls


MonthlyCharge :
- average monthly bill


OverageFee :
- largest overage fee in last 12 months

ContractRenewal :
- 1 if customer recently renewed contract, 0 if not


RoamMins : 
- average number of roaming minutes [LE ROAMING, C'EST QUOI ?](https://www.sfrbusiness.fr/room/communications-unifiees/roaming-c-est-quoi.html)

In [8]:
#@title


!pip install plotly --upgrade
!pip install -U pandas_profiling

from pandas_profiling import ProfileReport
from google.colab import drive
import os 
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import typing
from typing import List
import numpy as np


import warnings
warnings.filterwarnings('ignore')

def func_create_noise(df_train_data: pd.DataFrame, coloumns_cat_2_category: List) -> pd.DataFrame:
  """
  Shuffle the rows of a dataframe and relabel binary columns as 'Yes'/'No'.


  Parameters:
  ----------------------------
    df_train_data: pd.DataFrame
    Input dataframe. It is not modified; a shuffled copy is returned.

    coloumns_cat_2_category: List
    Names of the columns whose values 1 / 0 are replaced by 'Yes' / 'No'.
    (The parameter name keeps its historical spelling so keyword callers
    keep working.)


  Return:
  -----------------------------
    df_train_data_suffle: pd.DataFrame
    Shuffled copy of the input with the listed columns relabelled.


  Raises:
  -----------------------------
    KeyError
    If a listed column is missing, or holds a value other than 0 or 1.

  """
  dict_create_noise_columns_cat = {1 : 'Yes', 0 : 'No'}
  # sample(frac=1) returns all rows in random order; no random_state is set,
  # so the order differs between calls.
  df_train_data_suffle = df_train_data.sample(frac=1)

  # Iterate over the argument itself rather than a similarly named
  # notebook-level global, so the function works with any column list.
  for col in coloumns_cat_2_category:
    df_train_data_suffle[col] = df_train_data_suffle[col].apply(lambda x: dict_create_noise_columns_cat[x])

  return df_train_data_suffle




# Mount Google Drive so the CSV stored there can be read from the Colab runtime.
drive.mount('/content/drive')
# NOTE(review): `sep` is not used anywhere in the visible cells.
sep = os.sep
# NOTE(review): the f-prefix is redundant here (the string has no placeholders).
str_path_to_file = f"/content/drive/My Drive/Cours Data/Classification/Data/telecom_churn.csv"

# Raw churn dataset; it is shuffled and relabelled just below.
df_train_data_suffle = pd.read_csv(str_path_to_file, encoding='ascii')

# 0/1 columns that func_create_noise turns into 'No'/'Yes' labels.
columns_cat_2_category=['Churn', 'ContractRenewal', 'DataPlan']
df_train_data_suffle = func_create_noise(df_train_data_suffle, columns_cat_2_category)
# Second shuffle (func_create_noise already shuffles once). No random_state is
# passed, so the row order differs from run to run.
df_train_data_suffle = df_train_data_suffle.sample(frac=1)

# Split by dtype: object columns (the relabelled 'Yes'/'No' ones) vs. all others.
df_train_data_suffle_cat = df_train_data_suffle.select_dtypes(include=object)
df_train_data_suffle_num = df_train_data_suffle.select_dtypes(exclude=object)

Requirement already up-to-date: plotly in /usr/local/lib/python3.6/dist-packages (4.12.0)
Requirement already up-to-date: pandas_profiling in /usr/local/lib/python3.6/dist-packages (2.9.0)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


-----------------
 
### K-means
 







In [9]:
# Plain one-hot encoding: each categorical column yields one indicator column
# per level (e.g. Churn_No and Churn_Yes). Displayed only, not stored.
pd.get_dummies(df_train_data_suffle_cat)

Unnamed: 0,Churn_No,Churn_Yes,ContractRenewal_No,ContractRenewal_Yes,DataPlan_No,DataPlan_Yes
1004,1,0,0,1,0,1
56,1,0,0,1,1,0
1181,1,0,0,1,0,1
1626,1,0,0,1,1,0
683,1,0,0,1,1,0
...,...,...,...,...,...,...
1214,1,0,0,1,1,0
2275,1,0,1,0,1,0
2643,1,0,0,1,1,0
2722,1,0,0,1,1,0


-------------------------------------
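One-hot encoding amélioré : avec `drop_first=True`, on supprime la première modalité de chaque variable catégorielle, qui est redondante (par exemple `Churn_No` se déduit de `Churn_Yes`).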

In [10]:
#@title

# drop_first=True removes the indicator of the first level of every categorical
# column, leaving a single *_Yes column per Yes/No variable.
df_train_data_one_hot_encoding = pd.get_dummies(
    data=df_train_data_suffle_cat,
    drop_first=True,
)
df_train_data_one_hot_encoding

Unnamed: 0,Churn_Yes,ContractRenewal_Yes,DataPlan_Yes
1004,0,1,1
56,0,1,0
1181,0,1,1
1626,0,1,0
683,0,1,0
...,...,...,...
1214,0,1,0
2275,0,0,0
2643,0,1,0
2722,0,1,0


In [11]:
#@title

# Column-wise concatenation: numeric features followed by the one-hot columns.
df_train_data_kmeans = pd.concat(
    objs=[df_train_data_suffle_num, df_train_data_one_hot_encoding],
    axis=1,
)
df_train_data_kmeans

Unnamed: 0,AccountWeeks,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins,Churn_Yes,ContractRenewal_Yes,DataPlan_Yes
1004,64,3.32,1,201.3,101,80.2,7.19,12.3,0,1,1
56,141,0.00,1,126.9,98,37.0,9.00,8.0,0,1,0
1181,67,2.03,1,245.4,89,75.3,7.41,7.5,0,1,1
1626,149,0.37,1,207.3,115,56.7,9.92,8.6,0,1,0
683,123,0.00,1,159.1,94,48.0,12.08,6.5,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
1214,122,0.29,2,107.9,88,41.9,11.79,9.5,0,1,0
2275,105,0.00,0,228.4,100,52.0,7.26,7.7,0,0,0
2643,74,0.00,3,124.8,114,33.0,6.65,10.6,0,1,0
2722,98,0.39,2,136.1,82,40.9,7.82,10.1,0,1,0


In [12]:
#@title

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Rebuild the feature table (numeric columns + one-hot columns) for clustering.
df_train_data_kmeans = pd.concat(
    [df_train_data_suffle_num, df_train_data_one_hot_encoding],
    axis=1,
)

# Standardise every column before clustering; fit() returns the scaler itself,
# so fitting and transforming can be chained.
sk_scaler = StandardScaler()
array_train_data_kmeans = sk_scaler.fit(df_train_data_kmeans).transform(df_train_data_kmeans)

# Fit one K-means model per candidate cluster count (2 to 13 inclusive) and
# record its inertia divided by the number of clusters.
# NOTE(review): the chart title below describes a sum, while the plotted value
# is inertia_ / n_clusters - confirm which one is intended.
inertia = []
for n_clusters in range(2, 14):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(array_train_data_kmeans)
    inertia.append(kmeans.inertia_ / n_clusters)

inertias = pd.DataFrame({
    'n_clusters': range(2, 14),
    'inertia': inertia,
})

fig = px.line(
    inertias,
    x="n_clusters",
    y="inertia",
    title='Inertie (somme des variances des clusters)',
)
fig.show()



In [13]:
#@title
# Fit K-means with 3 clusters and a fixed random_state on the scaled features.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(array_train_data_kmeans)

# Store each row's cluster label next to its (unscaled) features.
df_train_data_kmeans['Cluster'] = kmeans.labels_

In [14]:
#@title

import plotly.graph_objects as go

# Names of the numeric features.
list_col_num = list(df_train_data_suffle_num.columns)
# Numeric features charted separately from the others (presumably because
# their values are on a smaller scale - confirm).
list_col_little = ['DataUsage', 'CustServCalls', 'OverageFee', 'RoamMins']
list_col_num_big = [col for col in list_col_num if col not in list_col_little]
# Remaining columns of the clustering frame: neither numeric nor the label column.
list_col_cat = [
    col
    for col in df_train_data_kmeans.columns
    if not (col in list_col_num or col == 'Cluster')
]

from typing import List

def return_viz_kmeans_compare(df_train_data_kmeans: pd.DataFrame, list_:List, *args) -> None:
  """
  Show a grouped bar chart comparing the per-cluster mean of the given columns.

  One bar series is drawn for each distinct label found in the 'Cluster'
  column (ascending order, named 'Cluster_<label>'), rather than assuming
  that exactly the labels 0, 1 and 2 exist.


  Parameters:
  ----------------------------
    df_train_data_kmeans: pd.DataFrame
    Frame holding the columns listed in `list_` plus a 'Cluster' label column.

    list_: List
    Names of the columns to average; also used as the x-axis categories.

    *args:
    Accepted for backward compatibility and ignored.


  Return:
  -----------------------------
    None. The figure is displayed with `fig.show()`.

  """
  columns = list(list_)
  # groupby sorts the labels, so the series order is Cluster_0, Cluster_1, ...
  cluster_means = df_train_data_kmeans.groupby('Cluster')[columns].mean()

  fig = go.Figure(data=[
      go.Bar(name=f'Cluster_{label}', x=columns, y=row.to_list())
      for label, row in cluster_means.iterrows()
  ])

  fig.update_layout(barmode='group')
  fig.show()



# Display names of the three K-means clusters (labels 0, 1 and 2); the last
# entry previously read 'Cluster_3', which matches no cluster.
list_clusteur = ['Cluster_0', 'Cluster_1', 'Cluster_2']
# One grouped bar chart per column group.
return_viz_kmeans_compare(df_train_data_kmeans , list_col_num_big)
return_viz_kmeans_compare(df_train_data_kmeans , list_col_little)
return_viz_kmeans_compare(df_train_data_kmeans , list_col_cat)