# Customer Segmentation - Voice


## Problem statement

###### “What are the identifiable customer segments based on their Life Value?”

## Hypothesis generation

Feature list considered for clustering
* Data M2O/M2M
* Time of the day
* Day of the week
* Incoming/Outgoing

## Data exploration / transformation

In [1]:
% pylab inline 

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read the sample data into df. 'Voice.csv' is a relative path, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv('Voice.csv')

In [3]:
# Show the first two rows of df.
df.head(n=2)

Unnamed: 0,ChargingPartyNumber,wkday,wkend,ofcTime,ngtTime,trvlTime,outgoing,incoming,m2m,m2o
0,142354223,118306,72052,35606,128773,25979,190358,56071,164699,25659
1,138848743,19551,10253,16396,4899,8509,29804,9887,22229,7575


In [5]:
# Work on an independent copy of the two direction columns. Later cells add
# and overwrite columns on dfd; the explicit .copy() makes it unambiguous
# that those writes never target df and avoids SettingWithCopyWarning.
dfd = df[['outgoing','incoming']].copy()

In [None]:
# Per-row total = outgoing + incoming; stored as a new 'total' column.
call_total = dfd['outgoing'] + dfd['incoming']
dfd.loc[:, 'total'] = call_total

In [7]:
# Preview the first five rows of dfd.
dfd.head()

Unnamed: 0,outgoing,incoming,total
0,190358,56071,246429
1,29804,9887,39691
2,4302,7189,11491
3,1115,25106,26221
4,2940,2178,5118


In [None]:
# Convert outgoing/incoming into each row's share of its total (axis=0
# aligns the divisor on the row index, i.e. row-wise division).
# A list is used as the column selector: a tuple is the key syntax for
# MultiIndex columns and is ambiguous on a flat column index. Assigning
# through [] replaces the integer columns with the float result instead of
# writing floats into integer storage via .loc.
# NOTE(review): a row whose total is 0 would yield NaN here — confirm the
# data contains none before these columns are clustered.
share_cols = ['outgoing','incoming']
dfd[share_cols] = dfd[share_cols].div(dfd['total'], axis=0)

In [9]:
# Re-display the first five rows of dfd to inspect the result of the division.
dfd.head()

Unnamed: 0,outgoing,incoming,total
0,0.772466,0.227534,246429
1,0.750901,0.249099,39691
2,0.37438,0.62562,11491
3,0.042523,0.957477,26221
4,0.574443,0.425557,5118


In [10]:
# Assign clustering: KMeans (k=4) on the outgoing/incoming columns of dfd.
# .copy() detaches the feature frame so the label column added below is not
# written into a slice of dfd (avoids SettingWithCopyWarning).
dfdc = dfd[['outgoing','incoming']].copy()
# Seed the estimator itself instead of calling random.seed(): the bare name
# `random` only refers to numpy.random because %pylab injected it; with the
# stdlib `random` module the seed would not reach KMeans at all.
# NOTE(review): the choice of 4 clusters is not justified in this notebook.
km = KMeans(n_clusters=4, random_state=123).fit(dfdc)

# cluster representation: per-cluster feature means (2 decimals) next to the
# number of rows in each cluster (the count column keeps the name 'cluster').
dfdc['cluster'] = km.labels_
by_cluster = dfdc.groupby('cluster')
clustergrp = pd.concat(
    [by_cluster.mean().round(2), by_cluster['cluster'].count()],
    axis=1,
)
print(clustergrp)

         outgoing  incoming  cluster
cluster                             
0            0.15      0.85   376899
1            0.56      0.44   643289
2            0.37      0.63   647970
3            0.78      0.22   329139


In [11]:
# Store the label from the outgoing/incoming clustering on every row of df.
# NOTE: `km` is rebound by each later clustering cell, so this cell must run
# while `km` still holds the outgoing/incoming fit.
df['cluster_voice_direction']=km.labels_

In [12]:
# Columns for the network clustering: m2m and m2o, plus outgoing, which is
# used as the filter and the divisor in the following cells.
dfn = df.loc[:, ['m2m','m2o','outgoing']]

In [13]:
# Keep only rows with outgoing > 0: the next cell divides by `outgoing`.
# .copy() detaches the filtered result so the column overwrite that follows
# does not raise SettingWithCopyWarning.
dfn = dfn.query('outgoing > 0').copy()
dfn.head()

Unnamed: 0,m2m,m2o,outgoing
0,164699,25659,190358
1,22229,7575,29804
2,0,4302,4302
3,1083,32,1115
4,2812,128,2940


In [14]:
# Express m2m/m2o as each row's share of its outgoing value (row-wise
# division via axis=0).
# A list replaces the tuple column selector (tuples are MultiIndex key
# syntax), and [] assignment swaps in the float columns instead of writing
# floats into integer storage through .loc.
net_cols = ['m2m','m2o']
dfn[net_cols] = dfn[net_cols].div(dfn['outgoing'], axis=0)

In [15]:
# Preview the first five rows of dfn after the division.
dfn.head()

Unnamed: 0,m2m,m2o,outgoing
0,0.865207,0.134793,190358
1,0.745839,0.254161,29804
2,0.0,1.0,4302
3,0.9713,0.0287,1115
4,0.956463,0.043537,2940


In [16]:
# Assign clustering: KMeans (k=4) on the m2o/m2m columns of dfn.
# .copy() detaches the feature frame so adding the label column below does
# not write into a slice of dfn (avoids SettingWithCopyWarning).
dfnc = dfn[['m2o','m2m']].copy()
# Seed the estimator directly; the bare `random` name is only numpy.random
# because %pylab injected it, so random.seed() is fragile outside pylab.
# NOTE(review): the choice of 4 clusters is not justified in this notebook.
km = KMeans(n_clusters=4, random_state=123).fit(dfnc)

# cluster representation: per-cluster feature means (2 decimals) next to the
# number of rows in each cluster (the count column keeps the name 'cluster').
dfnc['cluster'] = km.labels_
by_cluster = dfnc.groupby('cluster')
clustergrp = pd.concat(
    [by_cluster.mean().round(2), by_cluster['cluster'].count()],
    axis=1,
)
print(clustergrp)

          m2o   m2m  cluster
cluster                     
0        0.30  0.70   549705
1        0.83  0.17   280149
2        0.54  0.46   423561
3        0.08  0.92   720293


In [17]:
# Default every row to the 'NA' placeholder, then overwrite the rows with
# outgoing > 0 with the labels from the current `km` fit.
# Labels are assigned positionally, so this mask must select the same rows,
# in the same order, as the filter used to build the clustering input.
has_outgoing = df['outgoing'] > 0
df['cluster_voice_network'] = 'NA'
df.loc[has_outgoing, 'cluster_voice_network'] = km.labels_

In [18]:
# Show the first five rows of df, including the cluster columns added so far.
df.head(5)

Unnamed: 0,ChargingPartyNumber,wkday,wkend,ofcTime,ngtTime,trvlTime,outgoing,incoming,m2m,m2o,cluster_voice_direction,cluster_voice_network
0,142354223,118306,72052,35606,128773,25979,190358,56071,164699,25659,3,3
1,138848743,19551,10253,16396,4899,8509,29804,9887,22229,7575,3,0
2,139598327,2519,1783,2090,1134,1078,4302,7189,0,4302,2,1
3,119956243,425,690,651,341,123,1115,25106,1083,32,0,3
4,134551681,2159,781,776,1612,552,2940,2178,2812,128,1,3


In [19]:
# Columns for the time-of-day clustering: ofcTime, ngtTime and trvlTime,
# plus outgoing, which is used as the filter and the divisor below.
dft = df.loc[:, ['ofcTime','ngtTime','trvlTime','outgoing']]

In [20]:
# Keep only rows with outgoing > 0: the next cell divides by `outgoing`.
# .copy() detaches the filtered result so the column overwrite that follows
# does not raise SettingWithCopyWarning.
dft = dft.query('outgoing > 0').copy()
dft.head()

Unnamed: 0,ofcTime,ngtTime,trvlTime,outgoing
0,35606,128773,25979,190358
1,16396,4899,8509,29804
2,2090,1134,1078,4302
3,651,341,123,1115
4,776,1612,552,2940


In [21]:
# Express ofcTime/ngtTime/trvlTime as each row's share of its outgoing value
# (row-wise division via axis=0).
# A list replaces the tuple column selector (tuples are MultiIndex key
# syntax), and [] assignment swaps in the float columns instead of writing
# floats into integer storage through .loc.
time_cols = ['ofcTime','ngtTime','trvlTime']
dft[time_cols] = dft[time_cols].div(dft['outgoing'], axis=0)

In [22]:
# Assign clustering: KMeans (k=7) on the ofcTime/ngtTime/trvlTime columns.
# .copy() detaches the feature frame so adding the label column below does
# not write into a slice of dft (avoids SettingWithCopyWarning).
dftc = dft[['ofcTime','ngtTime','trvlTime']].copy()
# Seed the estimator directly; the bare `random` name is only numpy.random
# because %pylab injected it, so random.seed() is fragile outside pylab.
# NOTE(review): the choice of 7 clusters is not justified in this notebook.
km = KMeans(n_clusters=7, random_state=123).fit(dftc)

# cluster representation: per-cluster feature means (2 decimals) next to the
# number of rows in each cluster (the count column keeps the name 'cluster').
dftc['cluster'] = km.labels_
by_cluster = dftc.groupby('cluster')
clustergrp = pd.concat(
    [by_cluster.mean().round(2), by_cluster['cluster'].count()],
    axis=1,
)
print(clustergrp)

         ofcTime  ngtTime  trvlTime  cluster
cluster                                     
0           0.77     0.05      0.17   207531
1           0.47     0.26      0.27   327971
2           0.45     0.10      0.45   446932
3           0.25     0.09      0.66   140760
4           0.22     0.56      0.21   119340
5           0.60     0.08      0.31   506281
6           0.29     0.30      0.40   224893


In [23]:
# Default every row to the 'NA' placeholder, then overwrite the rows with
# outgoing > 0 with the labels from the current `km` fit.
# Labels are assigned positionally, so this mask must select the same rows,
# in the same order, as the filter used to build the clustering input.
has_outgoing = df['outgoing'] > 0
df['cluster_voice_time'] = 'NA'
df.loc[has_outgoing, 'cluster_voice_time'] = km.labels_

In [24]:
# Columns for the day-of-week clustering: wkday and wkend, plus outgoing,
# which is used as the filter and the divisor below.
dfw = df.loc[:, ['wkday','wkend','outgoing']]

In [25]:
# Keep only rows with outgoing > 0: the next cell divides by `outgoing`.
# .copy() detaches the filtered result so the column overwrite that follows
# does not raise SettingWithCopyWarning.
dfw = dfw.query('outgoing > 0').copy()
dfw.head()

Unnamed: 0,wkday,wkend,outgoing
0,118306,72052,190358
1,19551,10253,29804
2,2519,1783,4302
3,425,690,1115
4,2159,781,2940


In [26]:
# Express wkday/wkend as each row's share of its outgoing value (row-wise
# division via axis=0).
# A list replaces the tuple column selector (tuples are MultiIndex key
# syntax), and [] assignment swaps in the float columns instead of writing
# floats into integer storage through .loc.
day_cols = ['wkday','wkend']
dfw[day_cols] = dfw[day_cols].div(dfw['outgoing'], axis=0)

In [27]:
# Assign clustering: KMeans (k=5) on the wkday/wkend columns of dfw.
# .copy() detaches the feature frame so adding the label column below does
# not write into a slice of dfw (avoids SettingWithCopyWarning).
dfwc = dfw[['wkday','wkend']].copy()
# Seed the estimator directly; the bare `random` name is only numpy.random
# because %pylab injected it, so random.seed() is fragile outside pylab.
# NOTE(review): the choice of 5 clusters is not justified in this notebook.
km = KMeans(n_clusters=5, random_state=123).fit(dfwc)

# cluster representation: per-cluster feature means (2 decimals) next to the
# number of rows in each cluster (the count column keeps the name 'cluster').
dfwc['cluster'] = km.labels_
by_cluster = dfwc.groupby('cluster')
clustergrp = pd.concat(
    [by_cluster.mean().round(2), by_cluster['cluster'].count()],
    axis=1,
)
print(clustergrp)

         wkday  wkend  cluster
cluster                       
0         0.90   0.10   271088
1         0.68   0.32   682980
2         0.30   0.69    48607
3         0.78   0.22   686459
4         0.55   0.44   284574


In [28]:
# Default every row to the 'NA' placeholder, then overwrite the rows with
# outgoing > 0 with the labels from the current `km` fit.
# Labels are assigned positionally, so this mask must select the same rows,
# in the same order, as the filter used to build the clustering input.
has_outgoing = df['outgoing'] > 0
df['cluster_voice_day_of_week'] = 'NA'
df.loc[has_outgoing, 'cluster_voice_day_of_week'] = km.labels_

In [29]:
# Show the first five rows of df with all four cluster label columns.
df.head()

Unnamed: 0,ChargingPartyNumber,wkday,wkend,ofcTime,ngtTime,trvlTime,outgoing,incoming,m2m,m2o,cluster_voice_direction,cluster_voice_network,cluster_voice_time,cluster_voice_day_of_week
0,142354223,118306,72052,35606,128773,25979,190358,56071,164699,25659,3,3,4,1
1,138848743,19551,10253,16396,4899,8509,29804,9887,22229,7575,3,0,5,1
2,139598327,2519,1783,2090,1134,1078,4302,7189,0,4302,2,1,1,4
3,119956243,425,690,651,341,123,1115,25106,1083,32,0,3,1,2
4,134551681,2159,781,776,1612,552,2940,2178,2812,128,1,3,4,3


In [30]:
# Write df, including the cluster label columns, to CSV without the index;
# round(0) rounds the numeric columns to 0 decimals first.
# NOTE(review): the input was read from 'Voice.csv'. On a case-insensitive
# filesystem 'voice.csv' names the same file, so this write would overwrite
# the source data — confirm that is intended or pick a distinct output name.
df.round(0).to_csv('voice.csv',index=False)