In [159]:
import numpy as np
import pandas as pd
import re
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas deprecation and chained-assignment
# warnings are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
%matplotlib inline

In [160]:
# Load the raw dataset from CSV.
# Date-like columns are deliberately left as strings (object dtype): the previous
# `parse_dates=True` only asks pandas to try parsing the *index*, and no index column
# is configured here, so it never converted anything and has been removed.
df = pd.read_csv('telecom_churn_data.csv')

In [161]:
# Summarise the frame: index range, column count, dtype breakdown and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Columns: 226 entries, mobile_number to sep_vbc_3g
dtypes: float64(179), int64(35), object(12)
memory usage: 172.4+ MB


In [162]:
# Preview the first five rows
df.head()

Unnamed: 0,mobile_number,circle_id,loc_og_t2o_mou,std_og_t2o_mou,loc_ic_t2o_mou,last_date_of_month_6,last_date_of_month_7,last_date_of_month_8,last_date_of_month_9,arpu_6,...,sachet_3g_9,fb_user_6,fb_user_7,fb_user_8,fb_user_9,aon,aug_vbc_3g,jul_vbc_3g,jun_vbc_3g,sep_vbc_3g
0,7000842753,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,197.385,...,0,1.0,1.0,1.0,,968,30.4,0.0,101.2,3.58
1,7001865778,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,34.047,...,0,,1.0,1.0,,1006,0.0,0.0,0.0,0.0
2,7001625959,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,167.69,...,0,,,,1.0,1103,0.0,0.0,4.17,0.0
3,7001204172,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,221.338,...,0,,,,,2491,0.0,0.0,0.0,0.0
4,7000142493,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,261.636,...,0,0.0,,,,1526,0.0,0.0,0.0,0.0


In [163]:
# Report the frame's dimensions as a (rows, columns) tuple
df.shape

(99999, 226)

In [164]:
# Number of distinct values in the 'mobile_number' column
df['mobile_number'].unique().size

99999

In [165]:
# Every mobile number is unique, so use 'mobile_number' as the row index.
# verify_integrity=True makes pandas raise a ValueError if the new index contains
# duplicates, so the uniqueness assumption is enforced rather than only eyeballed.
df = df.set_index('mobile_number', verify_integrity=True)

In [166]:
# Count the columns in which every single value is missing
print('The number of columns with all values missing/NA are:\r\n', df.isna().all(axis=0).sum())

The number of columns with all values missing/NA are:
 0


In [167]:
# Count the rows in which every single value is missing
print('The number of rows with all values missing/NA are:\r\n', df.isna().all(axis=1).sum())

The number of rows with all values missing/NA are:
 0


In [168]:
# State the first imputation assumption: missing minutes-of-usage are treated as zero
print(" ".join((
    "Assumption 1: If minutes of usage values are missing, assume that the customers have not made calls.",
    "So, impute 0 (zero) where the minutes of usage (mou, ic,og) are missing.",
)))

Assumption 1: If minutes of usage values are missing, assume that the customers have not made calls. So, impute 0 (zero) where the minutes of usage (mou, ic,og) are missing.


In [169]:
# State the second imputation assumption: missing recharge values are treated as zero
print(" ".join((
    "Assumption 2: If the recharge values are missing, the customers have not recharged.",
    "So, impute 0 (zero) where the recharge values (total_rech, max_rech, count_rech, av_rech) are missing.",
)))

Assumption 2: If the recharge values are missing, the customers have not recharged. So, impute 0 (zero) where the recharge values (total_rech, max_rech, count_rech, av_rech) are missing.


In [170]:
# State the third imputation assumption: missing special-pack flags are treated as zero.
# The second sentence previously repeated the recharge columns from Assumption 2; it now
# names the pack indicator columns (matched by 'user' in the zero-fill regex) instead.
print("Assumption 3: If the values for special packs such as Night Packs, FB Packs are missing, customers did not buy such packs.",
      "So, impute 0 (zero) where the special pack indicators (night_pck_user, fb_user) are missing.")

Assumption 3: If the values for special packs such as Night Packs, FB Packs are missing, customers did not buy such packs. So, impute 0 (zero) where the recharge values (total_rech, max_rech, count_rech) are missing.


In [174]:
# Apply the three assumptions: replace missing values with 0 in every column whose
# name matches one of the usage / recharge / pack-user patterns.
zero_fill_cols = df.filter(regex="(mou|og|ic|total_rech|max_rech|count_rech|av_rech|user)").columns
df[zero_fill_cols] = df[zero_fill_cols].fillna(value=0)

In [175]:
# List the columns that still contain missing values, with the number missing in each
null_columns = df.columns[df.isna().any(axis=0)]
print('Following is the count of null values in each column:')
print(df.loc[:, null_columns].isna().sum())

Following is the count of null values in each column:
last_date_of_month_7          601
last_date_of_month_8         1100
last_date_of_month_9         1659
date_of_last_rech_6          1607
date_of_last_rech_7          1767
date_of_last_rech_8          3622
date_of_last_rech_9          4760
date_of_last_rech_data_6    74846
date_of_last_rech_data_7    74428
date_of_last_rech_data_8    73660
date_of_last_rech_data_9    74077
arpu_3g_6                   74846
arpu_3g_7                   74428
arpu_3g_8                   73660
arpu_3g_9                   74077
arpu_2g_6                   74846
arpu_2g_7                   74428
arpu_2g_8                   73660
arpu_2g_9                   74077
dtype: int64


In [176]:
# Combined recharge amount over months 6 and 7, summed row-wise
df['total'] = df.loc[:, ['total_rech_amt_6', 'total_rech_amt_7']].sum(axis='columns')

In [177]:
# 70th percentile of the combined recharge amount, used as the cut-off for the row filter.
# A scalar q makes quantile() return a scalar directly; the previous list form returned a
# one-element Series, and calling float() on a Series is deprecated in recent pandas.
threshold = float(df['total'].quantile(0.70))

In [178]:
# Keep only the rows whose combined recharge amount is strictly above the threshold.
# .copy() makes the result an independent frame, so adding columns to it later is a
# plain assignment rather than a write to a slice of the original (SettingWithCopyWarning).
df = df.loc[df['total'] > threshold].copy()

In [179]:
# Dimensions after keeping only the rows whose 'total' exceeds the threshold
df.shape

(29979, 226)

In [180]:
# Label each customer: churn = 1 when the call condition and the data condition both
# hold for month 9, otherwise 0.
# NOTE(review): the call condition is an OR, so a customer with zero incoming minutes but
# non-zero outgoing minutes (or vice versa) is still flagged when data usage is zero —
# confirm this is the intended definition rather than requiring both to be zero.
no_ic_or_no_og = df['total_ic_mou_9'].eq(0.0) | df['total_og_mou_9'].eq(0.0)
no_data_usage = df['vol_2g_mb_9'].eq(0.0) & df['vol_3g_mb_9'].eq(0.0)
df['churn'] = np.where(no_ic_or_no_og & no_data_usage, 1, 0)

In [183]:
# Count customers per churn label (1 = flagged as churn, 0 = not flagged)
df['churn'].value_counts()

0    27089
1     2890
Name: churn, dtype: int64

In [184]:
# Show the columns whose name contains '9' — the candidates for removal
df.columns[df.columns.str.contains('9')]

Index(['last_date_of_month_9', 'arpu_9', 'onnet_mou_9', 'offnet_mou_9',
       'roam_ic_mou_9', 'roam_og_mou_9', 'loc_og_t2t_mou_9',
       'loc_og_t2m_mou_9', 'loc_og_t2f_mou_9', 'loc_og_t2c_mou_9',
       'loc_og_mou_9', 'std_og_t2t_mou_9', 'std_og_t2m_mou_9',
       'std_og_t2f_mou_9', 'std_og_t2c_mou_9', 'std_og_mou_9', 'isd_og_mou_9',
       'spl_og_mou_9', 'og_others_9', 'total_og_mou_9', 'loc_ic_t2t_mou_9',
       'loc_ic_t2m_mou_9', 'loc_ic_t2f_mou_9', 'loc_ic_mou_9',
       'std_ic_t2t_mou_9', 'std_ic_t2m_mou_9', 'std_ic_t2f_mou_9',
       'std_ic_t2o_mou_9', 'std_ic_mou_9', 'total_ic_mou_9', 'spl_ic_mou_9',
       'isd_ic_mou_9', 'ic_others_9', 'total_rech_num_9', 'total_rech_amt_9',
       'max_rech_amt_9', 'date_of_last_rech_9', 'last_day_rch_amt_9',
       'date_of_last_rech_data_9', 'total_rech_data_9', 'max_rech_data_9',
       'count_rech_2g_9', 'count_rech_3g_9', 'av_rech_amt_data_9',
       'vol_2g_mb_9', 'vol_3g_mb_9', 'arpu_3g_9', 'arpu_2g_9',
       'night_pck_user

In [185]:
# Drop every column whose name contains '9'.
# NOTE(review): 'sep_vbc_3g' has no '9' in its name and is therefore kept — confirm
# whether it should be removed together with the month-9 columns.
nine_columns = df.columns[df.columns.str.contains('9')]
df = df.drop(columns=nine_columns)

In [186]:
# Dimensions after dropping every column whose name contains '9'
df.shape

(29979, 173)