#### TELECOM CUSTOMER CHURN : DATA 428 PROJECT
# 3(B). Creating Ye-Johnson transformed data and Saving the File

- For use by other Notebooks
- All initial steps of data manipulation are same as Notebooks No. 2 & 3
- Resorted to this due to a clash of sci-kit learn versions :-)


In [1]:
import pandas as pd
import numpy as np
import scipy as sp 
import scipy.stats as sps
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [108]:
# Using dataset with better column names

data = pd.read_csv("telecom_data_better_colnames.csv")

In [109]:
#data.drop('Unnamed: 0', axis=1, inplace=True)
data.head()

Unnamed: 0,avg_mthly_rev,avg_no_of_mthly_mins,avg_total_mthly_recurring_charge,avg_no_of_directory_assisted_calls,avg_overage_mins,avg_overage_rev,avg_rev_of_voice_overage,avg_rev_of_data_overage,avg_no_of_roaming_calls,percntg_change_in_mthly_mins_vs_previous_three_month_avg,...,foreign_travel_dummy_variable,ethnicity_roll_to_up_code,child_0_to_2_yrs_in_house,child_3_to_5_yrs_in_house,child_6_to_10_yrs_in_house,child_11_to_15_yrs_in_house,child_16_to_17_yrs_in_house,credit_card_indicator,no_of_days_of_current_equipment,cust_id
0,23.9975,219.25,22.5,0.2475,0.0,0.0,0.0,0.0,0.0,-157.25,...,0.0,N,U,U,U,U,U,Y,,1000001
1,57.4925,482.75,37.425,0.2475,22.75,9.1,9.1,0.0,0.0,532.25,...,0.0,Z,U,U,U,U,U,Y,240.0,1000002
2,16.99,10.25,16.99,0.0,0.0,0.0,0.0,0.0,0.0,-4.25,...,0.0,N,U,Y,U,U,U,Y,1504.0,1000003
3,38.0,7.5,38.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,...,0.0,U,Y,U,U,U,U,Y,1812.0,1000004
4,55.23,570.5,71.98,0.0,0.0,0.0,0.0,0.0,0.0,38.5,...,0.0,I,U,U,U,U,U,Y,434.0,1000005


In [110]:
data.shape

(100000, 100)

## Data Wrangling - following issues to be addressed
    - 'geographic_area' : names can be made better to avoid errors (learnt after making mistakes initially)
    - drop rows with outliers 
    - drop columns with very high 'nan' / blanks
    - drop columns like 'child_between_0_to_2_years_in_house' 

In [112]:
data["geogrpahic_area"].unique()

array(['NORTHWEST/ROCKY MOUNTAIN AREA', 'CHICAGO AREA',
       'GREAT LAKES AREA', 'NEW ENGLAND AREA', 'DALLAS AREA',
       'CENTRAL/SOUTH TEXAS AREA', 'TENNESSEE AREA', 'MIDWEST AREA',
       'PHILADELPHIA AREA', 'OHIO AREA', 'HOUSTON AREA', 'SOUTHWEST AREA',
       'NEW YORK CITY AREA', 'ATLANTIC SOUTH AREA', 'SOUTH FLORIDA AREA',
       'CALIFORNIA NORTH AREA', 'DC/MARYLAND/VIRGINIA AREA',
       'NORTH FLORIDA AREA', nan, 'LOS ANGELES AREA'], dtype=object)

In [113]:
geo_area = []
for i in range(len(data)):
    item = data["geogrpahic_area"][i]
    #print(item)
    if type(item) is float:
        geo_area.append(item)
        #print(item)
    else:
        name = item.replace(' ', '_')
        name = name.replace('/', '_')
        geo_area.append(name)

In [114]:
data.drop("geogrpahic_area", inplace=True, axis=1)

In [115]:
data["geogrpahic_area"] = geo_area

In [116]:
data["geogrpahic_area"].unique()

array(['NORTHWEST_ROCKY_MOUNTAIN_AREA', 'CHICAGO_AREA',
       'GREAT_LAKES_AREA', 'NEW_ENGLAND_AREA', 'DALLAS_AREA',
       'CENTRAL_SOUTH_TEXAS_AREA', 'TENNESSEE_AREA', 'MIDWEST_AREA',
       'PHILADELPHIA_AREA', 'OHIO_AREA', 'HOUSTON_AREA', 'SOUTHWEST_AREA',
       'NEW_YORK_CITY_AREA', 'ATLANTIC_SOUTH_AREA', 'SOUTH_FLORIDA_AREA',
       'CALIFORNIA_NORTH_AREA', 'DC_MARYLAND_VIRGINIA_AREA',
       'NORTH_FLORIDA_AREA', nan, 'LOS_ANGELES_AREA'], dtype=object)

In [117]:
data.shape

(100000, 100)

### Creating a dataset by deleting all 'nan' / 'blanks'  --> original_data_without_NA.csv

In [118]:
original_data_without_NA = data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

In [119]:
original_data_without_NA = original_data_without_NA.reset_index()
original_data_without_NA.drop('index', axis = 1, inplace=True) 

In [120]:
original_data_without_NA.shape

(26704, 100)

In [121]:
# Saving to a file

#original_data_without_NA.to_csv('original_data_without_NA.csv')

#### no_of_unique_subscribers_in_the_house 
    - This feature has numeric discrete values and some outliers like 196 unique users in a house.
    - It's either a wrong entry or a 'close user group' (CUG) scenario of an organisation. 
    - Hence, dropping any entry with more than 20 subscribers in the house (making concessions for a hostel)

In [122]:
data['no_of_unique_subscribers_in_the_house'].unique()

array([  2,   1,   3,   5,   4,   6,   7,   9,   8,  12,  11,  10,  13,
        18, 196], dtype=int64)

In [123]:
for i in range(len(data)): 
    
    value = data['no_of_unique_subscribers_in_the_house'][i]
       
    if value > 20:
        data.drop(data.index[i], inplace = True)   
data = data.reset_index()
data.drop('index', axis = 1, inplace=True)

In [124]:
for i in range(len(data)): 
    
    value_2 = data['no_of_active_subscribers_in_house'][i]
       
    if value_2 > 20:
        data.drop(data.index[i], inplace = True)    
        
data = data.reset_index()
data.drop('index', axis = 1, inplace=True)   

In [125]:
data.shape

(99999, 100)

### Dropping features with large missing values
    - 'known_no_of_vehicles' - 49.3% values missing (refer the data exploration notebook)

In [126]:
data.drop('known_no_of_vehicles', inplace=True, axis = 1)

In [127]:
data.drop('cust_id', inplace=True, axis = 1)

In [128]:
data.shape

(99999, 98)

### FEATURES WITH 'U'  : Assuming 'U' to be 'Unknown' or 'Missing Value'

    - new_cell_phone_user : 66.9% 'U' values, which are as good as missing values
    - dualband :  "Y", "N", "T", "U".  Where 'U' is only 0.02%

In [129]:
data.new_cell_phone_user.value_counts()

U    66913
Y    19301
N    13785
Name: new_cell_phone_user, dtype: int64

In [130]:
data.drop('new_cell_phone_user', inplace=True, axis = 1)

In [131]:
# Need to decide - what to do with 'U'
data.dualband.value_counts()

Y    72264
N    23196
T     4316
U      222
Name: dualband, dtype: int64

In [132]:
data['dualband'] = data['dualband'].str.replace('U','Y')

In [133]:
# need to decide what to do with 'UNKW'
data.handset_web_capability.value_counts()

WCMB    75732
WC      13843
UNKW      235
Name: handset_web_capability, dtype: int64

In [134]:
data['handset_web_capability'] = data['handset_web_capability'].str.replace('UNKW', 'WCMB')

In [135]:
data.handset_web_capability.value_counts()

WCMB    75967
WC      13843
Name: handset_web_capability, dtype: int64

In [136]:
# Leaving 'U' as such because no information is available about the categories.
data.marital_status.value_counts()

U    37332
M    31052
S    17627
B     7116
A     5140
Name: marital_status, dtype: int64

### Dropping five columns of 'children in house' as data very sparse and not clear

In [137]:
list_of_children = ['child_0_to_2_yrs_in_house', 'child_3_to_5_yrs_in_house',
                   'child_6_to_10_yrs_in_house', 'child_11_to_15_yrs_in_house',
                   'child_16_to_17_yrs_in_house']

In [138]:
#redundant_unknown_features = ["truck_indicator", "rv_indicator", "foreign_travel_dummy_variable", "infobase_match"]

In [139]:
def drop_feature(drop_list):
    for feature in drop_list:
        data.drop(feature, inplace=True, axis = 1)

In [140]:
drop_feature(list_of_children)

In [141]:
#drop_feature(redundant_unknown_features)

In [142]:
data.shape

(99999, 92)

### Now that we have removed some redundant columns, lets remove all 'nan' and create another dataset for analysis - dataset_without_NA_2.csv

In [143]:
dataset_without_NA_2 = data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

In [144]:
dataset_without_NA_2 = dataset_without_NA_2.reset_index()
dataset_without_NA_2.drop('index', axis = 1, inplace=True)

In [145]:
dataset_without_NA_2.shape

(36132, 92)

In [146]:
# Saving to a file

#dataset_without_NA_2.to_csv("dataset_without_NA_2.csv")

### Moving towards Separating columns which appear numeric (continuous), but are actually factors or discrete numbers

In [147]:
numeric_df = data.corr()

In [148]:
list_numeric_cols = list(numeric_df.columns)
data[list_numeric_cols].describe()

Unnamed: 0,avg_mthly_rev,avg_no_of_mthly_mins,avg_total_mthly_recurring_charge,avg_no_of_directory_assisted_calls,avg_overage_mins,avg_overage_rev,avg_rev_of_voice_overage,avg_rev_of_data_overage,avg_no_of_roaming_calls,percntg_change_in_mthly_mins_vs_previous_three_month_avg,...,current_handset_price,no_of_handsets_issued,no_of_models_issued,truck_indicator,rv_indicator,length_of_residence,no_of_adults_in_house,estimated_income,foreign_travel_dummy_variable,no_of_days_of_current_equipment
count,99642.0,99642.0,99642.0,99642.0,99642.0,99642.0,99642.0,99642.0,99642.0,99108.0,...,99152.0,99998.0,99998.0,98267.0,98267.0,69810.0,76981.0,74564.0,98267.0,99997.0
mean,58.719894,513.565091,46.178746,0.888837,41.072659,13.559696,13.295195,0.26132,1.286418,-13.933959,...,101.875479,1.787126,1.545831,0.188822,0.082581,6.177238,2.530326,5.783112,0.057975,391.932238
std,46.291901,525.168255,23.623288,2.177628,97.296552,30.501008,30.056211,3.126547,14.711447,276.088898,...,61.005913,1.313981,0.898398,0.391369,0.275249,4.735267,1.452819,2.182132,0.233697,256.484711
min,-6.1675,0.0,-26.915,0.0,0.0,0.0,0.0,0.0,0.0,-3875.0,...,9.989998,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,-5.0
25%,33.26,150.75,30.0,0.0,0.0,0.0,0.0,0.0,0.0,-87.0,...,29.98999,1.0,1.0,0.0,0.0,2.0,1.0,4.0,0.0,212.0
50%,48.195,355.625,44.99,0.2475,2.75,1.0,0.6825,0.0,0.0,-6.25,...,99.98999,1.0,1.0,0.0,0.0,5.0,2.0,6.0,0.0,342.0
75%,70.75,703.0,59.99,0.99,42.0,14.4375,14.025,0.0,0.235,63.0,...,149.98999,2.0,2.0,0.0,0.0,9.0,3.0,7.0,0.0,530.0
max,3843.2625,12206.75,409.99,159.39,4320.75,1102.4,896.0875,423.54,3685.2,31219.25,...,499.98999,28.0,16.0,1.0,1.0,15.0,6.0,9.0,1.0,1823.0


### Complete List of numeric features which are not continuous

    - List used for imputing with 'mode' for these features

In [149]:
list_of_numeric_factors = ["no_of_unique_subscribers_in_the_house",
                           "no_of_active_subscribers_in_house",
                           "no_of_handsets_issued",
                           "no_of_models_issued",
                           "no_of_adults_in_house",  
                           "estimated_income",
                           "length_of_residence", 
                           "truck_indicator",
                           "rv_indicator",
                           "foreign_travel_dummy_variable"]

In [150]:
numeric_features_set = set(list_numeric_cols) - set(list_of_numeric_factors)
numeric_features = list(numeric_features_set)
len(numeric_features)

67

In [151]:
categorical_features_set = set(list(data.columns)) - set(list_numeric_cols)
categorical_features = list(categorical_features_set)
len(categorical_features)

15

In [152]:
cat_required = ["AA", "A", "BA", "CA", "EA"]
cat_with_others = []

for i in range(len(data)):
    cat = data.credit_class_code[i]
    
    if cat in cat_required:
        cat_with_others.append(cat)
        
    if cat not in cat_required:
        cat_with_others.append("others")
    

In [153]:
data.credit_class_code = cat_with_others

In [154]:
ethnicity_code = []
ethnicity_code_required = ["X", "P", "M", "R", "D", "B", "F"]

for i in range(len(data)):
    cat = data.ethnicity_roll_to_up_code[i]
    
    if cat in ethnicity_code_required:
        ethnicity_code.append("others")
        
    if cat not in ethnicity_code_required:
        ethnicity_code.append(cat)
        

In [155]:
data['ethnicity_roll_to_up_code'] = ethnicity_code

In [156]:
data['ethnicity_roll_to_up_code'].unique()

array(['N', 'Z', 'U', 'I', 'S', 'others', 'J', 'H', 'G', 'O', nan, 'C'],
      dtype=object)

In [157]:
data.credit_class_code.unique()

array(['A', 'EA', 'others', 'BA', 'CA', 'AA'], dtype=object)

In [158]:
data.shape

(99999, 92)

In [84]:
from sklearn.preprocessing import PowerTransformer

In [185]:
pt = PowerTransformer(method='yeo-johnson')   
# default is 'yeo-johnson'  or  one could specify ‘box-cox’, which, only works with strictly positive values

In [104]:
# An Example
data_example = [[1, 2], [3, 2], [4, 5]]

In [105]:
print(pt.fit(data_example))

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)


In [106]:
print(pt.lambdas_)

[1.38668178e+00 5.93926346e-09]


In [107]:
print(pt.transform(data_example))

[[-1.31616039 -0.70710678]
 [ 0.20998268 -0.70710678]
 [ 1.1061777   1.41421356]]


#### Now applying Yeo - Johnson to numeric_df with 66 columns

In [186]:
pt.fit(numeric_df)

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

In [187]:
pt.lambdas_

array([1.28986050e-01, 5.44486940e-01, 3.07388701e-01, 1.76777193e-11,
       2.06223386e-01, 3.60996838e-02, 4.06526191e-10, 3.36604396e-10,
       3.13917026e-01, 2.22507940e-01, 4.38028786e-10, 4.93717304e-11,
       1.51281587e-01, 3.04441576e-01, 2.71644928e-01, 4.08674587e-10,
       2.81383473e-01, 1.36754133e-01, 2.89284906e-01, 2.94009648e-01,
       1.04257333e-02, 3.66248260e-02, 6.54754060e-02, 3.44317410e-11,
       1.55823061e-10, 3.44317410e-11, 1.41014893e-11, 1.87551127e-12,
       9.16289314e-01, 2.02793097e-01, 3.44317410e-11, 1.49424727e-01,
       3.99534495e-10, 5.59934839e-01, 1.76770243e-01, 3.01773427e-01,
       2.54951513e-01, 1.73703740e-10, 2.02600563e-09, 5.65392930e-02,
       5.54974060e-11, 3.52238343e-10, 2.84936540e-01, 3.04916169e-02,
       3.76379860e-09, 2.69343337e-10, 1.51449830e-09, 3.28345967e-08,
       8.74639986e-11, 3.44317410e-11, 1.03628886e-01, 1.44560415e-01,
       1.77224321e-10, 1.27490477e-01, 3.44317410e-11, 4.98730714e-01,
      

In [188]:
yeo_johnson_data = pt.transform(numeric_df)

In [190]:
len(yeo_johnson_data)

99999

In [191]:
yeo_johnson_data_df = pd.DataFrame(data=yeo_johnson_data)

In [192]:
yeo_johnson_data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,-1.066156,0.826798,-0.32882,-0.448723,-0.209734,-1.54791,-0.069862,-1.052027,-0.413039,-1.206385,...,-0.677761,-1.240796,-0.140061,-0.415984,-0.518636,-0.219844,-0.741817,-0.454277,-0.325282,-0.185105
1,0.243078,,0.929355,-0.448723,1.854992,0.525417,-0.069862,-0.076471,0.960921,0.842557,...,1.070984,0.309002,-0.140061,0.954359,1.862811,-0.219844,-0.299461,-0.267534,0.937005,1.211198
2,-1.535475,-1.250579,-1.320873,-0.448723,1.747181,-1.54791,-0.069862,-1.052027,-1.273309,0.114208,...,-0.677761,0.199245,-0.140061,-1.274621,0.077599,-0.219844,-1.592736,-0.977491,-1.319491,-1.505747
3,-0.399079,-1.250579,-1.469194,-0.448723,-0.471355,-1.408959,-0.069862,-1.052027,-1.538104,-1.250859,...,-0.677761,-1.423545,-0.140061,-1.539066,0.087814,-0.219844,-0.378824,-0.977491,-1.468012,-1.124656
4,0.179097,0.826798,0.578185,-0.448723,1.060499,0.016563,-0.069862,-0.855021,0.781651,-0.457148,...,1.656787,-0.359494,-0.140061,0.775613,0.226827,-0.219844,0.197655,-0.454277,0.584568,0.74918


In [193]:
yeo_johnson_data_df.to_csv("yeo_johnson_data_df.csv")

###  THIS NOTEBOOK ENDS HERE 