In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import math

import wrangle
import env

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

In [47]:
# Load the working DataFrame through the project-local wrangle module
# (presumably prepared Telco customer records — confirm in wrangle.py).
df = wrangle.wrangle_telco()

In [48]:
# Check dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1685 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1685 non-null object
monthly_charges    1685 non-null float64
tenure             1685 non-null int64
total_charges      1685 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 145.8+ KB


In [68]:
# Move customer_id out of the data columns and into the row index.
# NOTE(review): df is rebound to its own transform, so this cell is not
# idempotent — running it a second time fails because customer_id is no
# longer a column.
df = df.set_index('customer_id')

In [69]:
# Confirm that customer_id is now the index and no longer a data column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1685 entries, 0013-SMEOE to 9995-HOTOH
Data columns (total 3 columns):
monthly_charges    1685 non-null float64
tenure             1685 non-null int64
total_charges      1685 non-null float64
dtypes: float64(2), int64(1)
memory usage: 52.7+ KB


In [70]:
# Keep 80% of the rows for training and hold out the rest for testing;
# random_state pins the shuffle so the split is the same on every run.
train, test = train_test_split(df, train_size = .80, random_state = 123)

In [71]:
# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
train.info()
# The last expression is rendered as the cell output: 10 random training rows.
train.sample(10)

<class 'pandas.core.frame.DataFrame'>
Index: 1348 entries, 0707-HOVVN to 9108-EQPNQ
Data columns (total 3 columns):
monthly_charges    1348 non-null float64
tenure             1348 non-null int64
total_charges      1348 non-null float64
dtypes: float64(2), int64(1)
memory usage: 42.1+ KB
None


Unnamed: 0_level_0,monthly_charges,tenure,total_charges
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8807-OPMBM,25.65,55,1388.0
8766-PAFNE,79.1,71,5564.85
1150-WFARN,108.75,67,7156.2
5680-LQOGP,82.45,68,5646.6
2804-ETQDK,20.55,66,1357.1
4475-NVTLU,19.2,45,903.7
9880-TDQAC,60.0,72,4264.0
2516-XSJKX,78.45,41,3126.45
8591-TKMZH,111.1,59,6555.2
6950-TWMYB,79.95,54,4362.05


In [72]:
# Check the size and dtypes of the held-out test split.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 337 entries, 1927-QEWMY to 7815-PDTHL
Data columns (total 3 columns):
monthly_charges    337 non-null float64
tenure             337 non-null int64
total_charges      337 non-null float64
dtypes: float64(2), int64(1)
memory usage: 10.5+ KB


In [73]:
def standard_scaler(train, test):
    """Scale train and test with a StandardScaler fitted on train only.

    Args:
        train: DataFrame used to fit the scaler and then transformed.
        test: DataFrame transformed with the parameters learned from ``train``.

    Returns:
        A ``(scaler, train_scaled, test_scaled)`` tuple. ``scaler`` is the
        fitted StandardScaler; each scaled DataFrame keeps the columns and
        index of its input, including the index name.
    """
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(train)
    # transform() returns a bare array, so the labels are re-attached here.
    # Handing the Index objects to the constructor keeps their names intact.
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
    return scaler, train_scaled, test_scaled

In [74]:
# Fit on train, transform both splits, and keep the fitted scaler object
# so the transformation can be inverted later.
scaler_standard, train_scaled, test_scaled = standard_scaler(train, test)

In [76]:
# Spot-check 10 random standardized training rows (no random_state, so the
# rows shown differ between runs).
train_scaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
1970-KKFWL,-1.082812,-1.277216,-1.14501
8414-OOEEL,0.534735,0.500083,0.566221
8402-OOOHJ,-1.159084,-0.933222,-1.130209
3070-BDOQC,1.114692,0.15609,0.85601
7608-RGIRO,-1.051152,0.270754,-0.905422
1052-QJIBV,-1.18067,0.786745,-0.91153
9114-DPSIA,0.577908,0.844077,0.78169
8590-YFFQO,0.115957,0.729412,0.304479
5442-PPTJY,-1.186427,-2.595857,-1.354587
8966-SNIZF,-1.193622,0.729412,-0.948018


In [79]:
# Spot-check 10 random standardized test rows.
test_scaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
4641-FROLU,-1.19794,-1.563877,-1.220555
7570-WELNY,0.684401,0.614748,0.766539
4335-UPJSI,-1.184988,-1.90787,-1.261323
6733-LRIZX,1.382364,-1.277216,0.110852
0343-QLUZP,-0.613666,0.15609,-0.511944
8071-SBTRN,1.445684,0.729412,1.554856
7729-XBTWX,-1.184988,0.500083,-0.932069
6999-CHVCF,1.185208,0.614748,1.326044
6177-PEVRA,-0.156032,-0.531897,-0.433035
6609-MXJHJ,-1.031004,-1.563877,-1.147947


In [84]:
# Confirm which scaler class the returned object is.
type(scaler_standard)

sklearn.preprocessing.data.StandardScaler

In [89]:
def scale_inverse(train_scaled, test_scaled, scaler):
    """Map scaled train and test frames back to their original units.

    Args:
        train_scaled: Scaled training DataFrame.
        test_scaled: Scaled test DataFrame.
        scaler: Fitted object exposing ``inverse_transform``.

    Returns:
        A ``(train_unscaled, test_unscaled)`` tuple of DataFrames, each with
        the same columns and index as the corresponding scaled input.
    """
    # Each result is labelled from its own input frame, so the function does
    # not depend on any notebook-level variables being defined.
    train_unscaled = pd.DataFrame(
        scaler.inverse_transform(train_scaled),
        columns=train_scaled.columns,
        index=train_scaled.index,
    )
    test_unscaled = pd.DataFrame(
        scaler.inverse_transform(test_scaled),
        columns=test_scaled.columns,
        index=test_scaled.index,
    )
    return train_unscaled, test_unscaled

In [91]:
# Round-trip check: invert the standard scaling on both splits.
train_unscaled, test_unscaled = scale_inverse(train_scaled, test_scaled, scaler_standard)

In [95]:
# Spot-check that the inverted values are back in the original units.
train_unscaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
2351-BKRZW,75.2,43.0,3254.35
6614-YWYSC,25.0,61.0,1501.75
5322-TEUJK,114.6,71.0,8100.25
6813-GZQCG,24.65,45.0,1171.3
4883-KCPZJ,25.25,22.0,555.4
0320-DWVTU,99.5,53.0,5424.25
1302-TPUBN,19.35,66.0,1240.8
1447-PJGGA,95.25,57.0,5464.65
2097-YVPKN,25.75,65.0,1654.75
8859-DZTGQ,20.35,33.0,689.75


In [96]:
def uniform_scaler(train, test):
    """Map train and test to a uniform distribution with a QuantileTransformer.

    The transformer is fitted on ``train`` only and then applied to both
    frames.

    Args:
        train: DataFrame used to fit the transformer and then transformed.
        test: DataFrame transformed with the quantiles learned from ``train``.

    Returns:
        A ``(scaler, train_scaled, test_scaled)`` tuple. ``scaler`` is the
        fitted QuantileTransformer; each scaled DataFrame keeps the columns
        and index of its input, including the index name.
    """
    # random_state is fixed so the fit is repeatable between runs.
    scaler = QuantileTransformer(n_quantiles=100, output_distribution='uniform', random_state=123, copy=True).fit(train)
    # transform() returns a bare array, so the labels are re-attached here.
    # Handing the Index objects to the constructor keeps their names intact.
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
    return scaler, train_scaled, test_scaled

In [97]:
# Re-scale both splits with the quantile (uniform) transformer.
# NOTE(review): the result is bound to scaler_standard although it is a
# QuantileTransformer, and it overwrites the earlier StandardScaler along
# with train_scaled/test_scaled — consider a dedicated name.
scaler_standard, train_scaled, test_scaled = uniform_scaler(train, test)

In [98]:
# Spot-check 10 random training rows after the uniform quantile transform.
train_scaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
8039-EQPIM,0.469885,0.641414,0.534751
2612-PHGOX,0.586113,0.489899,0.617428
0707-HOVVN,0.58071,0.686869,0.657682
5456-ITGIC,0.500606,1.0,0.594095
8069-RHUXK,0.381647,0.575758,0.427976
5181-OABFK,0.479141,0.338384,0.470799
9802-CAQUT,0.906876,1.0,0.941057
8792-AOROI,0.513414,0.020202,0.071324
3910-MRQOY,0.030303,1.0,0.322863
7242-EDTYC,0.050505,0.191919,0.116282


In [99]:
# Spot-check 10 random test rows after the uniform quantile transform.
test_scaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
6177-PEVRA,0.437973,0.242424,0.431037
5171-EPLKN,0.106061,0.093434,0.059535
4159-NAAIX,0.795231,0.469697,0.780306
0229-LFJAF,0.542871,1.0,0.628719
9174-IHETN,0.549313,0.262626,0.485101
6461-SZMCV,0.715849,0.752525,0.796528
8148-WOCMK,0.060606,0.020202,0.007737
4827-DPADN,0.661873,1.0,0.7544
7876-DNYAP,0.094474,0.641414,0.310601
8838-GPHZP,0.214646,0.469697,0.258593


In [100]:
def gaussian_scaler(train, test):
    """Apply a Yeo-Johnson PowerTransformer fitted on train to train and test.

    Args:
        train: DataFrame used to fit the transformer and then transformed.
        test: DataFrame transformed with the parameters learned from ``train``.

    Returns:
        A ``(scaler, train_scaled, test_scaled)`` tuple. ``scaler`` is the
        fitted PowerTransformer; each transformed DataFrame keeps the columns
        and index of its input, including the index name.
    """
    # standardize=False: only the power transform is applied, without the
    # extra zero-mean/unit-variance step on the transformed output.
    scaler = PowerTransformer(method='yeo-johnson', standardize=False, copy=True).fit(train)
    # transform() returns a bare array, so the labels are re-attached here.
    # Handing the Index objects to the constructor keeps their names intact.
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
    return scaler, train_scaled, test_scaled

In [101]:
# Re-transform both splits with the Yeo-Johnson power transformer.
# NOTE(review): the result is bound to scaler_standard although it is a
# PowerTransformer, and it overwrites the previous scaler and scaled
# frames — consider a dedicated name.
scaler_standard, train_scaled, test_scaled = gaussian_scaler(train, test)

In [102]:
# Spot-check 10 random training rows after the power transform.
train_scaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
2530-ENDWQ,12.412695,12741.647495,116.824861
6386-SZZKH,13.246686,5278.438356,105.597416
5840-NVDCG,10.705596,388.906749,50.989292
1525-LNLOJ,10.330428,10707.431111,94.751175
4807-IZYOZ,5.898182,5803.252275,48.905188
8540-ZQGEA,5.806603,6645.72529,50.719214
6479-VDGRK,11.884403,13173.537555,113.011631
3948-KXDUF,10.742563,10707.431111,97.413457
9418-RUKPH,5.791185,2895.424894,42.414328
7622-FWGEW,11.907031,7244.884373,101.15747


In [103]:
# Spot-check 10 random test rows after the power transform.
test_scaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
8565-WUXZU,11.851979,13173.537555,113.117728
2333-KWEWW,5.806603,508.964971,30.797685
6741-QRLUP,11.554803,9585.291088,102.802425
6621-NRZAK,5.798899,9585.291088,52.985622
8908-NMQTX,10.506876,13173.537555,98.613172
4827-DPADN,11.754008,13173.537555,112.271219
0868-VJRDR,13.025447,9951.257113,116.990336
2001-EWBQU,13.062666,8535.252443,114.740528
6199-IPCAO,6.666582,1535.680436,40.6668
3967-VQOGC,6.506178,11097.741537,61.861144


In [104]:
def min_max_scaler(train, test):
    """Rescale train and test with a MinMaxScaler fitted on train only.

    Args:
        train: DataFrame used to fit the scaler and then transformed.
        test: DataFrame transformed with the min/max learned from ``train``.

    Returns:
        A ``(scaler, train_scaled, test_scaled)`` tuple. ``scaler`` is the
        fitted MinMaxScaler; each scaled DataFrame keeps the columns and
        index of its input, including the index name.
    """
    # The (0, 1) range is defined by train's min/max; test values beyond
    # those bounds will land outside that range.
    scaler = MinMaxScaler(copy=True, feature_range=(0,1)).fit(train)
    # transform() returns a bare array, so the labels are re-attached here.
    # Handing the Index objects to the constructor keeps their names intact.
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
    return scaler, train_scaled, test_scaled

In [105]:
# Re-scale both splits to the 0-1 range with the min-max scaler.
# NOTE(review): the result is bound to scaler_standard although it is a
# MinMaxScaler, and it overwrites the previous scaler and scaled frames —
# consider a dedicated name.
scaler_standard, train_scaled, test_scaled = min_max_scaler(train, test)

In [106]:
# Spot-check 10 random training rows after min-max scaling.
train_scaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
0973-KYVNF,0.520678,1.0,0.576831
7602-MVRMB,0.917289,1.0,0.929081
4652-ODEVH,0.060289,0.619718,0.123427
9087-EYCPR,0.06577,0.830986,0.1716
8590-YFFQO,0.463876,0.971831,0.52044
9031-ZVQPT,0.894868,1.0,0.903856
3258-SYSWS,0.950673,1.0,0.904457
2300-RQGOI,0.016442,0.521127,0.08335
9053-JZFKV,0.974589,0.929577,0.89365
3133-PZNSR,0.790732,1.0,0.805729


In [107]:
# Spot-check 10 random test rows after min-max scaling.
test_scaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
5696-CEIQJ,0.844544,0.929577,0.794622
3352-RICWQ,0.013453,0.112676,0.021995
5208-FVQKB,0.493772,0.971831,0.536725
7134-HBPBS,0.893871,1.0,0.896164
0771-CHWSK,0.56004,0.915493,0.552242
7609-YBPXG,0.684604,0.746479,0.544937
8265-HKSOW,0.887394,1.0,0.882786
8148-WOCMK,0.011958,0.098592,0.012095
7729-XBTWX,0.013453,0.915493,0.153044
2595-KIWPV,0.012955,0.366197,0.056512


In [108]:
def iqr_robust_scaler(train, test):
    """Scale train and test with a RobustScaler fitted on train only.

    The scaler is configured with centering and scaling enabled and a
    25th-75th percentile quantile range.

    Args:
        train: DataFrame used to fit the scaler and then transformed.
        test: DataFrame transformed with the parameters learned from ``train``.

    Returns:
        A ``(scaler, train_scaled, test_scaled)`` tuple. ``scaler`` is the
        fitted RobustScaler; each scaled DataFrame keeps the columns and
        index of its input, including the index name.
    """
    scaler = RobustScaler(quantile_range=(25.0,75.0), copy=True, with_centering=True, with_scaling=True).fit(train)
    # transform() returns a bare array, so the labels are re-attached here.
    # Handing the Index objects to the constructor keeps their names intact.
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
    return scaler, train_scaled, test_scaled

In [109]:
# Re-scale both splits with the IQR-based robust scaler.
# NOTE(review): the result is bound to scaler_standard although it is a
# RobustScaler, and it overwrites the previous scaler and scaled frames —
# consider a dedicated name.
scaler_standard, train_scaled, test_scaled = iqr_robust_scaler(train, test)

In [110]:
# Spot-check 10 random training rows after robust scaling.
train_scaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
2030-BTZRO,-0.658683,-2.636364,-0.77002
2230-XTUWL,-0.671407,-0.727273,-0.605331
3563-SVYLG,-0.026198,0.181818,0.136103
5220-AGAAX,-0.60479,0.181818,-0.439538
1813-JYWTO,0.240269,0.363636,0.425208
5995-LFTLE,-0.585329,-0.272727,-0.479864
3174-AKMAS,-0.002994,-0.818182,-0.153957
2190-PHBHR,0.44985,0.363636,0.676588
1302-TPUBN,-0.674401,0.090909,-0.529445
5893-PYOLZ,0.260479,-0.363636,0.169784


In [111]:
# Spot-check 10 random test rows after robust scaling.
test_scaled.sample(10)

Unnamed: 0,monthly_charges,tenure,total_charges
5320-BRKGK,0.170659,-0.272727,0.144584
0769-MURVM,0.019461,0.363636,0.178467
7267-FRMJW,-0.663174,-1.772727,-0.689676
1354-YZFNB,-0.671407,-2.727273,-0.778257
4335-UPJSI,-0.668413,-1.818182,-0.687118
4137-JOPHL,0.378743,-0.636364,0.120966
6917-FIJHC,-0.57485,0.363636,-0.415825
9068-FHQHD,-0.662425,-1.090909,-0.627834
6953-PBDIN,0.127994,0.272727,0.324994
6695-AMZUF,0.33009,0.272727,0.470342
