## Mini Project III

In [197]:
import pandas as pd
import numpy as np
import seaborn as sns

import sklearn
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


# Cleaning

Goal: create two separate customer segmentations using clustering, each splitting the customers into 3-5 clusters.

In [198]:
# Load the raw customer table; the source file is semicolon-separated.
customers = pd.read_csv(
    '../data/twm_customer.csv',
    sep=';',
)


In [199]:
# Preview the first rows to check that the file parsed into separate columns.
customers.head()

Unnamed: 0,cust_id,income,age,years_with_bank,nbr_children,gender,marital_status,name_prefix,first_name,last_name,street_nbr,street_name,postal_code,city_name,state_code
0,1362691,26150,46,5,1,M,2,,Donald ...,Marek ...,8298,Second ...,89194,Las Vegas,NV
1,1362487,6605,71,1,0,M,2,,ChingDyi ...,Moussavi ...,10603,Daffodil ...,90159,Los Angeles,CA
2,1363160,18548,38,8,0,F,1,,Rosa ...,Johnston ...,8817,Figueroa ...,90024,Los Angeles,CA
3,1362752,47668,54,3,0,F,1,,Lisa ...,Martin ...,676,Humble ...,90172,Los Angeles,CA
4,1362548,44554,59,9,2,F,4,,Barbara ...,O'Malley ...,6578,C ...,10138,New York City,NY


In [200]:
# Drop the name and address fields, which are not used as segmentation
# features, then summarise the remaining numeric columns.
customers = customers.drop(
    columns=[
        'name_prefix',
        'first_name',
        'last_name',
        'street_name',
        'street_nbr',
        'postal_code',
        'state_code',
    ]
)
customers.describe()

Unnamed: 0,cust_id,income,age,years_with_bank,nbr_children,marital_status
count,747.0,747.0,747.0,747.0,747.0,747.0
mean,1362991.0,22728.281124,42.47925,3.907631,0.714859,1.882195
std,292.5255,22207.221405,19.114879,2.675634,1.10341,0.892051
min,1362480.0,0.0,13.0,0.0,0.0,1.0
25%,1362732.0,7118.5,28.0,2.0,0.0,1.0
50%,1362993.0,17242.0,42.0,4.0,0.0,2.0
75%,1363244.0,31338.0,56.0,6.0,1.0,2.0
max,1363495.0,144157.0,89.0,9.0,5.0,4.0


In [201]:
# Count duplicated customer ids as (number of rows) - (number of distinct ids).
# Nothing is removed here; the count is only printed.
idsTotal = len(customers)
idsUnique = len(set(customers['cust_id']))
idsdupe = idsTotal - idsUnique
print(idsdupe)



0


In [202]:
# missing data
def checkNull(data, top_n=20):
    """Summarise missing values per column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    top_n : int, default 20
        Maximum number of columns to report (those with the most nulls first).

    Returns
    -------
    pandas.DataFrame
        Indexed by column name and sorted by null count in descending order,
        with two columns: ``total`` (number of null entries) and ``percent``
        (fraction of rows that are null, in the range 0-1; it is not
        multiplied by 100).
    """
    # Build the null mask once and derive both statistics from it.
    null_mask = data.isnull()
    missing_data = pd.DataFrame({
        'total': null_mask.sum(),
        'percent': null_mask.mean(),
    })
    # A single stable sort keeps both columns aligned and leaves tied columns
    # in their original order.
    ranked = missing_data.sort_values('total', ascending=False, kind='mergesort')
    return ranked.head(top_n)

# Null count and null fraction for every remaining customer column.
missing_data = checkNull(customers)
missing_data

Unnamed: 0,total,percent
cust_id,0,0.0
income,0,0.0
age,0,0.0
years_with_bank,0,0.0
nbr_children,0,0.0
gender,0,0.0
marital_status,0,0.0
city_name,0,0.0


# Scaling

In [203]:
# cust_id is an identifier rather than a feature, so remove it before scaling.
customers = customers.drop(columns=['cust_id'])

In [204]:
# customers['marital_status'] = customers['marital_status'].astype('category')
# customers

In [205]:


# Scaling
from sklearn.preprocessing import MinMaxScaler

# Numeric feature columns to scale. marital_status is stored as a number but is
# a categorical code, so it is excluded here and one-hot encoded separately.
num_feats = (
    customers.select_dtypes(include='number')
    .columns.drop('marital_status', errors='ignore')
    .tolist()
)

# Rescale every feature to the [0, 1] range. The column names and the original
# row index are attached explicitly so the scaled rows stay aligned with
# `customers` when other columns are concatenated to them later.
scaler = MinMaxScaler()
scaled_customers = pd.DataFrame(
    scaler.fit_transform(customers[num_feats].astype(float)),
    columns=num_feats,
    index=customers.index,
)



In [206]:
# Label the scaled columns with the numeric feature names, in the same order
# in which the features were passed to the scaler.

scaled_customers.columns = num_feats
scaled_customers

Unnamed: 0,income,age,years_with_bank,nbr_children
0,0.181399,0.434211,0.555556,0.2
1,0.045818,0.763158,0.111111,0.0
2,0.128665,0.328947,0.888889,0.0
3,0.330667,0.539474,0.333333,0.0
4,0.309066,0.605263,1.000000,0.4
...,...,...,...,...
742,0.102631,0.302632,0.666667,0.2
743,0.183043,0.565789,0.666667,0.2
744,0.425231,0.486842,0.000000,0.4
745,0.104747,0.315789,0.777778,0.0


In [207]:
# get dummy values for marital status

# NOTE(review): segmentation_cols is not used in the cells visible here, and
# 'gender' is listed but never encoded or added to scaled_customers - confirm intent.
segmentation_cols = ['income','age','years_with_bank','nbr_children','gender', 'marital_status']

# One-hot encode the marital status codes. The explicit integer dtype keeps the
# indicators as 0/1; newer pandas versions would otherwise produce booleans.
dummy_marital = pd.get_dummies(customers.marital_status, dtype='uint8')

# Start from the numeric features only, so that re-running this cell does not
# append a second copy of the dummy columns.
scaled_customers = pd.concat([scaled_customers[num_feats], dummy_marital], axis=1)


In [208]:
# The dummy columns are labelled with the integer status codes, so the mapping
# keys are ints rather than the strings '1'..'4'.
scaled_customers = scaled_customers.rename(
    columns={
        1: 'Marital_1',
        2: 'Marital_2',
        3: 'Marital_3',
        4: 'Marital_4',
    }
)


In [209]:
# Inspect the combined frame: scaled numeric features plus the renamed
# marital status indicator columns.
scaled_customers

Unnamed: 0,income,age,years_with_bank,nbr_children,Marital_1,Marital_2,Marital_3,Marital_4
0,0.181399,0.434211,0.555556,0.2,0,1,0,0
1,0.045818,0.763158,0.111111,0.0,0,1,0,0
2,0.128665,0.328947,0.888889,0.0,1,0,0,0
3,0.330667,0.539474,0.333333,0.0,1,0,0,0
4,0.309066,0.605263,1.000000,0.4,0,0,0,1
...,...,...,...,...,...,...,...,...
742,0.102631,0.302632,0.666667,0.2,0,0,0,1
743,0.183043,0.565789,0.666667,0.2,0,1,0,0
744,0.425231,0.486842,0.000000,0.4,0,1,0,0
745,0.104747,0.315789,0.777778,0.0,0,1,0,0


# Encoding

In [210]:
# # get dummy values for cities

# dummy_cities = pd.get_dummies(customers.city_name)
# scaled_customers = pd.concat([scaled_customers,dummy_cities], axis=1)

In [211]:
# scaled_customers

# PCA on all features

In [212]:
# def screePlot(data, n_components=10, title='Scree Plot'):
#     pca = PCA(n_components=n_components)
#     pca_fit = pca.fit(data)
#     pca_fit

#     PC_values = np.arange(pca.n_components_) + 1
#     PC_values

#     plt.plot(PC_values, pca.explained_variance_ratio_, 'o-')
#     plt.title(title)
#     plt.xlabel('Principal Component')
#     plt.ylabel('Variance Explained')

# screePlot(scaled_customers, title='Scree Plot w geo')

In [213]:
# Positional slice of the first four columns, i.e. the scaled numeric features
# without the marital status indicators.
scaled_customers.iloc[:,:4]

Unnamed: 0,income,age,years_with_bank,nbr_children
0,0.181399,0.434211,0.555556,0.2
1,0.045818,0.763158,0.111111,0.0
2,0.128665,0.328947,0.888889,0.0
3,0.330667,0.539474,0.333333,0.0
4,0.309066,0.605263,1.000000,0.4
...,...,...,...,...
742,0.102631,0.302632,0.666667,0.2
743,0.183043,0.565789,0.666667,0.2
744,0.425231,0.486842,0.000000,0.4
745,0.104747,0.315789,0.777778,0.0


In [214]:
# screePlot(scaled_customers.iloc[:,:4], n_components=4, title='Scree Plot w/o Geo')

In [215]:
#export
# Note: to_csv writes the row index as an unnamed first column by default.
scaled_customers.to_csv('../data/customers_scaled.csv')

In [216]:
# Final look at the frame that was written to disk.
scaled_customers

Unnamed: 0,income,age,years_with_bank,nbr_children,Marital_1,Marital_2,Marital_3,Marital_4
0,0.181399,0.434211,0.555556,0.2,0,1,0,0
1,0.045818,0.763158,0.111111,0.0,0,1,0,0
2,0.128665,0.328947,0.888889,0.0,1,0,0,0
3,0.330667,0.539474,0.333333,0.0,1,0,0,0
4,0.309066,0.605263,1.000000,0.4,0,0,0,1
...,...,...,...,...,...,...,...,...
742,0.102631,0.302632,0.666667,0.2,0,0,0,1
743,0.183043,0.565789,0.666667,0.2,0,1,0,0
744,0.425231,0.486842,0.000000,0.4,0,1,0,0
745,0.104747,0.315789,0.777778,0.0,0,1,0,0
