In [34]:
#basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import env

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


## Acquire data from the customers table in the mall_customers database.


In [4]:

def get_connection(db, user=env.user, host=env.host, password=env.password):
    '''
    Builds the mysql+pymysql connection URL string for the given database name.

    parameters: db (database/schema name); user, host and password default to
    the values stored in the local env module.

    returns: the connection URL as a string

    NOTE(review): user and password are interpolated verbatim, so characters
    such as '@', ':' or '/' inside them are not escaped.
    '''
    credentials = f'{user}:{password}'
    location = f'{host}/{db}'
    return 'mysql+pymysql://' + credentials + '@' + location

def get_mallcustomer_data():
    '''
    Loads every field of the customers table from the mall_customers schema,
    using the connection string produced by get_connection (host and
    credentials come from the env module).

    parameters: None

    returns: a single Pandas DataFrame indexed by the customer_id column
    '''
    query = 'SELECT * FROM customers;'
    url = get_connection('mall_customers')
    customers = pd.read_sql(query, url)
    customers = customers.set_index('customer_id')
    return customers

In [5]:
# Load the customers table into a DataFrame indexed by customer_id.
df = get_mallcustomer_data()

Summarize the data (include distributions and descriptive statistics).


In [6]:
# examine row by row basics:

def nulls_by_row(df):
    '''
    Tabulates how many rows share each level of missingness.

    parameters: df, a Pandas DataFrame (the index may have any name or none)

    returns: a DataFrame with one row per distinct missing-value count and the
    columns num_cols_missing, percent_cols_missing and count (number of rows
    with that many missing columns)
    '''
    num_missing = df.isnull().sum(axis=1)
    prnt_miss = num_missing / df.shape[1] * 100
    rows_missing = pd.DataFrame({'num_cols_missing': num_missing, 'percent_cols_missing': prnt_miss})

    # size() counts rows per group without relying on the index being called
    # 'customer_id', so the 'count' column is produced for any DataFrame.
    return (
        rows_missing
        .groupby(['num_cols_missing', 'percent_cols_missing'])
        .size()
        .reset_index(name='count')
    )

In [7]:
def nulls_by_col(df):
    '''
    Reports missing values per column.

    parameters: df, a Pandas DataFrame

    returns: a DataFrame indexed by the column names of df with
    num_rows_missing (count of nulls in the column) and
    percent_rows_missing (that count as a percentage of all rows)
    '''
    missing_counts = df.isna().sum()
    missing_percent = missing_counts / len(df) * 100
    return pd.DataFrame({
        'num_rows_missing': missing_counts,
        'percent_rows_missing': missing_percent,
    })

In [8]:
def summarize(df):
    '''
    Prints a plain-text report about a DataFrame: its head (as markdown),
    info(), describe(), the null summaries from nulls_by_col and nulls_by_row,
    and value counts for every column.

    parameters: df, a Pandas DataFrame

    returns: None (everything is written to stdout)
    '''
    print('DaataFrame head:\n')
    print(df.head().to_markdown())
    print('-----')
    print('DataFrame info:\n')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() only added a stray "None" line to the report.
    df.info()
    print('---')
    print('DataFrame describe:\n')
    print(df.describe())
    print('---')
    print('DataFrame null value asssessment:\n')
    print('Nulls By Column:', nulls_by_col(df))
    print('----')
    print('Nulls By Row:', nulls_by_row(df))
    # value_counts(bins=...) only works on numeric data, so bin only the
    # columns pandas classifies as numbers; everything else (object, bool,
    # datetime, category) gets plain value counts.
    numerical_cols = set(df.select_dtypes('number').columns)
    print('value_counts: \n')
    for col in df.columns:
        print(f'Column Names: {col}')
        if col in numerical_cols:
            print(df[col].value_counts(bins=10, sort=False))
        else:
            print(df[col].value_counts())
        # Separator after every column so the sections are delimited uniformly.
        print('---')
    print('Report Finished')

In [9]:
# Print the summary report for the customer data.
summarize(df)

DaataFrame head:

|   customer_id | gender   |   age |   annual_income |   spending_score |
|--------------:|:---------|------:|----------------:|-----------------:|
|             1 | Male     |    19 |              15 |               39 |
|             2 | Male     |    21 |              15 |               81 |
|             3 | Female   |    20 |              16 |                6 |
|             4 | Female   |    23 |              16 |               77 |
|             5 | Female   |    31 |              17 |               40 |
-----
DataFrame info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   gender          200 non-null    object
 1   age             200 non-null    int64 
 2   annual_income   200 non-null    int64 
 3   spending_score  200 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 7.8+ KB
None
---
DataFrame desc

Detect outliers using IQR.


In [10]:
def get_upper_outliers(s, k=1.5):
    '''
    Measures how far each value of a numeric Series lies above the upper IQR
    fence, q3 + k * (q3 - q1).

    parameters: s, a numeric Pandas Series; k, the IQR multiplier (default 1.5)

    returns: a float Series aligned with s holding 0 for values at or below
    the fence and the distance above the fence otherwise
    '''
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    upper_bound = q3 + k*iqr

    # Vectorized replacement for a per-element max(x - upper_bound, 0); the
    # result is always float instead of int when no value exceeds the fence.
    return (s - upper_bound).clip(lower=0)

In [11]:
def add_upper_outlier_columns(df, k=1.5):
    '''
    Adds a <column>_upper_outliers column for every numeric column of df,
    filled by get_upper_outliers with the given k.

    parameters: df, a Pandas DataFrame (modified in place); k, the IQR
    multiplier passed through to get_upper_outliers

    returns: the same DataFrame object, with the new columns added
    '''
    suffix = '_upper_outliers'
    for col in df.select_dtypes('number'):
        # Skip columns this function created on an earlier run; otherwise
        # re-running the cell would add *_upper_outliers_upper_outliers columns.
        if str(col).endswith(suffix):
            continue
        df[col + suffix] = get_upper_outliers(df[col], k)
    return df

In [15]:
# Add a <column>_upper_outliers column for each numeric column (default k=1.5).
# The function assigns the new columns onto df itself and returns it.
df = add_upper_outlier_columns(df)

In [18]:
# Preview the first rows, including the new *_upper_outliers columns.
df.head()

Unnamed: 0_level_0,gender,age,annual_income,spending_score,age_upper_outliers,annual_income_upper_outliers,spending_score_upper_outliers
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Male,19,15,39,0,0.0,0
2,Male,21,15,81,0,0.0,0
3,Female,20,16,6,0,0.0,0
4,Female,23,16,77,0,0.0,0
5,Female,31,17,40,0,0.0,0


In [22]:
# Describe only the positive entries of each *_outliers column, i.e. the
# values that actually lie above the upper fence.
outlier_cols = [c for c in df.columns if c.endswith('_outliers')]
for col in outlier_cols:
    print(col, ':\n')
    above_fence = df[col] > 0
    subset = df.loc[above_fence, col]
    print(subset.describe())

age_upper_outliers :

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: age_upper_outliers, dtype: float64
annual_income_upper_outliers :

count    2.00
mean     4.25
std      0.00
min      4.25
25%      4.25
50%      4.25
75%      4.25
max      4.25
Name: annual_income_upper_outliers, dtype: float64
spending_score_upper_outliers :

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: spending_score_upper_outliers, dtype: float64


Split data into train, validate, and test.


In [27]:
def split_train_test_validate(df, test_size=.2, validate_size=.3, random_state=514):
    '''
    Splits df into train, validate and test subsets with two successive
    train_test_split calls, then prints the three resulting shapes.

    parameters:
        df: the DataFrame to split
        test_size: share of df held out as the test set (default .2)
        validate_size: share of the remaining rows held out as the validate
            set (default .3)
        random_state: seed used for both splits (default 514)

    returns: train, validate, test
    '''
    train_validate, test = train_test_split(df, test_size=test_size, random_state=random_state)
    train, validate = train_test_split(train_validate, test_size=validate_size, random_state=random_state)
    print(train.shape, validate.shape, test.shape)
    return train, validate, test

In [28]:
# Split df into train / validate / test; the function also prints the three shapes.
train, validate, test = split_train_test_validate(df)

(112, 7) (48, 7) (40, 7)


Encode categorical columns using a one hot encoder (pd.get_dummies).


In [29]:
# NOTE(review): select_dtypes returns a DataFrame holding the non-numeric
# columns, not a list of column names, despite the variable's name.
categorical_cols = df.select_dtypes(exclude='number')


In [31]:
# One-hot encode df and rebind the name to the encoded frame.
# NOTE(review): train, validate and test were split from df in an earlier
# cell, so they do not receive these dummy columns.
df = pd.get_dummies(df)

Handle missing values.


In [32]:
def handle_missing_values(df, prop_required_columns=0.5, prop_required_row=0.75):
    '''
    Drops columns, then rows, that do not contain enough non-null values.

    parameters:
        df: a Pandas DataFrame
        prop_required_columns: share of rows that must be non-null for a
            column to be kept (default 0.5)
        prop_required_row: share of the remaining columns that must be
            non-null for a row to be kept (default 0.75)

    returns: a new DataFrame without the dropped columns and rows
    '''
    # A column is judged by how many of its rows are populated.
    threshold = int(round(prop_required_columns * len(df.index), 0))
    df = df.dropna(axis=1, thresh=threshold)
    # A row is judged by how many of the remaining columns are populated, so
    # the threshold is based on the column count; basing it on the row count
    # dropped every row whenever the frame had more rows than columns.
    threshold = int(round(prop_required_row * len(df.columns), 0))
    df = df.dropna(axis=0, thresh=threshold)

    return df

In [33]:
# The returned frame is only displayed here; it is not assigned, so df itself
# is left unchanged by this cell.
handle_missing_values(df)

Scaling


In [35]:
def scale_split_data(train, validate, test):
    '''
    Min-max scales the numeric columns of the three splits.

    The scaler is fit on train only and then applied to train, validate and
    test, so validate and test do not influence the scaling parameters.
    Non-numeric columns are passed through untouched.

    parameters: train, validate, test - Pandas DataFrames with the same columns

    returns: train_scaled, validate_scaled, test_scaled - copies of the inputs
    with their numeric columns replaced by the scaled values
    '''
    numeric_cols = train.select_dtypes('number').columns
    if len(numeric_cols) == 0:
        # Nothing to scale; still hand back copies so callers never alias the inputs.
        return train.copy(), validate.copy(), test.copy()

    # MinMaxScaler(train) would only construct a scaler with the frame passed
    # as feature_range; the scaler has to be fit and then used to transform.
    scaler = MinMaxScaler().fit(train[numeric_cols])

    def _scale(frame):
        # Wrap the transformed array in a DataFrame so it is written back
        # column by column with matching index and column labels.
        scaled_values = pd.DataFrame(
            scaler.transform(frame[numeric_cols]),
            columns=numeric_cols,
            index=frame.index,
        )
        scaled = frame.copy()
        scaled[numeric_cols] = scaled_values
        return scaled

    return _scale(train), _scale(validate), _scale(test)

In [36]:
# Pass the three splits to scale_split_data and keep the three objects it returns.
train_scaled, validate_scaled, test_scaled = scale_split_data(train, validate, test)