In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from acquire import wrangle_zillow
from prepare import remove_outliers, train_val_test, x_y_split, mm_scaler, ss_scaler, rs_scaler

import warnings
warnings.filterwarnings('ignore')

from scipy.stats import stats
from env import get_connection
import os

In [2]:
def get_mall_data():
    
    '''
    This function is used to get mall customers data from sql database.
    '''
    
    if os.path.isfile('mall_customers.csv'):
        
        return pd.read_csv('mall_customers.csv')
    
    else:
        
        url = get_connection('mall_customers')
        query = '''
                SELECT * FROM customers;
                '''
        df = pd.read_sql(query, url)
        df.to_csv('mall_customers.csv')
        return df

In [3]:
df = get_mall_data()

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,gender,age,annual_income,spending_score
0,0,1,Male,19,15,39
1,1,2,Male,21,15,81
2,2,3,Female,20,16,6
3,3,4,Female,23,16,77
4,4,5,Female,31,17,40


In [5]:
def mall_prep(df):
    
    df.drop(columns=['Unnamed: 0'], inplace=True)
    
    df = df.rename(columns={'annual_income':'income',
                            'spending_score':'spending'})
    
    df['total_score'] = df['income'] - df['spending']
    
    df, var_fences = remove_outliers(df)
    
    df = pd.get_dummies(df, columns=['gender'])
    
    X_train, y_train, X_val, y_val, X_test, y_test = x_y_split(df, 'total_score')
    
    return X_train, y_train, X_val, y_val, X_test, y_test

In [6]:
def mall_scaling(X_train, X_val, X_test, scaler='mm'):
    
    col_list = ['age','income','spending']
    
    if scaler == 'ss':
        X_train, X_val, X_test = ss_scaler(X_train, X_val, X_test, col_list)
    elif scaler == 'rs':
        X_train, X_val, X_test = rs_scaler(X_train, X_val, X_test, col_list)
    else:
        X_train, X_val, X_test = mm_scaler(X_train, X_val, X_test, col_list)
    
    return X_train, X_val, X_test

In [7]:
X_train, y_train, X_val, y_val, X_test, y_test = mall_prep(df)

gender column ignored


In [18]:
X_train_scaled, X_val_scaled, X_test_scaled = mall_scaling(X_train, X_val, X_test, 'ss')

In [19]:
X_train_scaled.head()

Unnamed: 0,customer_id,age,income,spending,gender_Female,gender_Male
18,19,0.888887,-1.663121,-0.918492,0,1
110,111,1.801661,0.118958,0.119581,0,1
125,126,-0.585594,0.430822,1.247921,1,0
0,1,-1.428155,-2.019537,-0.467156,0,1
138,139,-1.428155,0.60903,-1.77603,0,1


In [10]:
X_train.shape, X_val.shape, X_test.shape

((97, 6), (33, 6), (33, 6))

In [11]:
X_train.head()

Unnamed: 0,customer_id,age,income,spending,gender_Female,gender_Male
18,19,0.888887,-1.663121,-0.918492,0,1
110,111,1.801661,0.118958,0.119581,0,1
125,126,-0.585594,0.430822,1.247921,1,0
0,1,-1.428155,-2.019537,-0.467156,0,1
138,139,-1.428155,0.60903,-1.77603,0,1


In [12]:
df.rename(columns={'annual_income':'income',
                   'spending_score':'spending'})

Unnamed: 0,customer_id,gender,age,income,spending
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


In [13]:
df.drop(columns=['Unnamed: 0'], inplace=True)

KeyError: "['Unnamed: 0'] not found in axis"

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
for col in df.columns:
    
    plt.hist(df[col])
    plt.title(f'distribution of {col}')
    plt.show()

In [None]:
df, var_fences = remove_outliers(df)

In [None]:
df.shape

In [None]:
train, val, test = train_val_test(df)

In [None]:
train.shape, val.shape, test.shape

In [None]:
train = pd.get_dummies(train, columns=['gender'])

In [None]:
train.head()

In [None]:
train.shape