In [None]:
# Import the dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting libraries.
warnings.filterwarnings(action='ignore')
# Use seaborn's white background with grid lines for all subsequent figures.
sns.set_style(style="whitegrid")

In [None]:
# Read the customer records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')
# Show the loaded frame as the cell output
df

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes
df.info()

All of the variables are numerical except Gender, which is a categorical variable.

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Column labels of the dataset
df.columns

Missing Values

In [None]:
# Count the missing entries in every column (a result of 0 means the column is complete)
missing_per_column = df.isna().sum()
missing_per_column

There are no missing values in this dataset, so let's move on to Exploratory Data Analysis.

In [None]:
# How many customers fall into each Gender category
df['Gender'].value_counts()

In [None]:
# Bar chart of the number of customers per gender.
# The column is passed via the `x` keyword: seaborn >= 0.12 no longer accepts
# data variables positionally (the first positional slot is `data`), so the
# positional form fails there; the keyword form works on old and new versions.
sns.countplot(x='Gender', data=df)

In [None]:
# Share of each gender as a pie chart; the counts are computed once and
# reused for both the wedge sizes and the wedge labels.
gender_counts = df['Gender'].value_counts()
plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%')

In [None]:
# Number of customers at each age, split by gender.
plt.figure(figsize=(16,9))
# Pass the column by name through `x=`: handing the Series over positionally
# together with `data=df` is rejected by seaborn >= 0.12.
sns.countplot(x='Age', hue='Gender', data=df)
# Rotate the tick labels so the many distinct ages stay readable.
plt.xticks(rotation=90)
plt.title("AGE VS GENDER")

In [None]:
# Number of customers at each spending score, split by gender.
plt.figure(figsize=(17,9))
# `x=` keyword instead of a positional column name: seaborn >= 0.12 treats the
# first positional argument as `data`, which clashes with `data=df`.
sns.countplot(x='Spending Score (1-100)', hue='Gender', data=df)
# Rotate the tick labels so the many distinct scores stay readable.
plt.xticks(rotation=90)
plt.title("SPENDING SCORE (1-100) VS GENDER")

In [None]:
# Number of customers at each annual income level, split by gender.
plt.figure(figsize=(17,9))
# `x=` keyword instead of a positional column name: seaborn >= 0.12 treats the
# first positional argument as `data`, which clashes with `data=df`.
sns.countplot(x='Annual Income (k$)', hue='Gender', data=df)
# Rotate the tick labels so the many distinct income values stay readable.
plt.xticks(rotation=90)
plt.title("ANNUAL INCOME (k$) VS GENDER")

Let's take a look at the relationship between these variables.

In [None]:
# Scatter of spending score against annual income, coloured by gender.
sns.relplot(
    data=df,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    hue='Gender',
)
plt.xticks(rotation=90)
plt.title("Relationships between Variable")

Let's look at the correlations between the features.

In [None]:
# Replace the Gender labels with integer category codes so the column can
# take part in numeric computations.
# Codes follow the sorted category order: Female=0 and Male=1
# (assumes these are the only two labels in the column).
gender_codes = df['Gender'].astype('category').cat.codes
df['Gender'] = gender_codes

In [None]:
# Pairwise Pearson correlation between all columns of the frame
corr = df.corr(method='pearson')

In [None]:
# Annotated heatmap of the correlation matrix, labelled on both axes with
# the column names of `corr`.
feature_names = corr.columns
sns.heatmap(
    corr,
    annot=True,
    xticklabels=feature_names,
    yticklabels=feature_names,
)

In [None]:
# Grid of pairwise scatter plots (and per-column distributions) for every column
sns.pairplot(data=df)

### Customer Segmentation

Let's divide these variables into independent variables and a dependent variable.

In [None]:
# Display the frame again to check the columns before selecting features
df

In [None]:
# Feature matrix for clustering: annual income and spending score, the two
# quantities the final cluster scatter plot puts on its axes.
# They are selected by column name rather than by position: the previous
# positional slice `iloc[:, 2:4]` stops before column 4, so it could not
# include both columns (presumably it picked Age and Annual Income - verify
# against the CSV column order).
X = df[['Annual Income (k$)', 'Spending Score (1-100)']].values
# Values of the column at position 4, kept under the name `y` as before
# (presumably the spending score - verify against the CSV column order).
y = df.iloc[:, 4].values

In [None]:
from sklearn.cluster import KMeans

# Collects the within-cluster sum of squares (inertia) for each cluster count tried
wcss = list()

In [None]:
# Fit K-Means for 1..10 clusters and record each model's inertia (WCSS).
# The list is reset here so that re-running this cell does not append a
# second batch of values to the ones from a previous run.
wcss = []
for i in range(1,11):
    Kmeans = KMeans(n_clusters=i,init='k-means++',random_state=0)
    Kmeans.fit(X)
    wcss.append(Kmeans.inertia_)

In [None]:
# Elbow plot: WCSS (inertia) against the number of clusters tried.
# The x values are derived from the length of `wcss` so the two sequences
# always line up (1..10 for the loop above).
plt.plot(range(1, len(wcss) + 1), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')

Model Build

In [None]:
# Final model: five clusters, k-means++ seeding, fixed seed for repeatable results
kmeans_model = KMeans(
    n_clusters=5,
    init='k-means++',
    random_state=0,
)
# Fit on X and get the cluster index assigned to every row
y_kmeans = kmeans_model.fit_predict(X)

In [None]:
# Scatter the customers of each cluster in their own colour.
cluster_colors = ['red', 'blue', 'green', 'purple', 'cyan']
for cluster_id, color in enumerate(cluster_colors):
    members = y_kmeans == cluster_id  # boolean mask of rows assigned to this cluster
    plt.scatter(X[members, 0], X[members, 1], s=80, c=color,
                label=f'customer {cluster_id + 1}')

# Centroids must come from the fitted 5-cluster model (`kmeans_model`), not
# from `Kmeans`, which is the last model left over from the elbow loop
# (fitted with 10 clusters).
centroids = kmeans_model.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=100, c='black', label='centroids')

plt.title('CLUSTERS OF CUSTOMERS')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
# Show the labels assigned above; without a legend they are never displayed.
plt.legend()