# Customer Segmentation

## Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datetime, nltk, warnings
import matplotlib.cm as cm
import itertools
from ydata_profiling import ProfileReport
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import preprocessing, model_selection, metrics, feature_selection
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn import neighbors, linear_model, svm, tree, ensemble
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier

from wordcloud import WordCloud, STOPWORDS
from IPython.display import display, HTML

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)

warnings.filterwarnings("ignore")

plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
%matplotlib inline

## Data Preparation

In [38]:
df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID             object
Country                object
dtype: object

I load the data. Once done, I also give some basic informations on the content of the dataframe: the type of the various variables, the number of null values and their percentage with respect to the total number of entries:

In [123]:
print('Dataframe dimensions:', df.shape)
df = pd.read_csv('data.csv', encoding='ISO-8859-1', dtype={'CustomerID': str,'InvoiceID': str})

#---------------
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
#---------------

#---------------
#For NULL values
null_info = pd.DataFrame(df.dtypes).T.rename(index={0:'column type'})
null_info = pd.concat([null_info, pd.DataFrame(df.isnull().sum()).T.rename(index={0:'null values (nb)'})])
null_info = pd.concat([null_info, pd.DataFrame(df.isnull().sum()/df.shape[0]*100).T.rename(index={0:'null values (%)'})])
display(null_info)
#---------------

display(df.head())

Dataframe dimensions: (3, 3)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
column type,object,object,object,int64,datetime64[ns],float64,object,object
null values (nb),0,0,1454,0,0,0,135080,0
null values (%),0.0,0.0,0.268311,0.0,0.0,0.0,24.926694,0.0


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


While looking at the number of null values in the dataframe, it is interesting to note that  ∼
 25% of the entries are not assigned to a particular customer. With the data available, it is impossible to impute values for the user and these entries are thus useless for the current exercise. So I delete them from the dataframe:

In [44]:
df.dropna(axis = 0, subset = ['CustomerID'], inplace = True)
print('Dimensions', df.shape)

#---------------
null_info = pd.DataFrame(df.dtypes).T.rename(index={0:'column type'})
null_info = pd.concat([null_info, pd.DataFrame(df.isnull().sum()).T.rename(index={0:'null values (nb)'})])
null_info = pd.concat([null_info, pd.DataFrame(df.isnull().sum()/df.shape[0]*100).T.rename(index={0:'null values (%)'})])
display(null_info)
#---------------

display(df.head())

Dimensions (406829, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
column type,object,object,object,int64,datetime64[ns],float64,object,object
null values (nb),0,0,0,0,0,0,0,0
null values (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


OK, therefore, by removing these entries we end up with a dataframe filled at 100% for all variables! Finally, I check for duplicate entries and delete them:

In [48]:
print('Duplicate entries: {}'.format(df.duplicated().sum()))
df.drop_duplicates(inplace = True)

Duplicate entries: 5225


## Exploring the contect of Variables

This dataframe contains 8 variables that correspond to:

**InvoiceNo**: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.<br>
**StockCode**: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.<br>
**Description**: Product (item) name. Nominal.<br>
**Quantity**: The quantities of each product (item) per transaction. Numeric.<br>
**InvoiceDate**: Invice Date and time. Numeric, the day and time when each transaction was generated.<br>
**UnitPrice**: Unit price. Numeric, Product price per unit in sterling.<br>
**CustomerID**: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.<br>
**Country**: Country name. Nominal, the name of the country where each customer resides.<br>

### Countries
<br>Here, I quickly look at the countries from which orders were made:

In [83]:
t1 = df[['CustomerID', 'InvoiceNo', 'Country']].groupby(['CustomerID','InvoiceNo','Country']).count()
t1 = t1.reset_index(drop = False)
countries = t1['Country'].value_counts()
print('No. of countries in the dataframe: {}'.format(len(countries)))

#---------------
#t2 = df.Country.unique()
#len(t2)
#---------------

No. of countries in the dataframe: 37


and show the result on a choropleth map:

In [100]:
data = dict(
            type = 'choropleth',
            locations = countries.index,
            locationmode = 'country names',
            text = countries.index,
            colorbar = {'title' : 'Order No. '},
            z = countries,
            colorscale=[[0, 'rgb(224,255,255)'], [0.01, 'rgb(166,206,227)'], [0.02, 'rgb(31,120,180)'],
                        [0.03, 'rgb(178,223,138)'], [0.05, 'rgb(51,160,44)'], [0.10, 'rgb(251,154,153)'], 
                        [0.20, 'rgb(255,255,0)'], [1, 'rgb(227,26,28)']],
            reversescale = False
        )

layout=dict(
            title = 'Numberof orders per country',
            geo = dict(showframe = True, projection = {'type':'mercator'})    
        )

choromap = go.Figure(data = [data], layout = layout)
iplot(choromap, validate = False)

We see that the dataset is largely dominated by orders made from the UK.

### Customers and Products


The dataframe contains  ∼400,000 entries. What are the number of users and products in these entries ?

In [127]:
# Count values in each column, column name and count
df.StockCode.value_counts()

StockCode
85123A    2313
22423     2203
85099B    2159
47566     1727
20725     1639
          ... 
21431        1
22275        1
17001        1
90187A       1
72759        1
Name: count, Length: 4070, dtype: int64