In [25]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [27]:
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram

In [28]:
file = r'C:/Users/suresh/Downloads/Online Retail.xlsx'
df = pd.read_excel(file)

In [29]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Sales'] = df['Quantity']*df['UnitPrice']
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Sales
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [30]:
print('Total number of customers is:', df['CustomerID'].nunique())
print('Total number of transactions that occured is:', df['InvoiceNo'].nunique())
print('Total number of products sold is:', df['StockCode'].nunique())

Total number of customers is: 4372
Total number of transactions that occured is: 25900
Total number of products sold is: 4070


In [31]:
print('Orders came from ',df['Country'].nunique(),'different countries.')

Orders came from  38 different countries.


In [32]:
customers_by_countries = df.copy()
customers_by_countries.drop_duplicates(subset=['CustomerID'],inplace = True)
countries = customers_by_countries['Country'].value_counts()
countries

United Kingdom          3951
Germany                   95
France                    87
Spain                     29
Belgium                   24
Switzerland               20
Portugal                  19
Italy                     15
Finland                   12
Norway                    10
Channel Islands            9
Austria                    9
Netherlands                9
Australia                  9
Sweden                     8
Denmark                    8
Japan                      8
Cyprus                     7
Poland                     6
Unspecified                4
Greece                     4
USA                        4
Canada                     4
Israel                     4
EIRE                       3
United Arab Emirates       2
Malta                      2
Bahrain                    2
Lithuania                  1
European Community         1
Saudi Arabia               1
Iceland                    1
Brazil                     1
Czech Republic             1
Lebanon       

In [None]:
df['monthly sales'] = pd.to_datetime(df['InvoiceDate']).dt.strftime('%Y-%m')
monthly_sales = df.groupby(['monthly sales'],as_index = False).agg({'Sales':'sum'})
monthly_sales

In [None]:
sns.lineplot(monthly_sales['monthly sales'],monthly_sales['Sales'])
sns.scatterplot(monthly_sales['monthly sales'],monthly_sales['Sales'])
plt.xticks(rotation = 45);

In [None]:
customers = df.groupby(['CustomerID'],as_index = False).agg({'Sales':'sum','InvoiceNo':'count','Country':'first'})
customers

In [None]:
encoder = LabelEncoder()
customers['Country'] = encoder.fit_transform(customers['Country'])

In [None]:
X = customers[['Sales','InvoiceNo','Country']]
inertias = []
for i in range(1,11):
    kmeans = KMeans(n_clusters = i,random_state=0)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)

print(pd.DataFrame(inertias,columns=['inertia_values']))

In [None]:
sns.set_style('darkgrid')
sns.lineplot(range(1,11),inertias)
plt.xlabel('Number of clusters')
plt.ylabel('inertia values')
plt.title('Number of clusters vs inertia values')

In [None]:
Text(0.5, 1.0, 'Number of clusters vs inertia values')


In [None]:
kmeans = KMeans(n_clusters=3,random_state=0)
kmeans.fit(X)

In [None]:
KMeans(n_clusters=3, random_state=0)

In [None]:
Z = linkage(X, method='ward',metric='euclidean')
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(Z,truncate_mode='lastp',p=12,show_leaf_counts=False,  
            leaf_rotation=90.,leaf_font_size=12.,show_contracted=True,)
plt.show()