# Market Basket Analysis

In [1]:
!pip install apyori



<h3>Importing Libraries</h3>

In [2]:
import pandas as pd
import numpy as np
from apyori import apriori

 ## Data Loading 
 Reading the data

In [5]:
data=pd.read_excel("Assignment-1_Data.xlsx")

## EDA
Exploring the data

In [6]:
data.head()

Unnamed: 0,BillNo,Itemname,Quantity,Date,Price,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [7]:
data.shape

(522064, 7)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522064 entries, 0 to 522063
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   BillNo      522064 non-null  object        
 1   Itemname    520609 non-null  object        
 2   Quantity    522064 non-null  int64         
 3   Date        522064 non-null  datetime64[ns]
 4   Price       522064 non-null  float64       
 5   CustomerID  388023 non-null  float64       
 6   Country     522064 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 27.9+ MB


In [9]:
data.describe()

Unnamed: 0,Quantity,Date,Price,CustomerID
count,522064.0,522064,522064.0,388023.0
mean,10.090435,2011-07-04 12:51:20.777107456,3.826801,15316.93171
min,-9600.0,2010-12-01 08:26:00,-11062.06,12346.0
25%,1.0,2011-03-28 10:15:00,1.25,13950.0
50%,3.0,2011-07-20 08:59:00,2.08,15265.0
75%,10.0,2011-10-19 14:12:00,4.13,16837.0
max,80995.0,2011-12-09 12:50:00,13541.33,18287.0
std,161.110525,,41.900599,1721.846964


## Data Preprocessing

<b><p style='font-size:19px'>Handling null values</p></b>

In [8]:
data.isnull().sum()

BillNo             0
Itemname        1455
Quantity           0
Date               0
Price              0
CustomerID    134041
Country            0
dtype: int64

<b style='font-size:18px'>We remove the null values from the data with respect to the Itemname.</b>

In [17]:
data.dropna(subset=['Itemname'],inplace=True)

In [18]:
data.isnull().sum()

BillNo             0
Itemname           0
Quantity           0
Date               0
Price              0
CustomerID    132512
Country            0
dtype: int64

<b><p style='font-size:19px'>Handling Duplicates</p></b>

In [19]:
data.duplicated().sum()

0

In [20]:
data.drop_duplicates(inplace=True)

In [21]:
data.duplicated().sum()

0

In [22]:
data.head()

Unnamed: 0,BillNo,Itemname,Quantity,Date,Price,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


<b style='font-size:18px'>Let's extract Itemnames for each BillNo. and store it in a list.</b>

In [23]:
items=data.set_index(['BillNo']).stack()

In [24]:
items

BillNo            
536365  Itemname      WHITE HANGING HEART T-LIGHT HOLDER
        Quantity                                       6
        Date                         2010-12-01 08:26:00
        Price                                       2.55
        CustomerID                               17850.0
                                     ...                
581587  Quantity                                       3
        Date                         2011-12-09 12:50:00
        Price                                       4.95
        CustomerID                               12680.0
        Country                                   France
Length: 2959426, dtype: object

In [25]:
item=pd.DataFrame(items)

In [26]:
item.rename(columns={0:'list'},inplace=True)

In [27]:
item.index.levels[0]

Index([   536365,    536366,    536367,    536368,    536369,    536370,
          536371,    536372,    536373,    536374,
       ...
          581581,    581582,    581583,    581584,    581585,    581586,
          581587, 'A563185', 'A563186', 'A563187'],
      dtype='object', name='BillNo', length=20208)

In [28]:
products=[]

for i in (item.index.levels[0]):
    product=item.loc[(i,['Itemname']),'list'].to_list()
    
    products.append(product)

In [29]:
products[0:5]

[['WHITE HANGING HEART T-LIGHT HOLDER',
  'WHITE METAL LANTERN',
  'CREAM CUPID HEARTS COAT HANGER',
  'KNITTED UNION FLAG HOT WATER BOTTLE',
  'RED WOOLLY HOTTIE WHITE HEART.',
  'SET 7 BABUSHKA NESTING BOXES',
  'GLASS STAR FROSTED T-LIGHT HOLDER'],
 ['HAND WARMER UNION JACK', 'HAND WARMER RED POLKA DOT'],
 ['ASSORTED COLOUR BIRD ORNAMENT',
  "POPPY'S PLAYHOUSE BEDROOM",
  "POPPY'S PLAYHOUSE KITCHEN",
  'FELTCRAFT PRINCESS CHARLOTTE DOLL',
  'IVORY KNITTED MUG COSY',
  'BOX OF 6 ASSORTED COLOUR TEASPOONS',
  'BOX OF VINTAGE JIGSAW BLOCKS',
  'BOX OF VINTAGE ALPHABET BLOCKS',
  'HOME BUILDING BLOCK WORD',
  'LOVE BUILDING BLOCK WORD',
  'RECIPE BOX WITH METAL HEART',
  'DOORMAT NEW ENGLAND'],
 ['JAM MAKING SET WITH JARS',
  'RED COAT RACK PARIS FASHION',
  'YELLOW COAT RACK PARIS FASHION',
  'BLUE COAT RACK PARIS FASHION'],
 ['BATH BUILDING BLOCK WORD']]

## Model selection  And Training Model
Buliding rules to the Apriori model

In [31]:
rules=apriori(transactions=products,min_support=0.003,min_confidence=0.8,min_lift=3,min_length=2,max_length=3)

In [32]:
results=list(rules)

## Model Evaluation or Validation

In [33]:
def inspect(results):
    lhs=[tuple(result[2][0][0])[0] for result in results]
    rhs=[tuple(result[2][0][1])[0] for result in results]
    supports=[result[1] for result in results]
    confidences=[result[2][0][2] for result in results]
    lifts=[result[2][0][3] for result in results]

    return list(zip(lhs,rhs,supports,confidences,lifts))

In [25]:
result=pd.DataFrame(inspect(results),columns=['Left Hand Side','Right Hand Side','Support','Confidence','Lift'])

In [26]:
result.sort_values(by='Confidence',ascending=False)

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
1119,CHARLIE+LOLA PINK HOT WATER BOTTLE,DOTCOM POSTAGE,0.004107,1.0,28.542373
2560,CANDY SPOT CUSHION COVER,DOTCOM POSTAGE,0.003068,1.0,28.542373
1599,RED RETROSPOT CHARLOTTE BAG,DOTCOM POSTAGE,0.005641,1.0,28.542373
1598,RECYCLING BAG RETROSPOT,DOTCOM POSTAGE,0.005592,1.0,28.542373
1596,PINK VINTAGE PAISLEY PICNIC BAG,DOTCOM POSTAGE,0.003365,1.0,28.542373
...,...,...,...,...,...
3652,CHARLIE+LOLA PINK HOT WATER BOTTLE,SUKI SHOULDER BAG,0.003167,0.8,30.676281
9805,WHITE HANGING HEART T-LIGHT HOLDER,JAM MAKING SET PRINTED,0.003167,0.8,14.486022
9815,ROLL WRAP VINTAGE CHRISTMAS,RECIPE BOX PANTRY YELLOW DESIGN,0.003167,0.8,14.899908
5912,GREEN REGENCY TEACUP AND SAUCER,DOTCOM POSTAGE,0.004751,0.8,22.833898


<b style='font-size:18px'>The above data frame says that if a person bought a product in Left Hand Side, with some confidence he bought product from Right Hand Side.</b>