# Loading the Libraries

In [1]:
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules 

# Data Processing And Cleaning

In [2]:
#Loading the dataset
df=pd.read_csv('transaction_data.csv')

In [3]:
#Filling NA values
df['ItemDescription'].fillna('Not Mentioned',inplace=True)
df=df[~(df['UserId']==-1)]

In [4]:
df.head()

Unnamed: 0,UserId,TransactionId,TransactionTime,ItemCode,ItemDescription,NumberOfItemsPurchased,CostPerItem,Country
0,278166,6355745,Sat Feb 02 12:50:00 IST 2019,465549,FAMILY ALBUM WHITE PICTURE FRAME,6,11.73,United Kingdom
1,337701,6283376,Wed Dec 26 09:06:00 IST 2018,482370,LONDON BUS COFFEE MUG,3,3.52,United Kingdom
2,267099,6385599,Fri Feb 15 09:45:00 IST 2019,490728,SET 12 COLOUR PENCILS DOLLY GIRL,72,0.9,France
3,380478,6044973,Fri Jun 22 07:14:00 IST 2018,459186,UNION JACK FLAG LUGGAGE TAG,3,1.73,United Kingdom
5,285957,6307136,Fri Jan 11 09:50:00 IST 2019,1787247,CUT GLASS T-LIGHT HOLDER OCTAGON,12,3.52,United Kingdom


In [5]:
# Transactions done in France 
basket_France = (df[df['Country'] =="France"] 
          .groupby(['TransactionId', 'ItemDescription'])['NumberOfItemsPurchased'] 
          .sum().unstack().reset_index().fillna(0) 
          .set_index('TransactionId')) 
  
# Transactions done in the United Kingdom 
basket_UK = (df[df['Country'] =="United Kingdom"] 
          .groupby(['TransactionId', 'ItemDescription'])['NumberOfItemsPurchased'] 
          .sum().unstack().reset_index().fillna(0) 
          .set_index('TransactionId')) 
  
# Transactions done in Germany 
basket_ger = (df[df['Country'] =="Germany"] 
          .groupby(['TransactionId', 'ItemDescription'])['NumberOfItemsPurchased'] 
          .sum().unstack().reset_index().fillna(0) 
          .set_index('TransactionId')) 

# Transactions done in Other Countries
basket_other = (df[~(df['Country']=='United Kingdom')&~(df['Country']=='France')&~(df['Country']=='Germany')] 
          .groupby(['TransactionId', 'ItemDescription'])['NumberOfItemsPurchased'] 
          .sum().unstack().reset_index().fillna(0) 
          .set_index('TransactionId')) 

In [6]:
def hot_encode(x): 
    if(x<= 0): 
        return 0
    if(x>= 1): 
        return 1

In [7]:
# Encoding the datasets 
basket_encoded = basket_France.applymap(hot_encode) 
basket_France = basket_encoded 
  
basket_encoded = basket_UK.applymap(hot_encode) 
basket_UK = basket_encoded 
  
basket_encoded = basket_ger.applymap(hot_encode) 
basket_ger = basket_encoded 

basket_encoded = basket_other.applymap(hot_encode) 
basket_other = basket_encoded 

In [8]:
#France
# Building the model 
frq_items = apriori(basket_France, min_support = 0.01, use_colnames = True) 
  
# Collecting the inferred rules in a dataframe 
rules1 = association_rules(frq_items, metric ="lift", min_threshold = 1) 
rules1 = rules1.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules1.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
5372,(SMALL MARSHMALLOWS PINK BOWL),(SMALL DOLLY MIX DESIGN ORANGE BOWL),0.010917,0.010917,0.010917,1.0,91.6,0.010798,inf
5373,(SMALL DOLLY MIX DESIGN ORANGE BOWL),(SMALL MARSHMALLOWS PINK BOWL),0.010917,0.010917,0.010917,1.0,91.6,0.010798,inf
86707,"(ALARM CLOCK BAKELIKE PINK, PLASTERS IN TIN CI...","(PLASTERS IN TIN SPACEBOY, ALARM CLOCK BAKELIK...",0.010917,0.010917,0.010917,1.0,91.6,0.010798,inf
86726,"(PLASTERS IN TIN SPACEBOY, ALARM CLOCK BAKELIK...","(ALARM CLOCK BAKELIKE PINK, PLASTERS IN TIN CI...",0.010917,0.010917,0.010917,1.0,91.6,0.010798,inf
89624,"(ALARM CLOCK BAKELIKE RED , CHILDRENS CUTLERY ...","(ALARM CLOCK BAKELIKE PINK, CARD DOLLY GIRL )",0.010917,0.010917,0.010917,1.0,91.6,0.010798,inf


In [9]:
#UK
# Building the model 
frq_items = apriori(basket_UK, min_support = 0.01, use_colnames = True) 
rules2 = association_rules(frq_items, metric ="lift", min_threshold = 1) 
rules2 = rules2.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules2.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
540,"(REGENCY CAKESTAND 3 TIER, PINK REGENCY TEACUP...",(GREEN REGENCY TEACUP AND SAUCER),0.011331,0.03082,0.010173,0.897778,29.129368,0.009824,9.481105
384,"(PINK REGENCY TEACUP AND SAUCER, ROSES REGENCY...",(GREEN REGENCY TEACUP AND SAUCER),0.019288,0.03082,0.017173,0.890339,28.888023,0.016578,8.837995
538,"(REGENCY CAKESTAND 3 TIER, GREEN REGENCY TEACU...",(ROSES REGENCY TEACUP AND SAUCER ),0.011583,0.034144,0.010173,0.878261,25.722162,0.009777,7.933816
377,"(REGENCY CAKESTAND 3 TIER, PINK REGENCY TEACUP...",(GREEN REGENCY TEACUP AND SAUCER),0.013345,0.03082,0.011583,0.867925,28.160747,0.011172,7.338074
532,"(REGENCY CAKESTAND 3 TIER, PINK REGENCY TEACUP...",(ROSES REGENCY TEACUP AND SAUCER ),0.013345,0.034144,0.011331,0.849057,24.866839,0.010875,6.398795


In [10]:
#Germany
# Building the model 
frq_items = apriori(basket_ger,min_support = 0.01, use_colnames = True) 
rules3 = association_rules(frq_items, metric ="lift", min_threshold = 1) 
rules3 = rules3.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules3.head() 

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7124,"(BLUE VINTAGE SPOT BEAKER, GREEN VINTAGE SPOT ...","(RED VINTAGE SPOT BEAKER, PINK VINTAGE SPOT BE...",0.011609,0.013267,0.011609,1.0,75.375,0.011455,inf
7574,"(ROUND SNACK BOXES SET OF4 WOODLAND , SCANDINA...","(CHOCOLATE BOX RIBBONS , ROUND SNACK BOXES SET...",0.011609,0.013267,0.011609,1.0,75.375,0.011455,inf
7577,"(ROUND SNACK BOXES SET OF 4 FRUITS , SCANDINAV...","(ROUND SNACK BOXES SET OF4 WOODLAND , CHOCOLAT...",0.011609,0.013267,0.011609,1.0,75.375,0.011455,inf
682,(STAR WREATH DECORATION WITH BELL),(HEART WREATH DECORATION WITH BELL),0.011609,0.014925,0.011609,1.0,67.0,0.011435,inf
7029,"(RED SPOT CERAMIC DRAWER KNOB, WHITE SPOT BLUE...","(BLUE SPOT CERAMIC DRAWER KNOB, WHITE SPOT RED...",0.013267,0.016584,0.013267,1.0,60.3,0.013047,inf


In [11]:
#France
# Other Countries
frq_items = apriori(basket_other, min_support = 0.01, use_colnames = True) 
rules4 = association_rules(frq_items, metric ="lift", min_threshold = 1) 
rules4 = rules4.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules4.head() 

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2829,"(PACK OF 6 SKULL PAPER PLATES, PACK OF 20 SKUL...",(PACK OF 6 SKULL PAPER CUPS),0.011792,0.014937,0.011792,1.0,66.947368,0.011616,inf
4072,"(SPACEBOY CHILDRENS CUP, DOLLY GIRL CHILDRENS ...","(DOLLY GIRL CHILDRENS CUP, SPACEBOY CHILDRENS ...",0.013365,0.015723,0.013365,1.0,63.6,0.013155,inf
1982,"(DOLLY GIRL CHILDRENS CUP, SPACEBOY CHILDRENS ...",(DOLLY GIRL CHILDRENS BOWL),0.015723,0.018082,0.015723,1.0,55.304348,0.015439,inf
4066,"(DOLLY GIRL CHILDRENS CUP, SPACEBOY CHILDRENS ...",(DOLLY GIRL CHILDRENS BOWL),0.013365,0.018082,0.013365,1.0,55.304348,0.013123,inf
13869,"(REGENCY CAKESTAND 3 TIER, REGENCY TEAPOT ROSE...","(REGENCY SUGAR BOWL GREEN, REGENCY TEA PLATE G...",0.012579,0.018082,0.012579,1.0,55.304348,0.012351,inf


In [12]:
#Combining all the rules
rules=pd.concat([rules1,rules2,rules3,rules4])
rules.reset_index(inplace=True)
rules.drop('index',axis=1,inplace=True)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(SMALL MARSHMALLOWS PINK BOWL),(SMALL DOLLY MIX DESIGN ORANGE BOWL),0.010917,0.010917,0.010917,1.0,91.6,0.010798,inf
1,(SMALL DOLLY MIX DESIGN ORANGE BOWL),(SMALL MARSHMALLOWS PINK BOWL),0.010917,0.010917,0.010917,1.0,91.6,0.010798,inf
2,"(ALARM CLOCK BAKELIKE PINK, PLASTERS IN TIN CI...","(PLASTERS IN TIN SPACEBOY, ALARM CLOCK BAKELIK...",0.010917,0.010917,0.010917,1.0,91.6,0.010798,inf
3,"(PLASTERS IN TIN SPACEBOY, ALARM CLOCK BAKELIK...","(ALARM CLOCK BAKELIKE PINK, PLASTERS IN TIN CI...",0.010917,0.010917,0.010917,1.0,91.6,0.010798,inf
4,"(ALARM CLOCK BAKELIKE RED , CHILDRENS CUTLERY ...","(ALARM CLOCK BAKELIKE PINK, CARD DOLLY GIRL )",0.010917,0.010917,0.010917,1.0,91.6,0.010798,inf


In [16]:
#Removing Duplicates Value
rules=rules[~(rules.duplicated(['antecedents']))]

In [17]:
#Keeping necessy columns
rules_final=rules[['antecedents','consequents']]
rules_final.head()

Unnamed: 0,antecedents,consequents
0,(SMALL MARSHMALLOWS PINK BOWL),(SMALL DOLLY MIX DESIGN ORANGE BOWL)
1,(SMALL DOLLY MIX DESIGN ORANGE BOWL),(SMALL MARSHMALLOWS PINK BOWL)
2,"(ALARM CLOCK BAKELIKE PINK, PLASTERS IN TIN CI...","(PLASTERS IN TIN SPACEBOY, ALARM CLOCK BAKELIK..."
3,"(PLASTERS IN TIN SPACEBOY, ALARM CLOCK BAKELIK...","(ALARM CLOCK BAKELIKE PINK, PLASTERS IN TIN CI..."
4,"(ALARM CLOCK BAKELIKE RED , CHILDRENS CUTLERY ...","(ALARM CLOCK BAKELIKE PINK, CARD DOLLY GIRL )"


In [18]:
rules_final.reset_index(inplace=True)
rules_final.drop('index',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [23]:
for i in range(len(rules_final)):
    x=list(rules_final['consequents'][i])
    y=list(rules_final['antecedents'][i])
    rules_final['antecedents'][i]=y
    rules_final['consequents'][i]=x

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [27]:
for i in range(len(rules_final)):

    x=rules_final['antecedents'][i]
    x=x[0]
    rules_final['antecedents'][i]=x
    y=rules_final['consequents'][i]
    y=y[:2]
    rules_final['consequents'][i]=y

In [29]:
#Cleaning the data to produce final results
rules_final.columns=['Item','Other Item Bought with it']
rules_final['Length']=rules_final['Item']
rules_final['Length']=rules_final['Other Item Bought with it'].apply(lambda x:len(x))
rules_final=rules_final[rules_final['Length']>1]
rules_final.drop('Length',axis=1,inplace=True)

from sklearn.utils import shuffle
rules_final=shuffle(rules_final)

rules_final.reset_index(inplace=True)
rules_final.drop('index',axis=1,inplace=True)
rules_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Item,Other Item Bought with it
0,SET/6 RED SPOTTY PAPER PLATES,"[ROUND SNACK BOXES SET OF4 WOODLAND , PLASTERS..."
1,ALARM CLOCK BAKELIKE GREEN,"[SET/6 RED SPOTTY PAPER PLATES, ALARM CLOCK BA..."
2,ROUND SNACK BOXES SET OF4 WOODLAND,"[ALARM CLOCK BAKELIKE RED , ALARM CLOCK BAKELI..."
3,SET/6 RED SPOTTY PAPER PLATES,"[SET/20 RED RETROSPOT PAPER NAPKINS , SET/6 RE..."
4,PLASTERS IN TIN CIRCUS PARADE,"[ALARM CLOCK BAKELIKE PINK, PLASTERS IN TIN SP..."


In [30]:
rules_final.to_csv('Assignment-2.csv')