#### Group

### Imports

In [None]:
!pip install mlxtend
#import sys
#!{sys.executable} -m pip install mlxtend

In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import (apriori,
                                       association_rules)
from collections import Counter

In [2]:
from mlxtend.frequent_patterns import fpgrowth

### Data

In [3]:
itemsets = pd.read_pickle("./itemsets.pickle")

In [4]:
itemsets[0:2]

[['Face',
  'Grand Theft Auto',
  'Halo',
  'Head',
  'Human',
  'Indoors',
  'Man',
  'Person'],
 ['Accessories',
  'Accessory',
  'Apparel',
  'Banister',
  'Clothing',
  'Coat',
  'Handrail',
  'Human',
  'Indoors',
  'Jacket',
  'Overcoat',
  'Person',
  'Prison',
  'Railing',
  'Staircase',
  'Suit',
  'Tie']]

In [7]:
len(itemsets)

2851272

In [9]:
itemsets[0][0]

'Face'

#### Inspect items

In [5]:
list_itemsets = [inner for outer in itemsets for inner in outer]

In [11]:
list_itemsets[0:10]

['Face',
 'Grand Theft Auto',
 'Halo',
 'Head',
 'Human',
 'Indoors',
 'Man',
 'Person',
 'Accessories',
 'Accessory']

In [6]:
count_items = Counter(list_itemsets)

In [7]:
count_items.most_common()[0:10]

[('Human', 2635223),
 ('Person', 2635223),
 ('Apparel', 1789043),
 ('Clothing', 1789043),
 ('Face', 1128286),
 ('Indoors', 951423),
 ('Coat', 728494),
 ('Room', 713099),
 ('Furniture', 657729),
 ('Man', 610639)]

In [20]:
count_items.most_common()[0][0]

'Human'

#### Test counter

In [10]:
z = ['green','blue', 'red', 'blue', 'yellow', 'blue', 'red']
count_z = Counter(z)

In [11]:
count_z.most_common()

[('blue', 3), ('red', 2), ('green', 1), ('yellow', 1)]

### Part 2 Association Rules

In [6]:
def get_df_items(itemsets):
    transaction_encoder = TransactionEncoder()
    transaction_encoded_ary = transaction_encoder.fit(itemsets).transform(itemsets)
    #Dataframe
    df = pd.DataFrame(transaction_encoded_ary, columns= transaction_encoder.columns_)
    return df 

#### Data prep

In [18]:
from mlxtend.preprocessing import TransactionEncoder

In [26]:
te = TransactionEncoder()
te_ary = te.fit(itemsets).transform(itemsets)
te_ary

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [27]:
#Dataframe
df = pd.DataFrame(te_ary, columns=te.columns_)

In [32]:
df['Female'].value_counts()

False    2258011
True      593261
Name: Female, dtype: int64

In [31]:
len(df['Female'] == False) 

2851272

#### Frequent Items

In [5]:
def get_freq_items(minimum_support, df):
    
    frequent_itemsets = apriori(df, 
        min_support = minimum_support,   
        use_colnames=True
        )
    return frequent_itemsets

In [59]:
#Initialise arbitrary minimum support
minimum_support = 0.1 #Arbitrary - try others e.g - 0.1, 0.2, 0.3

In [60]:
#Apriori Algorithm
frequent_itemsets = apriori(df, 
        min_support = minimum_support,   
        use_colnames=True
        )

In [37]:
#Display
frequent_itemsets = frequent_itemsets[['itemsets', 'support']] #Change column order
frequent_itemsets

Unnamed: 0,itemsets,support
0,(Apparel),0.627454
1,(Clothing),0.627454
2,(Coat),0.255498
3,(Face),0.395713
4,(Female),0.208069
...,...,...
78,"(Human, Clothing, Person, Indoors)",0.229986
79,"(Room, Human, Person, Indoors)",0.244616
80,"(Human, Clothing, Person, Apparel, Coat)",0.254630
81,"(Human, Clothing, Person, Apparel, Face)",0.277243


#### Rules

In [55]:
#Initialise minimum confidence
min_confidence = 0.05 #Arbitrary - try others e.g - 0.2, 0.5 

In [56]:
#Rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold = min_confidence)

In [40]:
#Display
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', len(rules.columns))
rules

  


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Clothing),(Apparel),0.627454,0.627454,0.627454,1.000000,1.593741,0.233755,inf
1,(Apparel),(Clothing),0.627454,0.627454,0.627454,1.000000,1.593741,0.233755,inf
2,(Apparel),(Coat),0.627454,0.255498,0.255498,0.407198,1.593741,0.095185,1.255903
3,(Coat),(Apparel),0.255498,0.627454,0.255498,1.000000,1.593741,0.095185,inf
4,(Apparel),(Face),0.627454,0.395713,0.278967,0.444601,1.123544,0.030675,1.088024
...,...,...,...,...,...,...,...,...,...
503,(Human),"(Apparel, Clothing, Person, Indoors)",0.924227,0.229986,0.229986,0.248841,1.081985,0.017427,1.025102
504,(Clothing),"(Human, Apparel, Person, Indoors)",0.627454,0.229986,0.229986,0.366538,1.593741,0.085680,1.215565
505,(Person),"(Human, Clothing, Apparel, Indoors)",0.924227,0.229986,0.229986,0.248841,1.081985,0.017427,1.025102
506,(Apparel),"(Human, Clothing, Person, Indoors)",0.627454,0.229986,0.229986,0.366538,1.593741,0.085680,1.215565


#### Filter Female Rules

In [57]:
rules_female = rules[rules['antecedents'] == {'Female'}] #Woman, girl

In [58]:
rules_female

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
30,(Female),(Human),0.208069,0.924227,0.206738,0.993603,1.075064,0.014435,11.845343
32,(Female),(Person),0.208069,0.924227,0.206738,0.993603,1.075064,0.014435,11.845343
183,(Female),"(Human, Person)",0.208069,0.924227,0.206738,0.993603,1.075064,0.014435,11.845343


### Part 2: Filter for female

In [3]:
filterX = 'Female'
itemsets_female = [x for x in itemsets if filterX in x]

In [7]:
#Create ohe dataframe
df_female = get_df_items(itemsets_female)

In [49]:
df_female.head()

Unnamed: 0,Abies,Absinthe,Abyssinian,Acanthaceae,Accessories,Accessory,Accipiter,Accordion,Acrobatic,Adapter,...,Wristwatch,Xylophone,Yacht,Yard,Yarn,Yew,Yoga,Zebra,Zebra Crossing,Zoo
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### 1. Fpgrowth

In [48]:
df_female_fp = fpgrowth(df_female, min_support= 0.01, max_len = 1, use_colnames=True)

In [44]:
df_female_fp.head(60)

Unnamed: 0,support,itemsets
0,1.0,(Female)
1,0.993603,(Human)
2,0.993603,(Person)
3,0.656013,(Face)
4,0.530471,(Woman)
5,0.356743,(Hair)
6,0.337775,(Indoors)
7,0.27405,(Photography)
8,0.27405,(Photo)
9,0.219999,(Head)


### 2. Frequent Items

In [8]:
min_support = 0.2
df_freq_female = get_freq_items(min_support, df_female)

In [9]:
df_freq_female.head(30)

Unnamed: 0,support,itemsets
0,0.84435,(Apparel)
1,0.208746,(Child)
2,0.84435,(Clothing)
3,0.251771,(Coat)
4,0.656013,(Face)
5,0.205318,(Fashion)
6,1.0,(Female)
7,0.237302,(Furniture)
8,0.359331,(Girl)
9,0.356743,(Hair)


In [23]:
df_freq_female.sort_values(by = ['support'], ascending = False) #.head()

Unnamed: 0,support,itemsets
6,1.000000,(Female)
11,0.993603,(Human)
279,0.993603,"(Human, Female, Person)"
77,0.993603,"(Human, Female)"
81,0.993603,"(Female, Person)"
...,...,...
424,0.200431,"(Photo, Person, Photography, Apparel)"
358,0.200431,"(Human, Photo, Clothing, Apparel)"
398,0.200431,"(Human, Photo, Female, Apparel)"
722,0.200431,"(Human, Photo, Clothing, Person, Apparel, Female)"


In [20]:
df_freq_female.to_csv('df_freq_female.csv')

In [37]:
#Display
frequent_itemsets = frequent_itemsets[['itemsets', 'support']] #Change column order
frequent_itemsets

Unnamed: 0,itemsets,support
0,(Apparel),0.627454
1,(Clothing),0.627454
2,(Coat),0.255498
3,(Face),0.395713
4,(Female),0.208069
...,...,...
78,"(Human, Clothing, Person, Indoors)",0.229986
79,"(Room, Human, Person, Indoors)",0.244616
80,"(Human, Clothing, Person, Apparel, Coat)",0.254630
81,"(Human, Clothing, Person, Apparel, Face)",0.277243


#### Save and read in

In [24]:
df2 = pd.read_csv('df_freq_female.csv')

In [26]:
df2.head(30)

Unnamed: 0.1,Unnamed: 0,support,itemsets
0,0,0.84435,frozenset({'Apparel'})
1,1,0.208746,frozenset({'Child'})
2,2,0.84435,frozenset({'Clothing'})
3,3,0.251771,frozenset({'Coat'})
4,4,0.656013,frozenset({'Face'})
5,5,0.205318,frozenset({'Fashion'})
6,6,1.0,frozenset({'Female'})
7,7,0.237302,frozenset({'Furniture'})
8,8,0.359331,frozenset({'Girl'})
9,9,0.356743,frozenset({'Hair'})


### Association Rules

#### Data prep

In [18]:
from mlxtend.preprocessing import TransactionEncoder

In [14]:
te = TransactionEncoder()
te_ary = te.fit(itemsets_female).transform(itemsets_female)

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [16]:
#Dataframe
df_female = pd.DataFrame(te_ary, columns=te.columns_)

### Part 3: Filter for Male

In [15]:
filterXX = 'Man'
itemsets_male = [x for x in itemsets if filterXX in x]

In [16]:
#Create ohe dataframe
df_male = get_df_items(itemsets_male)

In [17]:
df_male

Unnamed: 0,Abies,Absinthe,Acanthaceae,Accessories,Accessory,Accipiter,Accordion,Acrobatic,Adapter,Adventure,...,Wristwatch,X-Ray,Xylophone,Yacht,Yard,Yew,Yoga,Zebra Crossing,Zipper,Zoo
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610634,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
610635,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
610636,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
610637,False,False,False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


#### Frequent Items

In [18]:
min_support = 0.2
df_freq_male = get_freq_items(min_support, df_male)

In [19]:
df_freq_male.head(30)

Unnamed: 0,support,itemsets
0,0.254641,(Accessories)
1,0.254641,(Accessory)
2,0.748595,(Apparel)
3,0.748595,(Clothing)
4,0.403901,(Coat)
5,0.764147,(Face)
6,0.290034,(Head)
7,0.998369,(Human)
8,0.379239,(Indoors)
9,1.0,(Man)


In [22]:
df_freq_male.tail(50)

Unnamed: 0,support,itemsets
797,0.221732,"(Face, Human, Person, Apparel, Overcoat, Man)"
798,0.233441,"(Human, Indoors, Person, Apparel, Man, Room)"
799,0.239457,"(Human, Person, Apparel, Overcoat, Suit, Man)"
800,0.205681,"(Human, Photo, Person, Photography, Apparel, Man)"
801,0.221732,"(Face, Human, Clothing, Overcoat, Man, Coat)"
802,0.287548,"(Face, Human, Clothing, Person, Man, Coat)"
803,0.221732,"(Face, Clothing, Human, Person, Overcoat, Coat)"
804,0.221732,"(Face, Clothing, Person, Overcoat, Man, Coat)"
805,0.310224,"(Human, Clothing, Person, Overcoat, Man, Coat)"
806,0.239457,"(Human, Clothing, Overcoat, Suit, Man, Coat)"


#### Min Support = 10%

In [23]:
min_support = 0.1
df_freq_male = get_freq_items(min_support, df_male)

In [19]:
df_freq_male.head(30)

Unnamed: 0,support,itemsets
0,0.254641,(Accessories)
1,0.254641,(Accessory)
2,0.748595,(Apparel)
3,0.748595,(Clothing)
4,0.403901,(Coat)
5,0.764147,(Face)
6,0.290034,(Head)
7,0.998369,(Human)
8,0.379239,(Indoors)
9,1.0,(Man)


In [26]:
#Support = 10%
df_freq_male.head(30)

Unnamed: 0,support,itemsets
0,0.254641,(Accessories)
1,0.254641,(Accessory)
2,0.748595,(Apparel)
3,0.140513,(Bar Counter)
4,0.748595,(Clothing)
5,0.403901,(Coat)
6,0.128781,(Court)
7,0.18198,(Crowd)
8,0.764147,(Face)
9,0.114521,(Female)


### FPGrowth

In [30]:
df_male_freqfp = fpgrowth(df_male, min_support= 0.05, use_colnames=True)

In [33]:
df_male_freqfp.head(60)

Unnamed: 0,support,itemsets
0,1.0,(Man)
1,0.998369,(Person)
2,0.998369,(Human)
3,0.764147,(Face)
4,0.379239,(Indoors)
5,0.290034,(Head)
6,0.328649,(Photo)
7,0.328649,(Photography)
8,0.236626,(Portrait)
9,0.149003,(Pub)


In [36]:
df_male_freqfp2 = fpgrowth(df_male, min_support= 0.01, use_colnames=True)

In [41]:
df_male_freqfp2.head(60)

Unnamed: 0,support,itemsets
0,1.0,(Man)
1,0.998369,(Person)
2,0.998369,(Human)
3,0.764147,(Face)
4,0.379239,(Indoors)
5,0.290034,(Head)
6,0.043599,(Grand Theft Auto)
7,0.328649,(Photography)
8,0.328649,(Photo)
9,0.236626,(Portrait)
