In [68]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt

In [69]:
# Read the listings CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='./Dataset/House_Price_Dataset.csv')
data.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,Seller,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,PropertyCount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,03/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,03/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,04/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,04/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,04/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [70]:
# Number of distinct seller names. Series.nunique() is the pandas-native
# form and ignores missing values, whereas np.unique on an object column
# that mixes strings with NaN fails while sorting.
data["Seller"].nunique()

388

In [74]:
# Number of listings per seller, largest first, as a two-column frame
# (Seller, counts); preview the five busiest sellers.
seller_gr = (
    data.groupby(['Seller'])
    .size()
    .sort_values(ascending=False)
    .reset_index(name='counts')
)
seller_gr.head(n=5)

Unnamed: 0,Seller,counts
0,Jellis,3359
1,Nelson,3236
2,Barry,3235
3,hockingstuart,2623
4,Marshall,2027


In [30]:
# Keep only the three categorical columns that will form each "transaction"
# and drop every row missing any of them. Since the selection contains only
# these three columns, dropna() is equivalent to filtering each one for
# non-null values.
# The explicit .copy() makes `cut` an independent frame, so assigning to its
# columns later does not raise SettingWithCopyWarning or risk writing into a
# view of `data`.
cut = data[["Method", "Seller", "RegionName"]].dropna().copy()
cut.head()

Unnamed: 0,Method,Seller,RegionName
0,SS,Jellis,Northern Metropolitan
1,S,Biggin,Northern Metropolitan
2,S,Biggin,Northern Metropolitan
3,VB,Rounds,Northern Metropolitan
4,SP,Biggin,Northern Metropolitan


In [31]:
# Show the sorted set of distinct Method codes still present after filtering.
print(np.unique(cut["Method"].to_numpy()))

['PI' 'PN' 'S' 'SA' 'SN' 'SP' 'SS' 'VB' 'W']


In [32]:
# Collapse the Method codes into two buckets: codes beginning with 'S' become
# 'S', everything else becomes 'N'.
# assign() returns a new frame instead of writing into `cut` in place, which
# avoids SettingWithCopyWarning when `cut` was derived from a slice of `data`,
# and the vectorised .str.startswith replaces the per-row Python lambda.
# The mapping is idempotent ('S' -> 'S', 'N' -> 'N'), so re-running is safe.
# NOTE(review): the grouping is purely prefix-based; confirm against the
# dataset's data dictionary that codes such as 'PN' belong in the 'N' bucket.
cut = cut.assign(
    Method=cut["Method"].str.startswith("S").map({True: "S", False: "N"})
)

In [33]:
# 2-D array view of `cut`: one row per listing, one column per attribute.
# Each row is later treated as a transaction of three items.
np_cut = cut.to_numpy(dtype=None, copy=False)

In [34]:
# One-hot encode the transactions: every distinct value found in any of the
# three columns becomes a boolean column of `df`.
# NOTE(review): values from all three columns share one item namespace, so a
# seller whose name equals a Method flag ('S' or 'N') would be merged with
# that flag. Consider prefixing items with their column name if that matters.
te = TransactionEncoder()
te_ary = te.fit_transform(np_cut)
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df.head(n=5)

Unnamed: 0,@Realty,A,AIME,ASL,Abercromby's,Ace,Airport,Alex,Alexkarbon,Allan,...,hockingstuart/Marshall,hockingstuart/Sweeney,hockingstuart/Village,hockingstuart/hockingstuart,iHomes,iOne,iProperty,iSell,iTRAK,voglwalpole
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [35]:
# Mine frequent itemsets with FP-Growth: keep itemsets of at most three items
# whose support is at least 0.01, labelled by item name rather than by
# column position.
res = fpgrowth(
    df,
    min_support=0.01,
    max_len=3,
    use_colnames=True,
)

In [36]:
# Preview the first five frequent itemsets with their support.
res.head(n=5)

Unnamed: 0,support,itemsets
0,0.757904,(S)
1,0.274201,(Northern Metropolitan)
2,0.096345,(Jellis)
3,0.025736,(Biggin)
4,0.242096,(N)


In [37]:
# Frequent itemsets that contain exactly three items, most frequent first.
(
    res
    .loc[res['itemsets'].map(len).eq(3)]
    .sort_values(by='support', ascending=False)
)

Unnamed: 0,support,itemsets
47,0.043869,"(Northern Metropolitan, S, Nelson)"
54,0.041516,"(Marshall, Southern Metropolitan, S)"
39,0.036495,"(Jellis, S, Southern Metropolitan)"
87,0.032278,"(Buxton, S, Southern Metropolitan)"
62,0.02915,"(hockingstuart, S, Southern Metropolitan)"
74,0.024101,"(S, Northern Metropolitan, Barry)"
73,0.023039,"(Western Metropolitan, S, Barry)"
37,0.022006,"(Jellis, S, Northern Metropolitan)"
49,0.019998,"(Western Metropolitan, S, Nelson)"
81,0.018563,"(S, Northern Metropolitan, Ray)"


In [43]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose confidence is at least 0.1.
ar = association_rules(
    res,
    metric="confidence",
    min_threshold=0.1,
)

In [45]:
# Preview the first five rules with their quality metrics.
ar.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(S),(Northern Metropolitan),0.757904,0.274201,0.215011,0.283692,1.034612,0.007193,1.01325
1,(Northern Metropolitan),(S),0.274201,0.757904,0.215011,0.784137,1.034612,0.007193,1.121526
2,(Jellis),(S),0.096345,0.757904,0.071269,0.739726,0.976015,-0.001751,0.930157
3,(Jellis),(Northern Metropolitan),0.096345,0.274201,0.029064,0.301668,1.10017,0.002646,1.039332
4,(Northern Metropolitan),(Jellis),0.274201,0.096345,0.029064,0.105996,1.10017,0.002646,1.010795


In [48]:
# Rules in which either side (antecedent or consequent) holds exactly two
# items, ranked by confidence; show the top 20.
(
    ar
    .loc[
        ar['antecedents'].map(len).eq(2)
        | ar['consequents'].map(len).eq(2)
    ]
    .sort_values(by='confidence', ascending=False)
    .head(20)
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
141,"(Jas, S)",(Western Metropolitan),0.010587,0.195071,0.010558,0.99729,5.11245,0.008493,297.01885
59,"(N, Marshall)",(Southern Metropolitan),0.014919,0.339588,0.014374,0.963462,2.837148,0.009308,18.074434
55,"(Marshall, S)",(Southern Metropolitan),0.043238,0.339588,0.041516,0.960186,2.827502,0.026833,16.587348
135,"(Miles, S)",(Eastern Metropolitan),0.01119,0.125581,0.010185,0.910256,7.248361,0.00878,9.743526
125,"(Buxton, S)",(Southern Metropolitan),0.039479,0.339588,0.032278,0.817587,2.407586,0.018871,3.620426
113,"(Northern Metropolitan, Ray)",(S),0.022723,0.757904,0.018563,0.816919,1.077866,0.001341,1.322343
121,"(Buxton, N)",(Southern Metropolitan),0.014116,0.339588,0.011505,0.815041,2.400087,0.006712,3.570579
134,"(Miles, Eastern Metropolitan)",(S),0.012509,0.757904,0.010185,0.81422,1.074305,0.000704,1.303132
142,"(Jas, Western Metropolitan)",(S),0.013026,0.757904,0.010558,0.810573,1.069492,0.000686,1.27804
143,(Jas),"(Western Metropolitan, S)",0.013083,0.15169,0.010558,0.807018,5.32018,0.008574,4.395789
