In [93]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt

In [94]:
# Read the house-price CSV from the project-relative Dataset folder.
csv_path = './Dataset/House_Price_Dataset.csv'
data = pd.read_csv(csv_path)

# Preview the first rows to check which columns were read.
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,Seller,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,PropertyCount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,03/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,03/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,04/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,04/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,04/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [95]:
# Count the distinct values in the Seller column.
# Series.nunique() is used instead of np.unique(...).size: np.unique has to
# sort the column and can raise TypeError on an object column that mixes
# strings with missing values (NaN), while nunique() skips missing values by
# default and returns the same count when none are present.
data["Seller"].nunique()

388

In [96]:
# Number of rows per seller, largest first, as a two-column frame
# ('Seller', 'counts').
rows_per_seller = data.groupby('Seller').size()
seller_gr = (
    rows_per_seller
    .sort_values(ascending=False)
    .reset_index(name='counts')
)
seller_gr.head()

Unnamed: 0,Seller,counts
0,Jellis,3359
1,Nelson,3236
2,Barry,3235
3,hockingstuart,2623
4,Marshall,2027


In [97]:
# Keep only the three categorical columns used for rule mining and drop every
# row in which any of them is missing (dropna() defaults to how='any', which
# matches the original three not-null conditions combined with &).
# .copy() gives `cut` its own data, so later column assignments on it do not
# write into a filtered slice of `data` (which can raise SettingWithCopyWarning).
cut = data[["Method", "Seller", "RegionName"]].dropna().copy()
cut.head()

Unnamed: 0,Method,Seller,RegionName
0,SS,Jellis,Northern Metropolitan
1,S,Biggin,Northern Metropolitan
2,S,Biggin,Northern Metropolitan
3,VB,Rounds,Northern Metropolitan
4,SP,Biggin,Northern Metropolitan


In [98]:
# Show the sorted distinct values of the Method column.
method_codes = np.unique(cut["Method"])
print(method_codes)

['PI' 'PN' 'S' 'SA' 'SN' 'SP' 'SS' 'VB' 'W']


In [99]:
# Collapse the Method codes into two classes: every code starting with 'S'
# becomes 'S', everything else becomes 'N'.
# assign() rebinds `cut` to a new frame instead of writing a column into the
# existing (filtered) frame, which avoids SettingWithCopyWarning and keeps the
# cell safe to re-run ('S' and 'N' map to themselves).
# NOTE(review): codes are grouped purely by their first letter — confirm against
# the dataset's data dictionary that this matches the intended two classes.
cut = cut.assign(
    Method=cut["Method"].apply(lambda code: 'S' if code.startswith('S') else 'N')
)

In [100]:
# One single-item frozenset per distinct Method label, in sorted label order.
# frozenset([label]) wraps the whole label as one item; frozenset(label) would
# iterate the string and split any multi-character label into separate
# characters, so it was only correct for one-character labels.
methods = [frozenset([label]) for label in np.unique(cut["Method"])]
methods

[frozenset({'N'}), frozenset({'S'})]

In [101]:
# Convert the frame to a NumPy array with one row per record; this array is
# what gets passed to the TransactionEncoder as the list of transactions.
# NOTE(review): values from all columns of `cut` end up in one shared item
# namespace, so a Seller or RegionName value equal to a Method label would be
# treated as the same item — confirm no such collision exists in the data.
np_cut = cut.to_numpy()

In [102]:
# One-hot encode the transactions: learn the distinct items first, then turn
# every transaction into a boolean row with one column per item.
te = TransactionEncoder()
te.fit(np_cut)
te_ary = te.transform(np_cut)

# Wrap the boolean matrix in a DataFrame labelled with the learned item names.
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df.head()

Unnamed: 0,@Realty,A,AIME,ASL,Abercromby's,Ace,Airport,Alex,Alexkarbon,Allan,...,hockingstuart/Marshall,hockingstuart/Sweeney,hockingstuart/Village,hockingstuart/hockingstuart,iHomes,iOne,iProperty,iSell,iTRAK,voglwalpole
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [103]:
# Mine frequent itemsets with FP-Growth on the one-hot encoded frame.
#   min_support=0.01  -> minimum support threshold passed to fpgrowth
#   use_colnames=True -> itemsets are reported with item names
#   max_len=3         -> upper bound on the itemset size
fpgrowth_options = {"min_support": 0.01, "use_colnames": True, "max_len": 3}
res = fpgrowth(df, **fpgrowth_options)

In [104]:
# Preview the first five frequent itemsets (support, itemsets).
res.head(n=5)

Unnamed: 0,support,itemsets
0,0.757904,(S)
1,0.274201,(Northern Metropolitan)
2,0.096345,(Jellis)
3,0.025736,(Biggin)
4,0.242096,(N)


In [105]:
# Frequent itemsets that contain exactly three items, highest support first.
has_three_items = res['itemsets'].map(len) == 3
res.loc[has_three_items].sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
47,0.043869,"(Northern Metropolitan, Nelson, S)"
54,0.041516,"(S, Marshall, Southern Metropolitan)"
39,0.036495,"(Southern Metropolitan, Jellis, S)"
87,0.032278,"(Southern Metropolitan, Buxton, S)"
62,0.02915,"(Southern Metropolitan, S, hockingstuart)"
74,0.024101,"(Northern Metropolitan, S, Barry)"
73,0.023039,"(S, Barry, Western Metropolitan)"
37,0.022006,"(Northern Metropolitan, Jellis, S)"
49,0.019998,"(Nelson, S, Western Metropolitan)"
81,0.018563,"(Northern Metropolitan, Ray, S)"


In [112]:
# Derive association rules from the frequent itemsets, using support as the
# filtering metric with a minimum threshold of 0.01.
ar = association_rules(
    res,
    min_threshold=0.01,
    metric="support",
)

In [113]:
# Preview the first five association rules.
ar.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Northern Metropolitan),(S),0.274201,0.757904,0.215011,0.784137,1.034612,0.007193,1.121526
1,(S),(Northern Metropolitan),0.757904,0.274201,0.215011,0.283692,1.034612,0.007193,1.01325
2,(Jellis),(S),0.096345,0.757904,0.071269,0.739726,0.976015,-0.001751,0.930157
3,(S),(Jellis),0.757904,0.096345,0.071269,0.094034,0.976015,-0.001751,0.997449
4,(Northern Metropolitan),(Jellis),0.274201,0.096345,0.029064,0.105996,1.10017,0.002646,1.010795


In [114]:
# Rules of the form {item, item} -> {Method label}: two-item antecedent and a
# single-item consequent that is one of the Method itemsets in `methods`.
two_item_antecedent = ar['antecedents'].map(len) == 2
single_item_consequent = ar['consequents'].map(len) == 1
consequent_is_method = ar['consequents'].isin(methods)

# Rank by confidence, breaking ties by lift, and show at most 100 rules.
(
    ar.loc[two_item_antecedent & (single_item_consequent & consequent_is_method)]
    .sort_values(by=['confidence', 'lift'], ascending=False)
    .head(100)
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
156,"(Northern Metropolitan, Ray)",(S),0.022723,0.757904,0.018563,0.816919,1.077866,0.001341,1.322343
193,"(Eastern Metropolitan, Miles)",(S),0.012509,0.757904,0.010185,0.81422,1.074305,0.000704,1.303132
205,"(Jas, Western Metropolitan)",(S),0.013026,0.757904,0.010558,0.810573,1.069492,0.000686,1.27804
135,"(Northern Metropolitan, Barry)",(S),0.030269,0.757904,0.024101,0.796209,1.05054,0.001159,1.187957
106,"(hockingstuart, Western Metropolitan)",(S),0.013485,0.757904,0.010673,0.791489,1.044313,0.000453,1.161071
130,"(Barry, Western Metropolitan)",(S),0.029408,0.757904,0.023039,0.783415,1.033659,0.00075,1.117784
44,"(Northern Metropolitan, Nelson)",(S),0.05612,0.757904,0.043869,0.781697,1.031393,0.001335,1.108991
12,"(Northern Metropolitan, Jellis)",(S),0.029064,0.757904,0.022006,0.757157,0.999014,-2.2e-05,0.996922
99,"(Southern Metropolitan, hockingstuart)",(S),0.038503,0.757904,0.02915,0.757079,0.998911,-3.2e-05,0.996602
93,"(Northern Metropolitan, hockingstuart)",(S),0.01403,0.757904,0.01053,0.750511,0.990245,-0.000104,0.970367
