# Association Rule Mining

### Import data

In [12]:
import arff 
import pandas as pd, numpy as np
from IPython.display import display, HTML
from sklearn.preprocessing import LabelBinarizer #for dummification
from sklearn.feature_selection import SelectKBest, f_classif
from mlxtend.frequent_patterns import apriori, association_rules

data = pd.read_csv('dataset1/pd_speech_features_copy1.csv', sep=',')
data = data.sort_values('id', ascending=True)
data = data.groupby('id').mean().reset_index()


print(data.shape)

(252, 755)


In [13]:
data.head(5)

Unnamed: 0,id,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,0,1.0,0.823387,0.69637,0.56725,235.333333,234.333333,0.00822,7.3e-05,0.001963,...,1.561733,2.862,12.293333,9.7175,9.0553,6.2591,4.021533,4.164333,22.9617,1.0
1,1,0.0,0.415637,0.793993,0.592453,213.333333,211.0,0.008884,0.001849,0.00579,...,4.918567,4.827133,6.117633,8.599667,5.737233,7.933133,5.490533,4.941833,4.467233,1.0
2,2,1.0,0.801973,0.619967,0.520563,319.333333,318.333333,0.006041,0.000104,0.002217,...,41.1294,31.201933,14.584467,5.4468,3.462,4.772067,9.176633,11.8481,5.552367,1.0
3,3,0.0,0.828707,0.626097,0.537183,493.0,492.0,0.003913,4.2e-05,0.000757,...,1.677633,1.9084,2.842167,3.493867,3.282433,3.085267,3.184433,4.032933,22.773633,1.0
4,4,0.0,0.831287,0.779397,0.726717,362.666667,361.666667,0.005622,0.002023,0.003593,...,4.1046,4.285233,2.9532,2.799933,2.6451,2.811367,7.268333,13.338833,63.7669,1.0


### Feature Selection

In [14]:
X = data.drop(columns=['class'])
y = data['class'].values

kbest_classifier = SelectKBest(score_func=f_classif, k=10)
kbest_classifier.fit_transform(X,y)

kbest_columns = kbest_classifier.get_support(indices=True)
kbest_df = data.iloc[:,kbest_columns]

kbest_df.head(5)

Unnamed: 0,mean_MFCC_2nd_coef,std_8th_delta_delta,std_9th_delta_delta,tqwt_stdValue_dec_11,tqwt_stdValue_dec_12,tqwt_minValue_dec_11,tqwt_minValue_dec_12,tqwt_maxValue_dec_11,tqwt_maxValue_dec_12,tqwt_kurtosisValue_dec_27
0,2.868933,0.013342,0.021728,0.004519,0.003775,-0.038598,-0.050074,0.039194,0.050074,1.5465
1,2.9091,0.015878,0.013229,0.004287,0.015735,-0.101033,-0.134677,0.098804,0.131953,7.049367
2,-0.431677,0.014602,0.013638,0.014023,0.040322,-0.069765,-0.153913,0.067104,0.147457,1.581967
3,0.497686,0.018224,0.016633,0.007439,0.034667,-0.068975,-0.136643,0.059328,0.136643,2.382533
4,3.3261,0.016386,0.016954,0.001815,0.005032,-0.015796,-0.023543,0.017359,0.023543,3.881267


### Preprocessing

In [7]:
kbest_df.fillna('6', inplace = True)

In [11]:
dummylist = []
for att in kbest_df:
    dummylist.append(pd.get_dummies(kbest_df[[att]]))
dummified_df = pd.concat(dummylist, axis=1)
dummified_df.head(5)

Unnamed: 0,mean_MFCC_2nd_coef,std_8th_delta_delta,std_9th_delta_delta,tqwt_stdValue_dec_11,tqwt_stdValue_dec_12,tqwt_minValue_dec_11,tqwt_minValue_dec_12,tqwt_maxValue_dec_11,tqwt_maxValue_dec_12,tqwt_kurtosisValue_dec_27
0,2.868933,0.013342,0.021728,0.004519,0.003775,-0.038598,-0.050074,0.039194,0.050074,1.5465
1,2.9091,0.015878,0.013229,0.004287,0.015735,-0.101033,-0.134677,0.098804,0.131953,7.049367
2,-0.431677,0.014602,0.013638,0.014023,0.040322,-0.069765,-0.153913,0.067104,0.147457,1.581967
3,0.497686,0.018224,0.016633,0.007439,0.034667,-0.068975,-0.136643,0.059328,0.136643,2.382533
4,3.3261,0.016386,0.016954,0.001815,0.005032,-0.015796,-0.023543,0.017359,0.023543,3.881267


In [16]:
# normalização [0,1]
from sklearn.preprocessing import Normalizer
transf = Normalizer().fit(dummified_df)
dummified_df = pd.DataFrame(transf.transform(dummified_df, copy=True), columns= dummified_df.columns)
dummified_df.describe(include='all')

Unnamed: 0,mean_MFCC_2nd_coef,std_8th_delta_delta,std_9th_delta_delta,tqwt_stdValue_dec_11,tqwt_stdValue_dec_12,tqwt_minValue_dec_11,tqwt_minValue_dec_12,tqwt_maxValue_dec_11,tqwt_maxValue_dec_12,tqwt_kurtosisValue_dec_27
count,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0
mean,0.302892,0.004235,0.004085,0.002801,0.004049,-0.016852,-0.021464,0.01662,0.021469,0.865217
std,0.354151,0.003588,0.003423,0.004811,0.006477,0.02331,0.026759,0.022569,0.026715,0.175358
min,-0.497584,5e-05,4.8e-05,1.6e-05,4.3e-05,-0.168722,-0.194626,0.00011,0.000196,0.305166
25%,0.012489,0.000883,0.00087,0.000355,0.000635,-0.0228,-0.030478,0.002362,0.003616,0.768871
50%,0.210387,0.003727,0.003694,0.001056,0.001911,-0.008105,-0.011284,0.007995,0.011465,0.956234
75%,0.635411,0.006655,0.006722,0.003242,0.004859,-0.002502,-0.003658,0.021778,0.030182,0.997323
max,0.952222,0.016608,0.015556,0.037891,0.060348,-0.000111,-0.000196,0.154475,0.194626,0.999999


In [35]:

attributes = [attr[0] for attr in dummified_df['attributes']]
df = pd.DataFrame(data=dummified_df['data'], columns=attributes)
df.head(5)



newdf = dummified_df.copy()
for col in newdf:
    if col not in ['class','a01','a02']: 
        newdf[col] = pd.cut(newdf[col],3,labels=['0','1','2'])
newdf.head(5)

KeyError: 'attributes'

### Patern mining


### Frequent itemset mining and association rule mining

In [17]:
frequent_itemsets = {}
minpaterns = 30
minsup = 1.0
while minsup>0:
    minsup = minsup*0.9
    frequent_itemsets = apriori(dummified_df, min_support=0.3, use_colnames=True)
    if len(frequent_itemsets) >= minpaterns:
        print("Minimum support:",minsup)
        break
print("Number of found patterns:",len(frequent_itemsets))

ValueError: The allowed values for a DataFrame are True, False, 0, 1. Found value 0.8798932788076287