In [1]:
import pandas as pd
import numpy as np
# Read only the columns this notebook uses from the raw export.
df = pd.read_csv('out.csv', usecols=['id', 'title', 'artist', 'year', 'genres'])

# Keep the id on the first row of each id value and blank it (NaN) on repeats;
# the finite-value filter right below then removes the repeated rows.
df['id'] = df['id'].where(~df['id'].duplicated())
df = df[np.isfinite(df['id'])]

# Rows without a finite year cannot be cast to int, so they are dropped first.
df = df[np.isfinite(df['year'])]
df['year'] = df['year'].astype(int)

# Discard rows whose title or artist is exactly a single space.
df = df[df['title'].astype(str).ne(" ")]
df = df[df['artist'].astype(str).ne(" ")]

# Keep records whose year is 1999 or later.
df = df[df['year'].ge(1999)]

In [2]:
# Turn every column into lower-cased strings (missing values become the text 'nan').
df = df.astype(str).apply(lambda column: column.str.lower())

In [3]:
# Persist the cleaned table; the row index is not written.
df.to_csv('d_table.csv', encoding='utf-8', index=False)

In [72]:
# Build p_table.csv from the raw review file, keeping only the matching columns.
df = pd.read_csv('pitchfork_reviews.csv', usecols=['id', 'title', 'artist', 'year'])

# Blank the id on repeated rows (index alignment leaves NaN there), then keep
# only the rows that still carry a finite id.
df.id = df.id.drop_duplicates()
df = df[np.isfinite(df['id'])]

# Normalise every column to lower-cased strings.
df = pd.concat([df[col].astype(str).str.lower() for col in df.columns], axis=1)

# The cells that consume p_table.csv key the table on 'reviewid' and use
# 'ltable_reviewid' as the foreign key, so the identifier column is written
# under that name. Writing it as 'id' made those reads fail on a fresh run.
df = df.rename(columns={'id': 'reviewid'})
df.to_csv('p_table.csv', index=False, encoding='utf-8')

In [5]:
import py_entitymatching as em
# Left table (A) is keyed on 'reviewid', right table (B) on 'id'.
A = em.read_csv_metadata(file_path='p_table.csv', key='reviewid')
B = em.read_csv_metadata(file_path='d_table.csv', key='id')

In [38]:
# Overlap blocker instance; the next cell calls its block_tables method on A and B.
ob = em.OverlapBlocker()

In [93]:
# Candidate pairs from overlap blocking on `title` (word_level=True, overlap_size=3).
# title/artist/year from both sides are carried into the candidate set.
C = ob.block_tables(
    A, B, 'title', 'title',
    overlap_size=3,
    word_level=True,
    l_output_attrs=['title', 'artist', 'year'],
    r_output_attrs=['title', 'artist', 'year'],
    show_progress=False,
)

In [94]:
# Filter candidate set C with an attribute-equivalence blocker on `title`,
# then save the surviving pairs (the file name has no extension).
ab = em.AttrEquivalenceBlocker()
D = ab.block_candset(C, 'title', 'title', show_progress=False)
D.to_csv('tuples_after_blocking', encoding='utf-8', index=False)

In [46]:
# Draw the candidate pairs that are written out for labelling.
# random_state pins the draw so a re-run regenerates the same sample.csv, and
# the size is capped at len(D) because sampling more rows than exist (without
# replacement) raises.
D = D.sample(n=min(3000, len(D)), random_state=0)
D.to_csv('sample.csv', index=False, encoding='utf-8')

In [48]:
# Load the labelled pairs and show how many rows carry each label value.
df = pd.read_csv('labeled_tuples.csv')
df['label'].value_counts()

1    2075
0     925
Name: label, dtype: int64

In [17]:
# Reload the full blocked candidate set, declaring its key and its
# foreign keys into A (left) and B (right).
tmp = em.read_csv_metadata(
    'tuples_after_blocking',
    key='_id',
    fk_ltable='ltable_reviewid',
    fk_rtable='rtable_id',
    ltable=A,
    rtable=B,
)

In [20]:
# Feature table for matching A against B. The same call is repeated in a later
# cell, which also displays the generated feature names.
F = em.get_features_for_matching(A, B)

In [10]:
# Load the labelled pairs as a candidate set linked to A and B.
S = em.read_csv_metadata(
    'labeled_tuples.csv',
    key='_id',
    fk_ltable='ltable_reviewid',
    fk_rtable='rtable_id',
    ltable=A,
    rtable=B,
)

In [11]:
# Seeded split of the labelled pairs with train_proportion=0.6:
# I is the training part, J the test part.
IJ = em.split_train_test(S, random_state=0, train_proportion=0.6)
I, J = IJ['train'], IJ['test']

In [12]:
# Generate the matching features for A vs. B and list their names.
F = em.get_features_for_matching(ltable=A, rtable=B)
F['feature_name']

0           title_title_jac_qgm_3_qgm_3
1       title_title_cos_dlm_dc0_dlm_dc0
2       title_title_jac_dlm_dc0_dlm_dc0
3                       title_title_mel
4                  title_title_lev_dist
5                   title_title_lev_sim
6                       title_title_nmw
7                        title_title_sw
8         artist_artist_jac_qgm_3_qgm_3
9     artist_artist_cos_dlm_dc0_dlm_dc0
10    artist_artist_jac_dlm_dc0_dlm_dc0
11                    artist_artist_mel
12               artist_artist_lev_dist
13                artist_artist_lev_sim
14                    artist_artist_nmw
15                     artist_artist_sw
Name: feature_name, dtype: object

In [13]:
# Feature vectors for the training pairs; the 'label' column is carried along.
H = em.extract_feature_vecs(
    I,
    show_progress=False,
    attrs_after='label',
    feature_table=F,
)

In [75]:
# Does H contain any missing feature value?
# The previous `any(pd.notnull(H))` iterated over the DataFrame's column
# labels rather than its cells, so it was True no matter what the data held.
H.isnull().values.any()

True

In [14]:
# Mean-impute the feature columns of H; key, foreign-key and label columns are excluded.
H = em.impute_table(
    H,
    strategy='mean',
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
)

In [60]:
# True when every cell of H is populated, i.e. no missing values are left.
# The previous `any(pd.notnull(H))` iterated over column labels rather than
# cells, so it could not detect missing values.
H.notnull().values.all()

True

In [15]:
# Matchers constructed with a fixed random_state.
dt = em.DTMatcher(random_state=0, name='DecisionTree')
svm = em.SVMMatcher(random_state=0, name='SVM')
rf = em.RFMatcher(random_state=0, name='RF')
lg = em.LogRegMatcher(random_state=0, name='LogReg')

# Matchers constructed without a random_state argument.
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NB')

In [90]:
# Compare the six matchers on H by 5-fold cross-validation scored on recall,
# then show the per-fold statistics.
result = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
    target_attr='label',
    metric='recall',
    k=5,
    random_state=0,
)
result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x0000023593574400>,5,0.988048,0.996109,0.995652,0.992509,0.988679,0.9922
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x0000023588605BE0>,5,0.996016,0.996109,1.0,0.985019,0.992453,0.993919
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x0000023593574978>,5,0.996016,1.0,1.0,0.996255,0.996226,0.997699
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x00000235886052E8>,5,0.988048,0.992218,1.0,0.985019,0.988679,0.990793
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x0000023588605BA8>,5,0.996016,0.996109,1.0,0.992509,0.992453,0.995417
5,NB,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x00000235886059B0>,5,0.992032,0.996109,1.0,0.985019,0.992453,0.993122


In [28]:
# Train the random-forest matcher on the training feature vectors.
rf.fit(
    table=H,
    target_attr='label',
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
)

# Build feature vectors for tmp (the candidate set reloaded from
# 'tuples_after_blocking'); this set has no label column.
L = em.extract_feature_vecs(tmp, show_progress=False, feature_table=F)

# Predict on L; the result is a new table with a 'predicted' column appended.
predictions = rf.predict(
    table=L,
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id'],
    target_attr='predicted',
    append=True,
    inplace=False,
)
predictions

Unnamed: 0,_id,ltable_reviewid,rtable_id,title_title_jac_qgm_3_qgm_3,title_title_cos_dlm_dc0_dlm_dc0,title_title_jac_dlm_dc0_dlm_dc0,title_title_mel,title_title_lev_dist,title_title_lev_sim,title_title_nmw,title_title_sw,artist_artist_jac_qgm_3_qgm_3,artist_artist_cos_dlm_dc0_dlm_dc0,artist_artist_jac_dlm_dc0_dlm_dc0,artist_artist_mel,artist_artist_lev_dist,artist_artist_lev_sim,artist_artist_nmw,artist_artist_sw,predicted
0,110,14183,221582.0,1.0,1.0,1.0,1.0,0.0,1.0,19.0,19.0,0.863636,0.866025,0.750000,0.897143,18.0,0.485714,-1.0,17.0,1
1,111,8784,198710.0,1.0,1.0,1.0,1.0,0.0,1.0,19.0,19.0,0.880000,0.894427,0.800000,0.897674,22.0,0.488372,-1.0,21.0,1
2,148,15387,304915.0,1.0,1.0,1.0,1.0,0.0,1.0,14.0,14.0,0.880000,0.894427,0.800000,0.897561,21.0,0.487805,-1.0,20.0,1
3,150,2140,16976.0,1.0,1.0,1.0,1.0,0.0,1.0,15.0,15.0,0.785714,0.816497,0.666667,0.894737,10.0,0.473684,-1.0,9.0,1
4,284,8147,43387.0,1.0,1.0,1.0,1.0,0.0,1.0,18.0,18.0,0.880000,0.866025,0.750000,0.897561,21.0,0.487805,-1.0,20.0,1
5,338,17155,490503.0,1.0,1.0,1.0,1.0,0.0,1.0,16.0,16.0,0.833333,0.816497,0.666667,0.896296,14.0,0.481481,-1.0,13.0,1
6,371,771,80438.0,1.0,1.0,1.0,1.0,0.0,1.0,25.0,25.0,0.906250,0.894427,0.800000,0.898182,28.0,0.490909,-1.0,27.0,1
7,373,10682,82678.0,1.0,1.0,1.0,1.0,0.0,1.0,13.0,13.0,0.833333,0.866025,0.750000,0.896774,16.0,0.483871,-1.0,15.0,0
8,374,9252,91775.0,1.0,1.0,1.0,1.0,0.0,1.0,22.0,22.0,0.800000,0.000000,0.000000,0.895238,11.0,0.476190,-1.0,10.0,1
9,399,11040,104882.0,1.0,1.0,1.0,1.0,0.0,1.0,17.0,17.0,0.833333,0.816497,0.666667,0.896296,14.0,0.481481,-1.0,13.0,1


In [91]:
# Train the decision-tree matcher on the training feature vectors.
dt.fit(
    table=H,
    target_attr='label',
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
)

# Feature vectors for the held-out pairs J, keeping their labels.
L = em.extract_feature_vecs(
    J,
    show_progress=False,
    attrs_after='label',
    feature_table=F,
)

# Predict on L; the result is a new table with a 'predicted' column appended.
predictions = dt.predict(
    table=L,
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
    target_attr='predicted',
    append=True,
    inplace=False,
)

In [92]:
# Compare the 'predicted' column against 'label' and print the summary.
eval_result = em.eval_matches(
    predictions,
    gold_label_attr='label',
    predicted_label_attr='predicted',
)
em.print_eval_summary(eval_result)

Precision : 98.88% (798/807)
Recall : 99.13% (798/805)
F1 : 99.01%
False positives : 9 (out of 807 positive predictions)
False negatives : 7 (out of 393 negative predictions)


In [66]:
# Train the SVM matcher on the training feature vectors.
svm.fit(
    table=H,
    target_attr='label',
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
)

# Feature vectors for the held-out pairs J, keeping their labels.
L = em.extract_feature_vecs(
    J,
    show_progress=False,
    attrs_after='label',
    feature_table=F,
)

# Predict on L, then score the 'predicted' column against 'label'.
predictions = svm.predict(
    table=L,
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
    target_attr='predicted',
    append=True,
    inplace=False,
)
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)

Precision : 97.92% (800/817)
Recall : 99.38% (800/805)
F1 : 98.64%
False positives : 17 (out of 817 positive predictions)
False negatives : 5 (out of 383 negative predictions)


In [68]:
# Train the logistic-regression matcher on the training feature vectors.
lg.fit(
    table=H,
    target_attr='label',
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
)

# Feature vectors for the held-out pairs J, keeping their labels.
L = em.extract_feature_vecs(
    J,
    show_progress=False,
    attrs_after='label',
    feature_table=F,
)

# Predict on L, then score the 'predicted' column against 'label'.
predictions = lg.predict(
    table=L,
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
    target_attr='predicted',
    append=True,
    inplace=False,
)
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)

Precision : 99.5% (796/800)
Recall : 98.88% (796/805)
F1 : 99.19%
False positives : 4 (out of 800 positive predictions)
False negatives : 9 (out of 400 negative predictions)


In [69]:
# Train the linear-regression matcher on the training feature vectors.
ln.fit(
    table=H,
    target_attr='label',
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
)

# Feature vectors for the held-out pairs J, keeping their labels.
L = em.extract_feature_vecs(
    J,
    show_progress=False,
    attrs_after='label',
    feature_table=F,
)

# Predict on L, then score the 'predicted' column against 'label'.
predictions = ln.predict(
    table=L,
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
    target_attr='predicted',
    append=True,
    inplace=False,
)
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)

Precision : 99.5% (797/801)
Recall : 99.01% (797/805)
F1 : 99.25%
False positives : 4 (out of 801 positive predictions)
False negatives : 8 (out of 399 negative predictions)


In [77]:
# Fresh NB matcher instance; rebinds `nb`, which an earlier cell already
# created with the same arguments.
nb = em.NBMatcher(name='NB')

In [78]:
# Train the NB matcher on the training feature vectors.
nb.fit(
    table=H,
    target_attr='label',
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
)

# Feature vectors for the held-out pairs J, keeping their labels.
L = em.extract_feature_vecs(
    J,
    show_progress=False,
    attrs_after='label',
    feature_table=F,
)

# Predict on L, then score the 'predicted' column against 'label'.
predictions = nb.predict(
    table=L,
    exclude_attrs=['_id', 'ltable_reviewid', 'rtable_id', 'label'],
    target_attr='predicted',
    append=True,
    inplace=False,
)
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)

Precision : 99.5% (797/801)
Recall : 99.01% (797/805)
F1 : 99.25%
False positives : 4 (out of 801 positive predictions)
False negatives : 8 (out of 399 negative predictions)
