# Mini projet Qualité de Données : Détections des doublons
## ***Christophe COMPAIN / Sander COHEN***

### Objectif et Données Disponibles
L'objectif du projet est d'identifier les logiciels vendus sur les deux plateformes.

Pour ce faire, nous disposons des données pour chacune des plateformes isolément, respectivement dans les fichiers ***Company1.csv*** et ***Company2.csv***. 

### Import packages, Variables Globales et import csv

In [1]:
import pandas as pd
import nltk
import time
import numpy as np
import math
import re
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scohe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\scohe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
path = "D:\\OneDrive - Université Paris-Dauphine\\Bureau\\Cours Master\\12-Qualité de Données\\\Projet\\mini-projet\\"
file1= "Data\\Company1.csv" #"SampleData\\Sample_Company1.csv"
file2= "Data\\Company2.csv" #"SampleData\\Sample_Company2.csv"
real= "Data\\Ground_truth_mappings.csv" #"SampleData\\Sample_Groud_truth_mappings.csv"

In [3]:
company1 = pd.read_csv(path+file1, encoding = "ISO-8859-1")
company2 = pd.read_csv(path+file2, encoding = "ISO-8859-1")
ground_truth_matches = pd.read_csv(path+real, encoding = "ISO-8859-1").drop_duplicates()

### Exploration des données

In [4]:
company1.head(5)

Unnamed: 0,id,title,description,manufacturer,price
0,b000jz4hqo,clickart 950 000 - premier image pack (dvd-rom),,broderbund,0.0
1,b0006zf55o,ca international - arcserve lap/desktop oem 30pk,oem arcserve backup v11.1 win 30u for laptops ...,computer associates,0.0
2,b00004tkvy,noah's ark activity center (jewel case ages 3-8),,victory multimedia,0.0
3,b000g80lqo,peachtree by sage premium accounting for nonpr...,peachtree premium accounting for nonprofits 20...,sage software,599.99
4,b0006se5bq,singing coach unlimited,singing coach unlimited - electronic learning ...,carry-a-tune technologies,99.99


In [5]:
company2.head(5)

Unnamed: 0,id,name,description,manufacturer,price
0,11125907881740407428,learning quickbooks 2007,learning quickbooks 2007,intuit,38.99
1,11538923464407758599,superstart! fun with reading & writing!,fun with reading & writing! is designed to hel...,,8.49
2,11343515411965421256,qb pos 6.0 basic software,qb pos 6.0 basic retail mngmt software. for re...,intuit,637.99
3,12049235575237146821,math missions: the amazing arcade adventure (g...,save spectacle city by disrupting randall unde...,,12.95
4,12244614697089679523,production prem cs3 mac upgrad,adobe cs3 production premium mac upgrade from ...,adobe software,805.99


In [6]:
ground_truth_matches.head(5)

Unnamed: 0,idCompany1,idCompany2
0,b000jz4hqo,18441480711193821750
1,b00004tkvy,18441110047404795849
2,b000g80lqo,18441188461196475272
3,b0006se5bq,18428750969726461849
4,b00021xhzw,18430621475529168165


#### Observation d'un premier duplicat

In [7]:
company1[company1.id == ground_truth_matches.idCompany1[1]]

Unnamed: 0,id,title,description,manufacturer,price
2,b00004tkvy,noah's ark activity center (jewel case ages 3-8),,victory multimedia,0.0


In [8]:
company2[company2.id == ground_truth_matches.idCompany2[1]]

Unnamed: 0,id,name,description,manufacturer,price
1881,18441110047404795849,the beginners bible: noah's ark activity cente...,,,9.95


In [9]:
stop_words = set(nltk.corpus.stopwords.words('english'))  
stop_words.update(["r","v","software","entertainment","inc","usa"])

def prep(texte):
    #suppression des caracteres non alphanumériques + tout en minuscule
    texte = re.sub("[^a-zA-Z0-9_]", " ",str(texte)).lower()
    #remplacement de mots
    texte = texte.replace("professional", "pro").replace(" upg "," upgrade ").replace(" dlx "," deluxe ")
    #tokenization par mot
    tokens = nltk.word_tokenize(texte)
    #supreesion des stopwords
    filtered_tokens = [w for w in tokens if not w in stop_words]
#    # Stemming
#    texte = [nltk.stem.SnowballStemmer('english').stem(w) for w in filtered_tokens]
    # Lemmatization
    texte = [nltk.stem.WordNetLemmatizer().lemmatize(w) for w in filtered_tokens]
    #remise sous forme d'une string
    return " ".join(texte)
        

In [10]:
##retraitement des prix
def retreatprice(texte):
    #suppression des caracteres non alphanumériques + tout en minuscule
    return float(re.sub("[^0-9.]", " ",str(texte)))


In [11]:
company1['Company']="company1"
company1=company1.rename(columns={"title": "name"})
company1['name'] = company1['name'].fillna(' ')
company1['manufacturer'] = company1['manufacturer'].fillna(' ')
company1['description'] = company1['description'].fillna(' ')
company1['price'] = company1['price'].fillna(' ')
company1['price_retreat'] = company1['price'].apply(retreatprice)
company1['full data']=company1['manufacturer'].apply(prep) + ' ' + company1['name'].apply(prep) # + ' ' + company1['description'].apply(prep)

company2['Company']="company2"
company2['name'] = company2['name'].fillna(' ')
company2['manufacturer'] = company2['manufacturer'].fillna(' ')
company2['description'] = company2['description'].fillna(' ')
company2['price'] = company2['price'].fillna(' ')
company2['price_retreat'] = company2['price'].apply(retreatprice)
company2['full data']=company2['manufacturer'].apply(prep) + ' ' + company2['name'].apply(prep) # + ' ' + company2['description'].apply(prep)


In [12]:
corpus = pd.concat([company1, company2],sort=False,ignore_index=True)
#corpus.reset_index(drop=True)
len(corpus)
corpus.tail()

Unnamed: 0,id,name,description,manufacturer,price,Company,price_retreat,full data
4584,14872602878188858026,jumpstart(r) advanced 1st grade,prepare your child for the 1st grade and beyon...,,19.99,company2,19.99,jumpstart advanced 1st grade
4585,14916162814320983138,ibm(r) viavoice(r) advanced edition 10,ibm viavoice advanced edition release 10 is a ...,,78.95,company2,78.95,ibm viavoice advanced edition 10
4586,14974113209571399013,xbox 360: gears of war,as marcus fenix you fight a war against the im...,,59.99,company2,59.99,xbox 360 gear war
4587,14986935400648190776,documents to go premium 7.0,this pda software enables you to use your docu...,,49.99,company2,49.99,document go premium 7 0
4588,14996991014087320062,microsoft(r) picture it! digital image pro 9.0,picture it! digital image pro puts you in cont...,,99.87,company2,99.87,microsoft picture digital image pro 9 0


In [13]:
#recherche des mots unique pour les supprimer
allwords = corpus['full data'].str.split(expand=True).stack().value_counts()
stop_unique = set(allwords[allwords==1].index)

def prep2(texte):
    tokens = nltk.word_tokenize(texte)
    #supreesion des stopwords
    filtered_tokens = [w for w in tokens if not w in stop_words]
    #remise sous forme d'une string
    return " ".join(filtered_tokens)
        

In [14]:
company1['full data']=company1['full data'].apply(prep)
company2['full data']=company2['full data'].apply(prep)

In [15]:
#company1_light = company1[company1['full data'].str.contains(filtre)].reset_index(drop=True)
#company2_light

In [16]:
###données punch software
filtre = "punch"
#stopwords_suppl =" software"
company1_light = company1[company1['full data'].str.contains(filtre)].reset_index(drop=True)
company2_light = company2[company2['full data'].str.contains(filtre)].reset_index(drop=True)
corpus = pd.concat([company1_light, company2_light],sort=False,ignore_index=True)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.5,sublinear_tf=True,stop_words=[filtre])#+stopwords_suppl]) #ngram_range=(1),
vectors = vectorizer.fit_transform(corpus['full data'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()

number_of_matches = 0
matches=[]
start = time.process_time()
for i in range(len(company1_light)):
    try :  
        price1 = float(company1_light.iloc[i,6]) 
    except : 
        price1 = 0
    tokens1name = nltk.word_tokenize(company1_light.iloc[i,7])
    ng1_tokensname = set(nltk.ngrams(tokens1name, n=1))
    for j in range(len(company2_light)):
        try :  
            price2 = float(company2_light.iloc[j,6]) 
        except : 
            price2 = 0
        tokens2name = nltk.word_tokenize(company2_light.iloc[j,7])
        ng2_tokensname = set(nltk.ngrams(tokens2name, n=1))
        jd_ng1_ng2_name = nltk.jaccard_distance(ng1_tokensname, ng2_tokensname)
        if price1* price2 == 0 or max(price1, price2)/min(price1, price2)<2:
            try :
                similarity = np.dot(dense[i],np.transpose(dense[len(company1_light)+j])).item(0)/math.sqrt(np.dot(dense[i],np.transpose(dense[i])).item(0) * np.dot(dense[len(company1_light)+j],np.transpose(dense[len(company1_light)+j])).item(0))
            except : 
                similarity = 0
            if  ((similarity > 0.35)) :#or jd_ng1_ng2_name<0.1 :# or name_score<=1) :
                number_of_matches = number_of_matches +1
                matches.append((company1_light.iloc[i,0],company2_light.iloc[j,0]))
print("Number of matches: {}".format(number_of_matches))
matches_df = pd.DataFrame(matches)
matches_df.columns= ['idCompany1','idCompany2']
diff_df = pd.merge(ground_truth_matches, matches_df, how='outer', indicator='Exist')
true_positives = diff_df[diff_df.Exist=='both']
false_positives = diff_df[diff_df.Exist=='right_only']
false_negatives = diff_df[diff_df.Exist=='left_only']
end = time.process_time()
print("Processing time: {}".format(end - start))
print("Number of true positives: {}".format(len(true_positives)))
print("Number of false positives: {}".format(len(false_positives)))
print("Number of false negatives: {}".format(len(false_negatives)))
precision = len(true_positives)/(len(true_positives)+ len(false_positives))
print("Precision: {}".format(precision))
recall = len(true_positives)/(len(true_positives)+ len(false_negatives))
print("Recall: {}".format(recall))
f_measure = 2*(precision*recall)/(precision+recall)
print("F measure: {}".format(f_measure))

Number of matches: 57
Processing time: 0.25
Number of true positives: 28
Number of false positives: 29
Number of false negatives: 1272
Precision: 0.49122807017543857
Recall: 0.021538461538461538
F measure: 0.041267501842299194


In [17]:
###données topics
filtre = "topic"
#stopwords_suppl =" entertainment"
company1_light=company1[~company1.id.isin(matches_df.idCompany1)]
company2_light=company2[~company2.id.isin(matches_df.idCompany2)]
company1_light = company1_light[company1_light['full data'].str.contains(filtre)].reset_index(drop=True)
company2_light = company2_light[company2_light['full data'].str.contains(filtre)].reset_index(drop=True)
corpus = pd.concat([company1_light, company2_light],sort=False,ignore_index=True)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.05,sublinear_tf=True,stop_words=[filtre]) #+stopwords_suppl]) #ngram_range=(1),
vectors = vectorizer.fit_transform(corpus['full data'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()

new_number_of_matches = 0
new_matches=[]
start = time.process_time()
for i in range(len(company1_light)):
    try :  
        price1 = float(company1_light.iloc[i,6]) 
    except : 
        price1 = 0
    tokens1name = nltk.word_tokenize(company1_light.iloc[i,7])
    ng1_tokensname = set(nltk.ngrams(tokens1name, n=1))
    for j in range(len(company2_light)):
        try :  
            price2 = float(company2_light.iloc[j,6]) 
        except : 
            price2 = 0
        tokens2name = nltk.word_tokenize(company2_light.iloc[j,7])
        ng2_tokensname = set(nltk.ngrams(tokens2name, n=1))
        jd_ng1_ng2_name = nltk.jaccard_distance(ng1_tokensname, ng2_tokensname)
        if price1* price2 == 0 or max(price1, price2)/min(price1, price2)<2:
            try :
                similarity = np.dot(dense[i],np.transpose(dense[len(company1_light)+j])).item(0)/math.sqrt(np.dot(dense[i],np.transpose(dense[i])).item(0) * np.dot(dense[len(company1_light)+j],np.transpose(dense[len(company1_light)+j])).item(0))
            except : 
                similarity = 0
            if  ((similarity > 0.3)) or jd_ng1_ng2_name<0.2 :# or name_score<=1) :
                new_number_of_matches = new_number_of_matches +1
                new_matches.append((company1_light.iloc[i,0],company2_light.iloc[j,0]))
print("New matches: {}".format(new_number_of_matches))
number_of_matches= number_of_matches + new_number_of_matches
print("Total matches: {}".format(number_of_matches))
new_matches_df = pd.DataFrame(new_matches)
new_matches_df.columns= ['idCompany1','idCompany2']
matches_df = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)

diff_df = pd.merge(ground_truth_matches, matches_df, how='outer', indicator='Exist')
true_positives = diff_df[diff_df.Exist=='both']
false_positives = diff_df[diff_df.Exist=='right_only']
false_negatives = diff_df[diff_df.Exist=='left_only']
end = time.process_time()
print("Processing time: {}".format(end - start))
print("Number of true positives: {}".format(len(true_positives)))
print("Number of false positives: {}".format(len(false_positives)))
print("Number of false negatives: {}".format(len(false_negatives)))
precision = len(true_positives)/(len(true_positives)+ len(false_positives))
print("Precision: {}".format(precision))
recall = len(true_positives)/(len(true_positives)+ len(false_negatives))
print("Recall: {}".format(recall))
f_measure = 2*(precision*recall)/(precision+recall)
print("F measure: {}".format(f_measure))

New matches: 48
Total matches: 105
Processing time: 0.734375
Number of true positives: 58
Number of false positives: 47
Number of false negatives: 1242
Precision: 0.5523809523809524
Recall: 0.04461538461538461
F measure: 0.08256227758007116


In [18]:
###données apple
filtre = "apple"
company1_light=company1[~company1.id.isin(matches_df.idCompany1)]
company2_light=company2[~company2.id.isin(matches_df.idCompany2)]
company1_light = company1_light[company1_light['full data'].str.contains(filtre)].reset_index(drop=True)
company2_light = company2_light[company2_light['full data'].str.contains(filtre)].reset_index(drop=True)
corpus = pd.concat([company1_light, company2_light],sort=False,ignore_index=True)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.1,sublinear_tf=True,stop_words=[filtre]) #ngram_range=(1),
vectors = vectorizer.fit_transform(corpus['full data'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()

new_number_of_matches = 0
new_matches=[]
start = time.process_time()
for i in range(len(company1_light)):
    try :  
        price1 = float(company1_light.iloc[i,6]) 
    except : 
        price1 = 0
    tokens1name = nltk.word_tokenize(company1_light.iloc[i,7])
    ng1_tokensname = set(nltk.ngrams(tokens1name, n=1))
    for j in range(len(company2_light)):
        try :  
            price2 = float(company2_light.iloc[j,6]) 
        except : 
            price2 = 0
        tokens2name = nltk.word_tokenize(company2_light.iloc[j,7])
        ng2_tokensname = set(nltk.ngrams(tokens2name, n=1))
        jd_ng1_ng2_name = nltk.jaccard_distance(ng1_tokensname, ng2_tokensname)
        if price1* price2 == 0 or max(price1, price2)/min(price1, price2)<2:
            try :
                similarity = np.dot(dense[i],np.transpose(dense[len(company1_light)+j])).item(0)/math.sqrt(np.dot(dense[i],np.transpose(dense[i])).item(0) * np.dot(dense[len(company1_light)+j],np.transpose(dense[len(company1_light)+j])).item(0))
            except : 
                similarity = 0
            if  ((similarity > 0.4)) or jd_ng1_ng2_name<0.5 :# or name_score<=1) :
                new_number_of_matches = new_number_of_matches +1
                new_matches.append((company1_light.iloc[i,0],company2_light.iloc[j,0]))
print("New matches: {}".format(new_number_of_matches))
number_of_matches= number_of_matches + new_number_of_matches
print("Total matches: {}".format(number_of_matches))
new_matches_df = pd.DataFrame(new_matches)
new_matches_df.columns= ['idCompany1','idCompany2']
matches_df = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)

diff_df = pd.merge(ground_truth_matches, matches_df, how='outer', indicator='Exist')
true_positives = diff_df[diff_df.Exist=='both']
false_positives = diff_df[diff_df.Exist=='right_only']
false_negatives = diff_df[diff_df.Exist=='left_only']
end = time.process_time()
print("Processing time: {}".format(end - start))
print("Number of true positives: {}".format(len(true_positives)))
print("Number of false positives: {}".format(len(false_positives)))
print("Number of false negatives: {}".format(len(false_negatives)))
precision = len(true_positives)/(len(true_positives)+ len(false_positives))
print("Precision: {}".format(precision))
recall = len(true_positives)/(len(true_positives)+ len(false_negatives))
print("Recall: {}".format(recall))
f_measure = 2*(precision*recall)/(precision+recall)
print("F measure: {}".format(f_measure))

New matches: 66
Total matches: 171
Processing time: 0.609375
Number of true positives: 106
Number of false positives: 65
Number of false negatives: 1194
Precision: 0.6198830409356725
Recall: 0.08153846153846153
F measure: 0.1441196464989803


In [19]:
###données Encore
filtre = "encore"
company1_light=company1[~company1.id.isin(matches_df.idCompany1)]
company2_light=company2[~company2.id.isin(matches_df.idCompany2)]
company1_light = company1_light[company1_light['full data'].str.contains(filtre)].reset_index(drop=True)
company2_light = company2_light[company2_light['full data'].str.contains(filtre)].reset_index(drop=True)
corpus = pd.concat([company1_light, company2_light],sort=False,ignore_index=True)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.01,sublinear_tf=True,stop_words=[filtre]) #ngram_range=(1),
vectors = vectorizer.fit_transform(corpus['full data'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()

new_number_of_matches = 0
new_matches=[]
start = time.process_time()
for i in range(len(company1_light)):
    try :  
        price1 = float(company1_light.iloc[i,6]) 
    except : 
        price1 = 0
    tokens1name = nltk.word_tokenize(company1_light.iloc[i,7])
    ng1_tokensname = set(nltk.ngrams(tokens1name, n=1))
    for j in range(len(company2_light)):
        try :  
            price2 = float(company2_light.iloc[j,6]) 
        except : 
            price2 = 0
        tokens2name = nltk.word_tokenize(company2_light.iloc[j,7])
        ng2_tokensname = set(nltk.ngrams(tokens2name, n=1))
        jd_ng1_ng2_name = nltk.jaccard_distance(ng1_tokensname, ng2_tokensname)
        if price1* price2 == 0 or max(price1, price2)/min(price1, price2)<2:
            try :
                similarity = np.dot(dense[i],np.transpose(dense[len(company1_light)+j])).item(0)/math.sqrt(np.dot(dense[i],np.transpose(dense[i])).item(0) * np.dot(dense[len(company1_light)+j],np.transpose(dense[len(company1_light)+j])).item(0))
            except : 
                similarity = 0
            if ((similarity > 0.2)) or jd_ng1_ng2_name<0.2 :# or name_score<=1) :
                new_number_of_matches = new_number_of_matches +1
                new_matches.append((company1_light.iloc[i,0],company2_light.iloc[j,0]))
print("New matches: {}".format(new_number_of_matches))
number_of_matches= number_of_matches + new_number_of_matches
print("Total matches: {}".format(number_of_matches))
new_matches_df = pd.DataFrame(new_matches)
new_matches_df.columns= ['idCompany1','idCompany2']
matches_df = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)

diff_df = pd.merge(ground_truth_matches, matches_df, how='outer', indicator='Exist')
true_positives = diff_df[diff_df.Exist=='both']
false_positives = diff_df[diff_df.Exist=='right_only']
false_negatives = diff_df[diff_df.Exist=='left_only']
end = time.process_time()
print("Processing time: {}".format(end - start))
print("Number of true positives: {}".format(len(true_positives)))
print("Number of false positives: {}".format(len(false_positives)))
print("Number of false negatives: {}".format(len(false_negatives)))
precision = len(true_positives)/(len(true_positives)+ len(false_positives))
print("Precision: {}".format(precision))
recall = len(true_positives)/(len(true_positives)+ len(false_negatives))
print("Recall: {}".format(recall))
f_measure = 2*(precision*recall)/(precision+recall)
print("F measure: {}".format(f_measure))

New matches: 132
Total matches: 303
Processing time: 7.328125
Number of true positives: 198
Number of false positives: 105
Number of false negatives: 1102
Precision: 0.6534653465346535
Recall: 0.1523076923076923
F measure: 0.24703680598877104


In [20]:
###données Adobe
filtre = "adobe"
company1_light=company1[~company1.id.isin(matches_df.idCompany1)]
company2_light=company2[~company2.id.isin(matches_df.idCompany2)]
company1_light = company1_light[company1_light['full data'].str.contains(filtre)].reset_index(drop=True)
company2_light = company2_light[company2_light['full data'].str.contains(filtre)].reset_index(drop=True)
corpus = pd.concat([company1_light, company2_light],sort=False,ignore_index=True)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.5,sublinear_tf=True,stop_words=[filtre]) #ngram_range=(1),
vectors = vectorizer.fit_transform(corpus['full data'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()

new_number_of_matches = 0
new_matches=[]
start = time.process_time()
for i in range(len(company1_light)):
    try :  
        price1 = float(company1_light.iloc[i,6]) 
    except : 
        price1 = 0
    tokens1name = nltk.word_tokenize(company1_light.iloc[i,7])
    ng1_tokensname = set(nltk.ngrams(tokens1name, n=1))
    for j in range(len(company2_light)):
        try :  
            price2 = float(company2_light.iloc[j,6]) 
        except : 
            price2 = 0
        tokens2name = nltk.word_tokenize(company2_light.iloc[j,7])
        ng2_tokensname = set(nltk.ngrams(tokens2name, n=1))
        jd_ng1_ng2_name = nltk.jaccard_distance(ng1_tokensname, ng2_tokensname)
        if price1* price2 == 0 or max(price1, price2)/min(price1, price2)<2:
            try :
                similarity = np.dot(dense[i],np.transpose(dense[len(company1_light)+j])).item(0)/math.sqrt(np.dot(dense[i],np.transpose(dense[i])).item(0) * np.dot(dense[len(company1_light)+j],np.transpose(dense[len(company1_light)+j])).item(0))
            except : 
                similarity = 0
            if ((similarity > 0.6)) :# jd_ng1_ng2_name<0.3 :# or name_score<=1) :
                new_number_of_matches = new_number_of_matches +1
                new_matches.append((company1_light.iloc[i,0],company2_light.iloc[j,0]))
print("New matches: {}".format(new_number_of_matches))
number_of_matches= number_of_matches + new_number_of_matches
print("Total matches: {}".format(number_of_matches))
new_matches_df = pd.DataFrame(new_matches)
new_matches_df.columns= ['idCompany1','idCompany2']
matches_df = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)

diff_df = pd.merge(ground_truth_matches, matches_df, how='outer', indicator='Exist')
true_positives = diff_df[diff_df.Exist=='both']
false_positives = diff_df[diff_df.Exist=='right_only']
false_negatives = diff_df[diff_df.Exist=='left_only']
end = time.process_time()
print("Processing time: {}".format(end - start))
print("Number of true positives: {}".format(len(true_positives)))
print("Number of false positives: {}".format(len(false_positives)))
print("Number of false negatives: {}".format(len(false_negatives)))
precision = len(true_positives)/(len(true_positives)+ len(false_positives))
print("Precision: {}".format(precision))
recall = len(true_positives)/(len(true_positives)+ len(false_negatives))
print("Recall: {}".format(recall))
f_measure = 2*(precision*recall)/(precision+recall)
print("F measure: {}".format(f_measure))

New matches: 86
Total matches: 389
Processing time: 4.328125
Number of true positives: 247
Number of false positives: 142
Number of false negatives: 1053
Precision: 0.6349614395886889
Recall: 0.19
F measure: 0.29248075784487865


In [21]:
###données microsoft
filtre = "microsoft"
#stopwords_suppl =" software"
company1_light=company1[~company1.id.isin(matches_df.idCompany1)]
company2_light=company2[~company2.id.isin(matches_df.idCompany2)]
company1_light = company1_light[company1_light['full data'].str.contains(filtre)].reset_index(drop=True)
company2_light = company2_light[company2_light['full data'].str.contains(filtre)].reset_index(drop=True)
corpus = pd.concat([company1_light, company2_light],sort=False,ignore_index=True)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.5,sublinear_tf=True,stop_words=[filtre]) #+stopwords_suppl]) #ngram_range=(1),
vectors = vectorizer.fit_transform(corpus['full data'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()

new_number_of_matches = 0
new_matches=[]
start = time.process_time()
for i in range(len(company1_light)):
    try :  
        price1 = float(company1_light.iloc[i,6]) 
    except : 
        price1 = 0
    tokens1name = nltk.word_tokenize(company1_light.iloc[i,7])
    ng1_tokensname = set(nltk.ngrams(tokens1name, n=1))
    for j in range(len(company2_light)):
        try :  
            price2 = float(company2_light.iloc[j,6]) 
        except : 
            price2 = 0
        tokens2name = nltk.word_tokenize(company2_light.iloc[j,7])
        ng2_tokensname = set(nltk.ngrams(tokens2name, n=1))
        jd_ng1_ng2_name = nltk.jaccard_distance(ng1_tokensname, ng2_tokensname)
        if price1* price2 == 0 or max(price1, price2)/min(price1, price2)<2:
            try :
                similarity = np.dot(dense[i],np.transpose(dense[len(company1_light)+j])).item(0)/math.sqrt(np.dot(dense[i],np.transpose(dense[i])).item(0) * np.dot(dense[len(company1_light)+j],np.transpose(dense[len(company1_light)+j])).item(0))
            except : 
                similarity = 0
            if ((similarity > 0.45)):# or name_score<=1) :
                new_number_of_matches = new_number_of_matches +1
                new_matches.append((company1_light.iloc[i,0],company2_light.iloc[j,0]))

print("New matches: {}".format(new_number_of_matches))
number_of_matches= number_of_matches + new_number_of_matches
print("Total matches: {}".format(number_of_matches))
new_matches_df = pd.DataFrame(new_matches)
new_matches_df.columns= ['idCompany1','idCompany2']
matches_df = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)
                
diff_df = pd.merge(ground_truth_matches, matches_df, how='outer', indicator='Exist')
true_positives = diff_df[diff_df.Exist=='both']
false_positives = diff_df[diff_df.Exist=='right_only']
false_negatives = diff_df[diff_df.Exist=='left_only']
end = time.process_time()
print("Processing time: {}".format(end - start))
print("Number of true positives: {}".format(len(true_positives)))
print("Number of false positives: {}".format(len(false_positives)))
print("Number of false negatives: {}".format(len(false_negatives)))
precision = len(true_positives)/(len(true_positives)+ len(false_positives))
print("Precision: {}".format(precision))
recall = len(true_positives)/(len(true_positives)+ len(false_negatives))
print("Recall: {}".format(recall))
f_measure = 2*(precision*recall)/(precision+recall)
print("F measure: {}".format(f_measure))

New matches: 67
Total matches: 456
Processing time: 4.171875
Number of true positives: 278
Number of false positives: 178
Number of false negatives: 1022
Precision: 0.6096491228070176
Recall: 0.21384615384615385
F measure: 0.3166287015945331


In [22]:
company1_light=company1[~company1.id.isin(matches_df.idCompany1)]
company2_light=company2[~company2.id.isin(matches_df.idCompany2)]
corpus = pd.concat([company1_light, company2_light],sort=False,ignore_index=True)

vectorizer = TfidfVectorizer(ngram_range=(1,3), max_df=0.01,sublinear_tf=True)#,stop_words=["software"]) #ngram_range=(1),
vectors = vectorizer.fit_transform(corpus['full data'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()

new_number_of_matches = 0
new_matches=[]
start = time.process_time()
for i in range(len(company1_light)):
#    try :  
    price1 = float(company1_light.iloc[i,6]) 
#    except : 
#        price1 = 0
#    tokens1name = nltk.word_tokenize(company1_light.iloc[i,7])
#    ng1_tokensname = set(nltk.ngrams(tokens1name, n=1))
    for j in range(len(company2_light)):
#        try :  
        price2 = float(company2_light.iloc[j,6]) 
#        except : 
#            price2 = 0
#        tokens2name = nltk.word_tokenize(company2_light.iloc[j,7])
#        ng2_tokensname = set(nltk.ngrams(tokens2name, n=1))
#        jd_ng1_ng2_name = nltk.jaccard_distance(ng1_tokensname, ng2_tokensname)
        if price1* price2 == 0 or max(price1, price2)/min(price1, price2)<2:
            try :
                similarity = np.dot(dense[i],np.transpose(dense[len(company1_light)+j])).item(0)/math.sqrt(np.dot(dense[i],np.transpose(dense[i])).item(0) * np.dot(dense[len(company1_light)+j],np.transpose(dense[len(company1_light)+j])).item(0))
            except : 
                similarity = 0
            if ((similarity > 0.5)):# or name_score<=1) :
                new_number_of_matches = new_number_of_matches +1
                new_matches.append((company1_light.iloc[i,0],company2_light.iloc[j,0]))



print("New matches: {}".format(new_number_of_matches))
number_of_matches= number_of_matches + new_number_of_matches
print("Total matches: {}".format(number_of_matches))
new_matches_df = pd.DataFrame(new_matches)
new_matches_df.columns= ['idCompany1','idCompany2']


matches_df = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)
#matches_df_temp = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)
                
diff_df = pd.merge(ground_truth_matches, matches_df, how='outer', indicator='Exist')
#diff_df = pd.merge(ground_truth_matches, matches_df_temp, how='outer', indicator='Exist')


true_positives = diff_df[diff_df.Exist=='both']
false_positives = diff_df[diff_df.Exist=='right_only']
false_negatives = diff_df[diff_df.Exist=='left_only']
end = time.process_time()
print("Processing time: {}".format(end - start))
print("Number of true positives: {}".format(len(true_positives)))
print("Number of false positives: {}".format(len(false_positives)))
print("Number of false negatives: {}".format(len(false_negatives)))
precision = len(true_positives)/(len(true_positives)+ len(false_positives))
print("Precision: {}".format(precision))
recall = len(true_positives)/(len(true_positives)+ len(false_negatives))
print("Recall: {}".format(recall))
f_measure = 2*(precision*recall)/(precision+recall)
print("F measure: {}".format(f_measure))

New matches: 587
Total matches: 1043
Processing time: 660.5
Number of true positives: 674
Number of false positives: 369
Number of false negatives: 626
Precision: 0.6462128475551294
Recall: 0.5184615384615384
F measure: 0.5753307725138711


In [23]:
company1_light=company1[~company1.id.isin(matches_df.idCompany1)]
company2_light=company2[~company2.id.isin(matches_df.idCompany2)]
corpus = pd.concat([company1_light, company2_light],sort=False,ignore_index=True)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.01,sublinear_tf=True)#,stop_words=["software"]) #ngram_range=(1),
vectors = vectorizer.fit_transform(corpus['full data'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()

new_number_of_matches = 0
new_matches=[]
start = time.process_time()
for i in range(len(company1_light)):
#    try :  
    price1 = float(company1_light.iloc[i,6]) 
#    except : 
#        price1 = 0
#    tokens1name = nltk.word_tokenize(company1_light.iloc[i,7])
#    ng1_tokensname = set(nltk.ngrams(tokens1name, n=1))
    for j in range(len(company2_light)):
#        try :  
        price2 = float(company2_light.iloc[j,6]) 
#        except : 
#            price2 = 0
#        tokens2name = nltk.word_tokenize(company2_light.iloc[j,7])
#        ng2_tokensname = set(nltk.ngrams(tokens2name, n=1))
#        jd_ng1_ng2_name = nltk.jaccard_distance(ng1_tokensname, ng2_tokensname)
        if price1* price2 == 0 or max(price1, price2)/min(price1, price2)<2:
            try :
                similarity = np.dot(dense[i],np.transpose(dense[len(company1_light)+j])).item(0)/math.sqrt(np.dot(dense[i],np.transpose(dense[i])).item(0) * np.dot(dense[len(company1_light)+j],np.transpose(dense[len(company1_light)+j])).item(0))
            except : 
                similarity = 0
            if ((similarity > 0.5)):# or name_score<=1) :
                new_number_of_matches = new_number_of_matches +1
                new_matches.append((company1_light.iloc[i,0],company2_light.iloc[j,0]))



print("New matches: {}".format(new_number_of_matches))
number_of_matches= number_of_matches + new_number_of_matches
print("Total matches: {}".format(number_of_matches))
new_matches_df = pd.DataFrame(new_matches)
new_matches_df.columns= ['idCompany1','idCompany2']


matches_df = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)
###matches_df_temp = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)
                
diff_df = pd.merge(ground_truth_matches, matches_df, how='outer', indicator='Exist')
###diff_df = pd.merge(ground_truth_matches, matches_df_temp, how='outer', indicator='Exist')


true_positives = diff_df[diff_df.Exist=='both']
false_positives = diff_df[diff_df.Exist=='right_only']
false_negatives = diff_df[diff_df.Exist=='left_only']
end = time.process_time()
print("Processing time: {}".format(end - start))
print("Number of true positives: {}".format(len(true_positives)))
print("Number of false positives: {}".format(len(false_positives)))
print("Number of false negatives: {}".format(len(false_negatives)))
precision = len(true_positives)/(len(true_positives)+ len(false_positives))
print("Precision: {}".format(precision))
recall = len(true_positives)/(len(true_positives)+ len(false_negatives))
print("Recall: {}".format(recall))
f_measure = 2*(precision*recall)/(precision+recall)
print("F measure: {}".format(f_measure))

New matches: 163
Total matches: 1206
Processing time: 294.703125
Number of true positives: 786
Number of false positives: 420
Number of false negatives: 514
Precision: 0.6517412935323383
Recall: 0.6046153846153847
F measure: 0.627294493216281


In [28]:
company1_light=company1[~company1.id.isin(matches_df.idCompany1)]
company2_light=company2[~company2.id.isin(matches_df.idCompany2)]
corpus = pd.concat([company1_light, company2_light],sort=False,ignore_index=True)

vectorizer = TfidfVectorizer(ngram_range=(1,1), max_df=0.01,sublinear_tf=True)#,stop_words=["software"]) #ngram_range=(1),
vectors = vectorizer.fit_transform(corpus['full data'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()

new_number_of_matches = 0
new_matches=[]
start = time.process_time()
for i in range(len(company1_light)):
#    try :  
    price1 = float(company1_light.iloc[i,6]) 
#    except : 
#        price1 = 0
#    tokens1name = nltk.word_tokenize(company1_light.iloc[i,7])
#    ng1_tokensname = set(nltk.ngrams(tokens1name, n=1))
    for j in range(len(company2_light)):
#        try :  
        price2 = float(company2_light.iloc[j,6]) 
#        except : 
#            price2 = 0
#        tokens2name = nltk.word_tokenize(company2_light.iloc[j,7])
#        ng2_tokensname = set(nltk.ngrams(tokens2name, n=1))
#        jd_ng1_ng2_name = nltk.jaccard_distance(ng1_tokensname, ng2_tokensname)
        if price1* price2 == 0 or max(price1, price2)/min(price1, price2)<2:
            try :
                similarity = np.dot(dense[i],np.transpose(dense[len(company1_light)+j])).item(0)/math.sqrt(np.dot(dense[i],np.transpose(dense[i])).item(0) * np.dot(dense[len(company1_light)+j],np.transpose(dense[len(company1_light)+j])).item(0))
            except : 
                similarity = 0
            if ((similarity > 0.5)):# or name_score<=1) :
                new_number_of_matches = new_number_of_matches +1
                new_matches.append((company1_light.iloc[i,0],company2_light.iloc[j,0]))



print("New matches: {}".format(new_number_of_matches))
number_of_matches= number_of_matches + new_number_of_matches
print("Total matches: {}".format(number_of_matches))
new_matches_df = pd.DataFrame(new_matches)
new_matches_df.columns= ['idCompany1','idCompany2']


matches_df = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)
###matches_df_temp = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)
                
diff_df = pd.merge(ground_truth_matches, matches_df, how='outer', indicator='Exist')
###diff_df = pd.merge(ground_truth_matches, matches_df_temp, how='outer', indicator='Exist')


true_positives = diff_df[diff_df.Exist=='both']
false_positives = diff_df[diff_df.Exist=='right_only']
false_negatives = diff_df[diff_df.Exist=='left_only']
end = time.process_time()
print("Processing time: {}".format(end - start))
print("Number of true positives: {}".format(len(true_positives)))
print("Number of false positives: {}".format(len(false_positives)))
print("Number of false negatives: {}".format(len(false_negatives)))
precision = len(true_positives)/(len(true_positives)+ len(false_positives))
print("Precision: {}".format(precision))
recall = len(true_positives)/(len(true_positives)+ len(false_negatives))
print("Recall: {}".format(recall))
f_measure = 2*(precision*recall)/(precision+recall)
print("F measure: {}".format(f_measure))

New matches: 297
Total matches: 2313
Processing time: 40.046875
Number of true positives: 931
Number of false positives: 572
Number of false negatives: 369
Precision: 0.6194278110445776
Recall: 0.7161538461538461
F measure: 0.6642882625758117


In [29]:
#company1_light=company1[~company1.id.isin(matches_df.idCompany1)]
#company2_light=company2[~company2.id.isin(matches_df.idCompany2)]
#corpus = pd.concat([company1_light, company2_light],sort=False,ignore_index=True)

In [36]:
#vectorizer = TfidfVectorizer(ngram_range=(1,1), max_df=0.01,sublinear_tf=True)#,stop_words=["software"]) #ngram_range=(1),
#vectors = vectorizer.fit_transform(corpus['full data'])
#feature_names = vectorizer.get_feature_names()
#dense = vectors.todense()

#new_number_of_matches = 0
#new_matches=[]
#start = time.process_time()
#for i in range(len(company1_light)):
##    try :  
#    price1 = float(company1_light.iloc[i,6]) 
##    except : 
##        price1 = 0
##    tokens1name = nltk.word_tokenize(company1_light.iloc[i,7])
##    ng1_tokensname = set(nltk.ngrams(tokens1name, n=1))
#    for j in range(len(company2_light)):
##        try :  
#        price2 = float(company2_light.iloc[j,6]) 
##        except : 
##            price2 = 0
##        tokens2name = nltk.word_tokenize(company2_light.iloc[j,7])
##        ng2_tokensname = set(nltk.ngrams(tokens2name, n=1))
##        jd_ng1_ng2_name = nltk.jaccard_distance(ng1_tokensname, ng2_tokensname)
#        if price1* price2 == 0 or max(price1, price2)/min(price1, price2)<2:
#            try :
#                similarity = np.dot(dense[i],np.transpose(dense[len(company1_light)+j])).item(0)/math.sqrt(np.dot(dense[i],np.transpose(dense[i])).item(0) * np.dot(dense[len(company1_light)+j],np.transpose(dense[len(company1_light)+j])).item(0))
#            except : 
#                similarity = 0
#            if ((similarity > 0.45)):# or name_score<=1) :
#                new_number_of_matches = new_number_of_matches +1
#                new_matches.append((company1_light.iloc[i,0],company2_light.iloc[j,0]))



#print("New matches: {}".format(new_number_of_matches))
#number_of_matches= number_of_matches + new_number_of_matches
#print("Total matches: {}".format(number_of_matches))
#new_matches_df = pd.DataFrame(new_matches)
#new_matches_df.columns= ['idCompany1','idCompany2']


###matches_df = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)
#matches_df_temp = pd.concat([matches_df, new_matches_df],sort=False,ignore_index=True)
                
###diff_df = pd.merge(ground_truth_matches, matches_df, how='outer', indicator='Exist')
#diff_df = pd.merge(ground_truth_matches, matches_df_temp, how='outer', indicator='Exist')


#true_positives = diff_df[diff_df.Exist=='both']
#false_positives = diff_df[diff_df.Exist=='right_only']
#false_negatives = diff_df[diff_df.Exist=='left_only']
#end = time.process_time()
#print("Processing time: {}".format(end - start))
#print("Number of true positives: {}".format(len(true_positives)))
#print("Number of false positives: {}".format(len(false_positives)))
#print("Number of false negatives: {}".format(len(false_negatives)))
#precision = len(true_positives)/(len(true_positives)+ len(false_positives))
#print("Precision: {}".format(precision))
#recall = len(true_positives)/(len(true_positives)+ len(false_negatives))
#print("Recall: {}".format(recall))
#f_measure = 2*(precision*recall)/(precision+recall)
#print("F measure: {}".format(f_measure))

New matches: 50
Total matches: 3212
Processing time: 19.34375
Number of true positives: 947
Number of false positives: 606
Number of false negatives: 353
Precision: 0.6097875080489376
Recall: 0.7284615384615385
F measure: 0.6638626007711181


In [None]:
base_false_negatives =false_negatives.merge(corpus.loc[corpus['Company'] == 'company1']
                                            .drop(['Company','name','manufacturer'], inplace=False, axis=1)
                                            .rename(columns = {'id': 'idCompany1','description': 'descr1',
                                                               'price': 'price1','full data': 'full data1'}
                                                    , inplace = False)
                                            , how='inner', on='idCompany1').merge(corpus.loc[corpus['Company'] == 'company2']
                                                                                  .drop(['Company','name','manufacturer'], inplace=False, axis=1)
                                                                                  .rename(columns = {'id': 'idCompany2', 
                                                                                                     'description': 'descr2', 
                                                                                                     'price': 'price2', 
                                                                                                     'full data': 'full data2'}, inplace = False)
                                                                                  , how='inner', on='idCompany2')

In [None]:
base_false_positives =false_positives.merge(corpus.loc[corpus['Company'] == 'company1']
                                            .drop(['Company','name','manufacturer'], inplace=False, axis=1)
                                            .rename(columns = {'id': 'idCompany1','description': 'descr1',
                                                               'price': 'price1','full data': 'full data1'}
                                                    , inplace = False)
                                            , how='inner', on='idCompany1').merge(corpus.loc[corpus['Company'] == 'company2']
                                                                                  .drop(['Company','name','manufacturer'], inplace=False, axis=1)
                                                                                  .rename(columns = {'id': 'idCompany2', 
                                                                                                     'description': 'descr2', 
                                                                                                     'price': 'price2', 
                                                                                                     'full data': 'full data2'}, inplace = False)
                                                                                  , how='inner', on='idCompany2')

In [None]:
base_false_positives


In [None]:
pd.set_option('display.max_colwidth', -1)
pd.set_option("max_rows", None)
base_false_negatives