## Test Recherche Correspondance

In [18]:
from thefuzz import fuzz
import pandas as pd
from pandas import DataFrame
# from fuzzywuzzy import fuzz

In [7]:
# Score a short name against its longer variant with the plain ratio scorer
name = "Kurtis Pykes"
full_name = "Kurtis K D Pykes"

ratio_score = fuzz.ratio(name, full_name)
print(f"Similarity score: {ratio_score}")

Similarity score: 86


In [10]:
# Score the same pair of names, this time with the partial ratio scorer
name = "Kurtis Pykes"
full_name = "Kurtis K D Pykes"

partial_score = fuzz.partial_ratio(name, full_name)
print(f"Similarity score: {partial_score}")

Similarity score: 74


In [11]:
# Compare a full name with a copy whose tokens appear in a different order
full_name = "Kurtis K D Pykes"
full_name_reordered = "Kurtis Pykes K D"

# Token order does not matter for the token sort ratio
token_sort_score = fuzz.token_sort_ratio(full_name_reordered, full_name)
print(f"Token sort ratio similarity score: {token_sort_score}")

# Token order does matter for the partial ratio
reordered_partial_score = fuzz.partial_ratio(full_name, full_name_reordered)
print(f"Partial ratio similarity score: {reordered_partial_score}")

# Simple ratio between `name` (assigned in an earlier cell) and `full_name`.
# NOTE: this comparison does not involve `full_name_reordered`.
simple_score = fuzz.ratio(name, full_name)
print(f"Simple ratio similarity score: {simple_score}")


Token sort ratio similarity score: 100
Partial ratio similarity score: 86
Simple ratio similarity score: 86


In [12]:
# Score a short name against its longer variant with the token set ratio scorer
name = "Kurtis Pykes"
full_name = "Kurtis K D Pykes"

# The label names the scorer actually called (token_set_ratio); it previously
# said "Token sort ratio". Re-run the cell to refresh the recorded output.
print(f"Token set ratio similarity score: {fuzz.token_set_ratio(name, full_name)}")

Token sort ratio similarity score: 100


In [13]:
from thefuzz import process

# Candidate club names to rank against the query "barcelona"
collection = [
    "AFC Barcelona",
    "Barcelona AFC",
    "barcelona fc",
    "afc barcalona",
]

# Print the (candidate, score) pairs returned for the plain ratio scorer
ranked_matches = process.extract("barcelona", collection, scorer=fuzz.ratio)
print(ranked_matches)

[('barcelona fc', 86), ('AFC Barcelona', 82), ('Barcelona AFC', 82), ('afc barcalona', 73)]


In [15]:
import pandas as pd

# Reference table: correctly spelled country names and their populations
dict_one = dict(
    country=["England", "Scotland", "Wales", "United Kingdom", "Northern Ireland"],
    population_in_millions=[55.98, 5.45, 3.14, 67.33, 1.89],
)

# Exported table: the same countries with misspelled names, plus GDP per capita
dict_two = dict(
    country=["Northern Iland", "Wles", "Scotlnd", "Englnd", "United K."],
    GDP_per_capita=[24900, 23882, 37460, 45101, 46510.28],
)

existing_data = pd.DataFrame(data=dict_one)
exported_data = pd.DataFrame(data=dict_two)

In [16]:
# Replace every misspelled country value in the export with its best match
# among the reference country names (partial ratio scorer). This overwrites
# the "country" column of `exported_data` in place.
exported_data["country"] = exported_data["country"].apply(
    lambda misspelled: process.extractOne(
        misspelled,
        existing_data["country"],
        scorer=fuzz.partial_ratio,
    )[0]
)

# With the names aligned, left-join the export onto the reference table
data = existing_data.merge(exported_data, on="country", how="left")
data.head()

Unnamed: 0,country,population_in_millions,GDP_per_capita
0,England,55.98,45101.0
1,Scotland,5.45,37460.0
2,Wales,3.14,23882.0
3,United Kingdom,67.33,46510.28
4,Northern Ireland,1.89,24900.0


In [17]:
# Reference list of client company names. `filter_non_clients` uses it as the
# default value of its `clients_list` argument and lowercases each entry before
# comparing, so the mixed casing of some entries does not matter there.
default_clients_list = [
    'ACR AFMA', 'AGS', 'ALLIANZ AFRICA HOLDING GMBH', 'APBEF CI', 'ARTCI', 'ASSA ABLOY',
    'BANQUE ATLANTIQUE CI', 'BOUCHARD CI', 'BPIFRANCE', 'BUREAU VERITAS', 'CAIMPEX',
    'CATERPILAR', 'CEMOI CHOCOLAT', 'CEMOI-CI', 'CHARGEL', 'CIWA', 'COLAS AFRIQUE SUCCURSALE DE CO',
    'DOLIDOL', 'EDF SAVANT GROUP', 'EDF-CI', 'EXCO', 'FAN MILK', 'FESI-FAYAT GROUP', 'FLEETI',
    'FOUNDEVER', 'FRUINOV CI', 'HAUTES ETUDES COMMERCIALES', 'HOTEL GRIFFON', 'IGP', 'INTERBAT',
    'INTERTEK MINERALS LIMITED', 'JESA INTERNATIONAL SA', 'LOUIS DREYFUS COMPANY CI',
    'M TARGET TELECOM', 'NBCI', 'NGE CONTRACTING', 'NOVARTIS COTE D\'IVOIRE SASU',
    'OBTIMA-BEG INGENERIE', 'P&N COTE D\'IVOIRE', 'PCM', 'PROJEX AFRIQUE DE L\'OUEST', 'PROSUMA',
    'ROCHE CI', 'SATECO', 'SCB', 'SCHNEIDER', 'SEURECA', 'SIR', 'SISAG', 'SLB', 'SMT CI',
    'SO-B-GREEN CI', 'SODIA', 'SPAC EN COTE D\'IVOIRE', 'SPIE BATIGNOLES', 'SUSU CI', 'TECTRA CI',
    'TERMINAL HUILIER DE VRIDI', 'TEVIA ENERGIE', 'TotalEnergies Marketing CI SA', 'TRANSMED',
    'VEOLIA', 'VINCI', 'WAVE CI', 'WEBHELP COTE D\'IVOIRE'
]

In [25]:
def filter_non_clients(dataframe: DataFrame, clients_list: list = default_clients_list, threshold: int = 85) -> DataFrame:
    """Return the rows of ``dataframe`` whose company is not a known client.

    A row is treated as a client, and dropped, when the lowercased value of its
    ``current_company_name`` column reaches ``threshold`` on
    ``fuzz.token_set_ratio`` against at least one lowercased entry of
    ``clients_list``.

    Args:
        dataframe: Frame to filter; must contain a ``current_company_name`` column.
        clients_list: Client names to match against. Defaults to
            ``default_clients_list``.
        threshold: Minimum ``token_set_ratio`` score (inclusive) for a company
            to count as a client.

    Returns:
        The subset of ``dataframe`` rows that did not match any client, with
        the original index labels preserved.

    Notes:
        Values that are not strings (for example NaN or None for a missing
        company name) cannot be matched and are therefore kept as non-clients
        instead of raising ``AttributeError``.
    """
    # Lowercase the client names once instead of once per row and per client.
    normalized_clients = [client.lower() for client in clients_list]

    def is_client(company_name) -> bool:
        # Missing or non-text company names have nothing to compare against.
        if not isinstance(company_name, str):
            return False
        candidate = company_name.lower()
        return any(
            fuzz.token_set_ratio(candidate, client) >= threshold
            for client in normalized_clients
        )

    # Force a boolean mask: on an empty frame `apply` can return an object-dtype
    # Series, which would not be interpreted as a row mask.
    client_mask = dataframe['current_company_name'].apply(is_client).astype(bool)
    return dataframe[~client_mask]

In [26]:
# Sample leads: three of the five companies appear in the client list
data = dict(
    lead_name=[f'Lead {number}' for number in range(1, 6)],
    current_company_name=[
        'ACR AFMA',
        'Example Company',
        'ALLIANZ AFRICA HOLDING GMBH',
        'Another Company',
        'TERMINAL HUILIER DE VRIDI',
    ],
)
df = pd.DataFrame(data=data)

df

Unnamed: 0,lead_name,current_company_name
0,Lead 1,ACR AFMA
1,Lead 2,Example Company
2,Lead 3,ALLIANZ AFRICA HOLDING GMBH
3,Lead 4,Another Company
4,Lead 5,TERMINAL HUILIER DE VRIDI


In [27]:
# Keep only the leads whose company does not match the default client list
filtered_df = filter_non_clients(dataframe=df)

filtered_df

Unnamed: 0,lead_name,current_company_name
1,Lead 2,Example Company
3,Lead 4,Another Company
