### Libraries

In [1]:
import pandas as pd
import numpy as np

In [36]:
import os

os.environ["path"] = r"C:\Users\Juan Pablo Pelaez\Documents\PyDev-Operational-Library-For-Data-Engineers"

os.environ["records_folder"] = r'C:\Users\Juan Pablo Pelaez\Documents\Training\Training APIs'
os.environ["df_file_name"] = 'all-contacts.csv'
os.environ["account_id"] = '48749153'
os.environ["add_hubspot_records_view"] = 'Yes'

# It is not necessary to fill in all of the following fields
os.environ["full_name_column_name"] = 'Full Name'
os.environ["phone_number_column_name"] = 'Phone Number' # could be empty ''
os.environ["mobile_phone_number_column_name"] = 'Mobile Phone Number' # could be empty ''
os.environ["street_address_column_name"] = 'Street Address' # could be empty ''
os.environ["list_of_columns"] = ''#'Company Domain Name' # could be empty '' or several properties separed with a ; 'Company Domain;Phone Number'

### Data Team Libraries

In [37]:
path = r"C:\Users\Juan Pablo Pelaez\Documents\PyDev-Operational-Library-For-Data-Engineers"

In [38]:
import sys
sys.path.insert(0,path)

from functions.manage_duplicates.find_duplicate_contacts_by_name_and_phone import find_duplicate_contacts_by_name_and_phone
from functions.manage_duplicates.find_duplicate_contacts_by_email import find_duplicate_contacts_by_email
from functions.manage_duplicates.find_duplicate_contacts_by_specified_fields import find_duplicate_contacts_by_specified_fields
from functions.data_transformations.delete_unnecessary_blank_spaces import delete_unnecessary_blank_spaces

from functions.manage_duplicates.standardize_data.standardize_contacts_full_name import standardize_contacts_full_name

## Input Data Frame

In [39]:
file_path = os.getenv('records_folder')

In [40]:
df = pd.read_csv(file_path+'//'+os.getenv('df_file_name'), low_memory=False)

#### Standarize empty spaces

In [41]:
df = df.replace(np.nan, '')

### If Name is split in two fields

In [42]:
full_name_column = os.getenv('full_name_column_name')

In [43]:
df[full_name_column] = df['First Name'] + ' ' + df['Last Name']

df[full_name_column] = df[full_name_column].apply(delete_unnecessary_blank_spaces)

  df[full_name_column] = df['First Name'] + ' ' + df['Last Name']


In [44]:
df['Standardized Full Name'] = df['Full Name'].apply(standardize_contacts_full_name)

  df['Standardized Full Name'] = df['Full Name'].apply(standardize_contacts_full_name)


## Run Find Duplicates Funcion

In [45]:
frame_phone_column_name = os.getenv('phone_number_column_name')
street_address_column_name = os.getenv('street_address_column_name')
mobile_phone_number_column_name = os.getenv('mobile_phone_number_column_name')

In [46]:
df['original mail']=df['Email']


  df['original mail']=df['Email']


In [47]:
duplicates_frame, no_duplicates_frame = find_duplicate_contacts_by_email(df,'Email')

In [48]:
print('{} Duplicated Records Found'.format(len(duplicates_frame)))

279 Duplicated Records Found


# Add HubSpot Records View and Save Duplicates Frame

In [18]:
def link_to_view_record(record_id):
    
    account_id = os.getenv('account_id') ## Change this according your account
    record_id = str(int(record_id))
    
    record_view = 'https://app.hubspot.com/contacts/{}/record/0-1/{}'.format(account_id, record_id)
    
    return record_view

In [19]:
add_hubspot_records_view = os.getenv('add_hubspot_records_view')

In [20]:
names = [street_address_column_name,frame_phone_column_name]

In [21]:
for dataframe, name in zip([street, phone],names):
    if add_hubspot_records_view == 'Yes':
        dataframe['Record View'] = dataframe['Record ID'].apply(link_to_view_record)
    else:
        pass

    dataframe[['Record ID', 'First Name', 'Last Name', 'Email',
                                         frame_phone_column_name, street_address_column_name, 
                                         'Record View', 'Key']].to_excel(file_path+r'\Duplicate Contact Records Found by {}.xlsx'.format(name))

    print(len(dataframe), 'Duplicated Records Found')

4 Duplicated Records Found
2 Duplicated Records Found


# If few duplicates are found with the previous methods, ask the strategies if the following strategies sound good

### Email domain (Same name and same email domain, mostly for same companies, but might mark as duplicates people with the same name within the same company)

In [36]:
excluded_domains = ['gmail.com','yahoo.com','aol.com','hotmail.com','icloud.com','comcast.net','msn.com','outlook.com']

In [None]:
df['Email domain key'] = df['Standardized Full Name']+df['Email Domain']

In [26]:
df[df.duplicated(subset=['Email Domain','Email domain key'],keep=False)&(df['Standardized Full Name']!='')&(~df['Email Domain'].isin(excluded_domains))&(df['Email Domain']!='')][['Record ID', 'First Name', 'Last Name',frame_phone_column_name, 'Email', street_address_column_name,'Email domain key']].sort_values('Email domain key').to_excel(file_path+'\\Duplicate contacts by name and email domain.xlsx',index=False)

# Without info (This is useful when there's a lot of contacts without information)

### This generates a dataframe whose name is duplicated but have information, at least one of email, street address or phone number

In [29]:
with_info = df[(df.duplicated(subset='Standardized Full Name',keep=False))&(df['Standardized Full Name']!='')&((df['Email']!='')|(df['Street Address']!='')|(df['Phone Number']!=''))]

### This generates a dataframe that don't have information but that could be merged into a contact with information

In [30]:
no_info = df[(df['Email']!='')&(df.duplicated(subset='Standardized Full Name',keep=False))&(df['Street Address']=='')&(df['Phone Number']=='')&((df['Standardized Full Name'].isin(with_info['Standardized Full Name'])))]

In [35]:
no_info[['Record ID', 'First Name', 'Last Name', 'Email',
                                         frame_phone_column_name, 'Email', street_address_column_name, 
                                         'Standardized Full Name']]#.to_excel(file_path+'\\'+'Contacts duplicated by name with but no phone or address.xlsx',index=False)

Unnamed: 0,Record ID,First Name,Last Name,Email,Phone Number,Email.1,Street Address,Standardized Full Name
1309,411527,Charles,Adair,cadair@cxtec.com,,cadair@cxtec.com,,charlesadair
1565,408800,Jason,Adams,sonoftexas70@protonmail.com,,sonoftexas70@protonmail.com,,jasonada
1636,409194,Kim,Adams,kimmadamspa@gmail.com,,kimmadamspa@gmail.com,,kimada
1726,407823,Patricia,Adams,pladams213@gmail.com,,pladams213@gmail.com,,patriciaada
1975,407061,Marie,Adeniyi,marie.adeniyi@gmail.com,,marie.adeniyi@gmail.com,,marieadeniyi
...,...,...,...,...,...,...,...,...
300500,408922,Tulio,Zandomenego,tzando@gmail.com,,tzando@gmail.com,,tuliozandomenego
300501,411035,Tulio,Zandomenego,tzando@verizon.net,,tzando@verizon.net,,tuliozandomenego
301271,407574,Rima,Zigaitis,rimazig1@gmail.com,,rimazig1@gmail.com,,rimazigaitis
301646,410274,Brian,Zlotorzynski,zmazingfish@gmail.com,,zmazingfish@gmail.com,,brianzlotorzynski


In [33]:
with_info[['Record ID', 'First Name', 'Last Name', 'Email',
                                         frame_phone_column_name, 'Email', street_address_column_name, 
                                         'Standardized Full Name']]#.to_excel(file_path+'\\'+'Contacts that could be main contacts for duplicates without information.xlsx',index=False)

Unnamed: 0,Record ID,First Name,Last Name,Email,Phone Number,Email.1,Street Address,Standardized Full Name
202,383304,Barbara,‘Brown,bkbnboulder@gmail.com,9099082910,bkbnboulder@gmail.com,496 Aaron Way,barbarabrown
461,283777,Paul,A. Hill,dhill1228@yahoo.com,4104286800,dhill1228@yahoo.com,10028 Fox Den Ct,paulahill
592,343844,Angie,Abbott,angieabbott13@gmail.com,5732309330,angieabbott13@gmail.com,12110 County Road 4049,angieabbott
593,485796,Angie,Abbott,angiedabbott@gmail.com,+1 (832) 514-5257,angiedabbott@gmail.com,,angieabbott
607,118838,Donna,Abbott,donna@lambertconstructionco.com,4057439677,donna@lambertconstructionco.com,5911 East 15th Avenue,donnaabbott
...,...,...,...,...,...,...,...,...
301594,368216,Ronald,Zirkle,itslocks@gmail.com,8034937800,itslocks@gmail.com,4843 old hickory rd,ronaldzirkle
301645,484458,Brian,Zlotorzynski,brian_zlotorzynski@comcast.net,4104744046.0,brian_zlotorzynski@comcast.net,1450 west mount harmony road,brianzlotorzynski
301646,410274,Brian,Zlotorzynski,zmazingfish@gmail.com,,zmazingfish@gmail.com,,brianzlotorzynski
301797,490327,Sandi,Zorinich,sandi357@hotmail.com,+1 (925) 895-1299,sandi357@hotmail.com,,sandizorinich


In [None]:
df[(df['Email']=='')&(df['Street Address']=='')&(df['Phone Number']=='')]#.to_excel(file_path+'\\Contacts without street address, email or phone number.xlsx',index=False)

In [49]:
duplicates_to_review = duplicates_frame[['Record ID', 'First Name', 'Last Name',
                                         'Phone Number', 'Email', 'original mail',
                                         'Create Date','Last Activity Date',
                                         ]]

In [50]:
duplicates_to_review.to_excel(file_path+r'\Duplicate Contact Records Found.xlsx',index=False)