This notebook looks up email addresses through the DropContact API using each
contact's first name, last name and company name. The input contacts are read
from a Google Sheet, and the results are written back to the same Google Sheet.

In [55]:
# Upgrade google-api-python-client inside the /venv virtualenv (runs through sudo).
# NOTE(review): no version is pinned, so a re-run may install a different release.
!sudo /bin/bash -c "(source /venv/bin/activate; pip install --upgrade google-api-python-client)"

Requirement already up-to-date: google-api-python-client in /venv/lib/python3.9/site-packages (2.108.0)


# Import

In [63]:
import os
import time
from math import ceil
from typing import Iterable, List

import gspread_pandas
import pandas as pd
import requests
from tqdm import tqdm

import helpers.hgoogle_file_api as hgofiapi

# Get data from Google Sheet

In [78]:
# Name of the Google Sheet that holds the input contacts.
gsheet_name = "Search7.AI_VC_in_US_DropContact_Test"
# Fetch credentials through the project helper and open the spreadsheet.
creds = hgofiapi.get_credentials()
spread = gspread_pandas.Spread(gsheet_name, creds=creds)
# Only the first 10 rows are kept; remove the `[:10]` slice to process the whole sheet.
df = spread.sheet_to_df(index=None)[:10]
print(df.shape)
df.head()

(10, 31)


Unnamed: 0,profileUrl,fullName,firstName,lastName,companyName,title,companyId,companyUrl,regularCompanyUrl,summary,...,connectionDegree,profileImageUrl,sharedConnectionsCount,name,vmid,linkedInProfileUrl,isPremium,isOpenLink,query,timestamp
0,https://www.linkedin.com/sales/lead/ACwAAARSUP...,Johanan Ottensooser,Johanan,Ottensooser,Point72 Ventures,Operating Partner - Enterprise,15231926.0,https://www.linkedin.com/sales/company/15231926,https://www.linkedin.com/company/15231926,"Operating Partner at Point72 Ventures, focusin...",...,2nd,https://media.licdn.com/dms/image/C4D03AQFdrLn...,1,Johanan Ottensooser,ACwAAARSUPYBskladFoTfZ9U3JhrK9Oi0EsqA6Y,https://www.linkedin.com/in/ACwAAARSUPYBskladF...,True,True,https://www.linkedin.com/sales/search/people?c...,2023-11-16T00:11:10.158Z
1,https://www.linkedin.com/sales/lead/ACwAACX30V...,Chase Garbers,Chase,Garbers,Swing Capital,Venture Partner,86833445.0,https://www.linkedin.com/sales/company/86833445,https://www.linkedin.com/company/86833445,"Current NFL Quarterback, former Quarterback at...",...,3rd,https://media.licdn.com/dms/image/C5603AQFiLl9...,0,Chase Garbers,ACwAACX30V0B9aRh69OSxN1f5XMVTMssKxTskxY,https://www.linkedin.com/in/ACwAACX30V0B9aRh69...,True,True,https://www.linkedin.com/sales/search/people?c...,2023-11-16T00:11:10.158Z
2,https://www.linkedin.com/sales/lead/ACwAAAAKBd...,Sonali Sambhus,Sonali,Sambhus,Thrrive Ventures,Partner,91330754.0,https://www.linkedin.com/sales/company/91330754,https://www.linkedin.com/company/91330754,"I serve as a trusted advisor to boards, CEOs, ...",...,2nd,https://media.licdn.com/dms/image/D5603AQF-Zhk...,1,Sonali Sambhus,ACwAAAAKBdIB3WDR4C4_tlu_TUpozOz4eRma0C4,https://www.linkedin.com/in/ACwAAAAKBdIB3WDR4C...,False,False,https://www.linkedin.com/sales/search/people?c...,2023-11-16T00:11:10.158Z
3,https://www.linkedin.com/sales/lead/ACwAAAACCH...,David Benham,David,Benham,Mighty Capital,Limited Partner,24777720.0,https://www.linkedin.com/sales/company/24777720,https://www.linkedin.com/company/24777720,Now focusing on helping entrepreneurs succeed ...,...,3rd,https://media.licdn.com/dms/image/C5603AQEECTq...,0,David Benham,ACwAAAACCHcBE3Z0PWESudlInoUYNJSZGWjIQjY,https://www.linkedin.com/in/ACwAAAACCHcBE3Z0PW...,True,True,https://www.linkedin.com/sales/search/people?c...,2023-11-16T00:11:10.158Z
4,https://www.linkedin.com/sales/lead/ACwAAAAIZO...,Russell Deakin,Russell,Deakin,Aceana Group,CIO & Managing Partner,,,,Experience: An alternative asset executive wit...,...,2nd,https://media.licdn.com/dms/image/D4E03AQGy9rM...,5,Russell Deakin,ACwAAAAIZOgBOtFZkgHEfpMvlhF5gGlhCe_IDJw,https://www.linkedin.com/in/ACwAAAAIZOgBOtFZkg...,True,True,https://www.linkedin.com/sales/search/people?c...,2023-11-16T00:11:10.158Z


# Set up

In [65]:
# Batch size is the number of contacts sent to the API per request.
# The batch endpoint can process up to 250 contacts in a single request,
# and each contact record must be smaller than 10 kB.
#
# Every contact submitted to the API costs 1 credit.
batch_size = 10
assert 1 <= batch_size <= 250, "batch_size must be between 1 and 250"
# Column titles for first name, last name and company name in the given GSheet.
first_name_col = "firstName"
last_name_col = "lastName"
company_col = "companyName"
# DropContact API key. It is read from the DROPCONTACT_API_KEY environment
# variable so the secret never has to be saved inside the notebook; when the
# variable is unset this falls back to the previous empty-string placeholder.
api_key = os.environ.get("DROPCONTACT_API_KEY", "")

# DropContact functions

In [79]:
def preprocess_data(
    first_name_list: Iterable[str],
    last_name_list: Iterable[str],
    company_list: Iterable[str],
) -> List:
    """
    Build the contact records that are sent to the DropContact API.

    :param first_name_list: First names, one entry per contact.
    :param last_name_list: Last names, aligned with `first_name_list`.
    :param company_list: Company names, aligned with `first_name_list`.
    :return: One dictionary per contact with the keys "first_name",
        "last_name" and "company"; an empty list (after printing an error)
        when the three inputs do not have the same length.
    """
    # Guard clause: the three inputs must describe the same number of contacts.
    n_first, n_last = len(first_name_list), len(last_name_list)
    if n_first != n_last or n_last != len(company_list):
        print("Error: length of input data must be the same.")
        return []
    # Pair the aligned values up into one record per contact.
    return [
        {"first_name": first, "last_name": last, "company": firm}
        for first, last, firm in zip(
            first_name_list, last_name_list, company_list
        )
    ]


def request_dropcontact(
    batch_data: List[dict], api_key: str, timeout: float = 30.0
) -> dict:
    """
    Send request to DropContact API.

    Posts one batch of contacts to the DropContact batch endpoint and returns
    the decoded JSON body of the response.

    :param batch_data: List of dictionaries, each dictionary contains first name, last name and company name.
    :param api_key: API key of DropContact.
    :param timeout: Seconds to wait for the server before `requests` raises a
        timeout error. Without a timeout the call could block forever.
    :return: A dictionary contains the query result.
    """
    response = requests.post(
        "https://api.dropcontact.io/batch",
        json={
            "data": batch_data,
            "siren": True,
            "language": "en",
        },
        headers={
            "Content-Type": "application/json",
            "X-Access-Token": api_key,
        },
        # Bound the wait so a stalled connection cannot hang the notebook.
        timeout=timeout,
    )
    return response.json()


def generate_result_df(query_results: List[dict]) -> pd.DataFrame:
    """
    Generate dataframe from query result.

    Every result dictionary becomes one row. Fields that are absent from a
    result are left as empty strings, so an empty dictionary produces an
    all-blank row.

    :param query_results: List of query results. The "email" field, when
        present, is expected to be a list of dictionaries that each carry an
        "email" key.
    :return: A dataframe with columns: first name, last name, full name, email, phone, pronoun, job title.
    """
    result_title = [
        "first name",
        "last name",
        "full name",
        "email",
        "phone",
        "pronoun",
        "job title",
    ]
    result_list = []
    for result in query_results:
        # All addresses found for the contact, joined with ";". A missing or
        # None "email" field yields an empty string.
        emails = result.get("email") or []
        email = ";".join(entry["email"] for entry in emails)
        # Convert each phone number to a string BEFORE joining, so that
        # non-string values no longer raise a TypeError. Missing, None and
        # empty numbers are skipped, which also avoids a dangling ";".
        phone = ";".join(
            str(result[key])
            for key in ("phone", "mobile_phone")
            if result.get(key) not in (None, "")
        )
        result_list.append(
            [
                result.get("first_name", ""),
                result.get("last_name", ""),
                result.get("full_name", ""),
                email,
                phone,
                # DropContact reports the pronoun under the "civility" key.
                result.get("civility", ""),
                result.get("job", ""),
            ]
        )
    return pd.DataFrame(data=result_list, columns=result_title)


def send_batch_request(
    data: List[dict],
    api_key: str,
    batch_size: int,
    max_attempts: int = 5,
    poll_interval: float = 11,
) -> List[dict]:
    """
    Send batch request to DropContact API.

    The data is split into batches of `batch_size` contacts. Each batch is
    posted once and its result is then polled up to `max_attempts` times,
    waiting `poll_interval` seconds between polls. A batch that does not
    produce a result contributes one empty dictionary per contact, so the
    returned list always has one entry per input contact.

    :param data: List of dictionaries, each dictionary contains first name, last name and company name.
    :param api_key: API key of DropContact.
    :param batch_size: Batch size.
    :param max_attempts: Maximum number of polls per batch.
    :param poll_interval: Seconds to wait between two polls of the same batch.
    :return: A list of dictionaries, each dictionary contains the query result.
    :raises ValueError: If `batch_size` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")
    # Splitting data into batches.
    batches = [
        data[start : start + batch_size]
        for start in range(0, len(data), batch_size)
    ]
    query_results = []
    # Executing query per batch.
    start_time = time.time()
    for batch_idx, batch_data in enumerate(
        tqdm(batches, desc="Processing batches", ascii=True, ncols=100)
    ):
        batch_start_time = time.time()
        print(f"Starting query batch {batch_idx}.")
        batch_result = []
        # Reported if the batch ends without a result; refined below when the
        # API gives a more specific reason.
        failure_reason = "timeout"
        # Send a search query.
        # This request will cost 1 credit per data length.
        post_response = request_dropcontact(batch_data, api_key)
        query_id = post_response.get("request_id")
        if query_id is None:
            # The submission itself was rejected, so there is nothing to poll.
            failure_reason = str(
                post_response.get("reason", "no request_id in response")
            )
        else:
            print(f"Batch {batch_idx}: Query ID: {query_id}.")
            for attempt in range(max_attempts):
                # Get query result using retrieved ID. This request won't cost any credit.
                get_response = requests.get(
                    f"https://api.dropcontact.io/batch/{query_id}",
                    headers={"X-Access-Token": api_key},
                    # Bound the wait so a stalled connection cannot hang the loop.
                    timeout=30,
                ).json()
                if get_response.get("success"):
                    batch_result = get_response.get("data", [])
                    credits_left = get_response.get("credits_left")
                    print(
                        f"Batch {batch_idx}: Query finished. Credits left: {credits_left}."
                    )
                    if not batch_result:
                        failure_reason = "empty result"
                    break
                reason = get_response.get("reason")
                if get_response.get("error"):
                    print(f"Error detected, reason: {reason}.")
                    failure_reason = str(reason)
                    break
                # Not ready yet: wait before the next poll, but do not sleep
                # after the final attempt.
                if attempt < max_attempts - 1:
                    time.sleep(poll_interval)
        if not batch_result:
            print(f"Batch {batch_idx}: Query failed, reason: {failure_reason}.")
            # One independent placeholder per contact keeps rows aligned with the input.
            batch_result = [{} for _ in batch_data]
        query_results += batch_result
        batch_end_time = time.time()
        print(
            f"Batch {batch_idx} completed in {batch_end_time - batch_start_time:.2f} seconds."
        )
    # Total processing time.
    end_time = time.time()
    print(f"Total processing time: {end_time - start_time:.2f} seconds.")
    return query_results


def get_email_from_dropcontact(
    first_name_list: Iterable[str],
    last_name_list: Iterable[str],
    company_list: Iterable[str],
    api_key: str,
    batch_size: int,
) -> pd.DataFrame:
    """
    Look up contacts in DropContact and return the results as a dataframe.

    Chains the three stages of the lookup: build the contact records, send
    them to the API in batches, and tabulate the query results.

    :param first_name_list: List of first names.
    :param last_name_list: List of last names.
    :param company_list: List of company names.
    :param api_key: API key of DropContact.
    :param batch_size: Number of contacts sent to the API per request.
    :return: A dataframe with columns: first name, last name, full name, email, phone, pronoun, job title.
    """
    return generate_result_df(
        send_batch_request(
            preprocess_data(first_name_list, last_name_list, company_list),
            api_key,
            batch_size,
        )
    )

# Get emails from DropContact

In [80]:
# Run the lookup for the contacts loaded from the Google Sheet.
# NOTE: this sends real requests to DropContact and consumes API credits.
email_df = get_email_from_dropcontact(
    df[first_name_col], df[last_name_col], df[company_col], api_key, batch_size
)

Processing batches:   0%|                                                     | 0/1 [00:00<?, ?it/s]

Starting query batch 0.
Batch 0: Query ID: jrjqymatotutamw.


Processing batches: 100%|#############################################| 1/1 [00:46<00:00, 46.78s/it]

Batch 0: Query finished. Credits left: 3.
Batch 0 completed in 46.78 seconds.
Total processing time: 46.79 seconds.





In [95]:
# Display the lookup results.
email_df

Unnamed: 0,first name,last name,full name,email,phone,pronoun,job title
0,Johanan,Ottensooser,Johanan Ottensooser,johanan.ottensooser@p72.vc,,Mr,
1,Chase,Garbers,Chase Garbers,,,Mr,
2,Sonali,Sambhus,Sonali Sambhus,,,Mrs,Advisory Board Member
3,David,Benham,David Benham,david@mighty.net,,Mr,
4,Russell,Deakin,Russell Deakin,,,Mr,CIO & Managing Partner
5,Jeff,Bell,Jeff Bell,jbell@midoceanpartners.com,'+1 212-497-1407,Mr,
6,Jeff,Clavier,Jeff Clavier,jeff@uncorkcapital.com,'+1 650-688-1801,Mr,
7,Nat,Clarkson,Nat Clarkson,,,Mr,
8,Ollie,Howie,Ollie Howie,ollie@nextgenvp.com,'+20 160909,Mr,
9,Danish,M.,Danish M.,,'+64 7 477 8020,Mr,


# Write email_df to the same Google Sheet

In [93]:
# Fix phone number format.
def prepare_phone_number_for_sheets(phone_number):
    """
    Prefix a phone number with an apostrophe before it is written to the sheet.

    NOTE(review): presumably the apostrophe makes Google Sheets keep values
    such as "+1 212-497-1407" as plain text - confirm.

    The function is idempotent: a value that already starts with an apostrophe
    is returned unchanged, so re-running the cell does not stack prefixes.

    :param phone_number: Phone number string; may be empty.
    :return: The empty string unchanged, otherwise the number with exactly one
        leading apostrophe.
    """
    if phone_number == "" or phone_number.startswith("'"):
        return phone_number
    return "'" + phone_number

# Overwrite the "phone" column in place with the converted values, then display the frame.
email_df['phone'] = email_df['phone'].apply(prepare_phone_number_for_sheets)

email_df

Unnamed: 0,first name,last name,full name,email,phone,pronoun,job title
0,Johanan,Ottensooser,Johanan Ottensooser,johanan.ottensooser@p72.vc,,Mr,
1,Chase,Garbers,Chase Garbers,,,Mr,
2,Sonali,Sambhus,Sonali Sambhus,,,Mrs,Advisory Board Member
3,David,Benham,David Benham,david@mighty.net,,Mr,
4,Russell,Deakin,Russell Deakin,,,Mr,CIO & Managing Partner
5,Jeff,Bell,Jeff Bell,jbell@midoceanpartners.com,'+1 212-497-1407,Mr,
6,Jeff,Clavier,Jeff Clavier,jeff@uncorkcapital.com,'+1 650-688-1801,Mr,
7,Nat,Clarkson,Nat Clarkson,,,Mr,
8,Ollie,Howie,Ollie Howie,ollie@nextgenvp.com,'+20 160909,Mr,
9,Danish,M.,Danish M.,,'+64 7 477 8020,Mr,


In [94]:
def df_to_gsheet(
    gsheet_name: str, df: pd.DataFrame, sheet_name: str = "email"
) -> None:
    """
    Write a dataframe to a worksheet of the given Google Sheet.

    The worksheet is opened with `create_sheet=True`, so it is created when it
    does not exist yet. Authentication uses the notebook-level `creds` object.

    :param gsheet_name: Name of the Google Sheet to write to.
    :param df: Dataframe to write; its index is not written.
    :param sheet_name: Name of the target worksheet (tab).
    """
    target_spread = gspread_pandas.Spread(
        gsheet_name, sheet=sheet_name, create_sheet=True, creds=creds
    )
    target_spread.df_to_sheet(df, index=False)


# Write the lookup results to the Google Sheet.
df_to_gsheet(gsheet_name, email_df)