# Setup and imports

In [1]:
# Prerequisite: set up Google client credentials before running this notebook.
# Instructions:
# https://gspread-pandas.readthedocs.io/en/latest/getting_started.html#client-credentials
# Follow the steps in `Client Credentials` until you have the JSON file downloaded.
# Save that JSON as `client_secrets.json` and put it in the `../config/` folder.
# NOTE(review): the folder-creation log further down reports
# `/app/helpers/.google_credentials/client_secrets.json`; confirm which location is
# actually read.

# Install the Google auth / API client libraries and gspread-pandas inside the `/venv`
# virtual environment.
# NOTE(review): the packages are not version-pinned, so a re-run may install different
# releases than the ones shown in the captured output.
!sudo /bin/bash -c "(source /venv/bin/activate; pip install --upgrade google-auth google-auth-httplib2 google-auth-oauthlib google-api-python-client)"
!sudo /bin/bash -c "(source /venv/bin/activate; pip install gspread-pandas)"

Collecting google-auth
  Downloading google_auth-2.23.0-py2.py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 14.0 MB/s eta 0:00:01
[?25hCollecting google-auth-httplib2
  Downloading google_auth_httplib2-0.1.1-py2.py3-none-any.whl (9.3 kB)
Collecting google-auth-oauthlib
  Downloading google_auth_oauthlib-1.1.0-py2.py3-none-any.whl (19 kB)
Collecting google-api-python-client
  Downloading google_api_python_client-2.100.0-py2.py3-none-any.whl (12.2 MB)
[K     |████████████████████████████████| 12.2 MB 16.7 MB/s eta 0:00:01
Collecting httplib2>=0.19.0
  Downloading httplib2-0.22.0-py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 7.2 MB/s  eta 0:00:01
Collecting uritemplate<5,>=3.0.1
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl (10 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5
  Downloading google_api_core-2.11.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 20.6

In [2]:
import logging
import helpers.hdbg as hdbg
import helpers.hio as hio
import linkedin.phantom_api.phantombuster_api as lpphapia
import helpers.hgoogle_file_api as hgofiapi

In [3]:
# Module-level logger used by the cells below to report uploads and deletions.
_LOG = logging.getLogger(__name__)
# Initialize logging through the project helper.
hdbg.init_logger(use_exec_path=True)

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-c875ff4e-029b-43cc-9405-3eaba98b733e.json'
[36mINFO[0m: Saving log to file '/app/linkedin/notebooks/none.log'


# Initialization

In [4]:
# Create the Phantom API wrapper; the cells below use it to list phantoms and to
# download their result CSVs.
phantom = lpphapia.Phantom()

# Inputs

In [5]:
# (INPUT) Set the search name. It is used as the name of the new Google Drive folder
# and as the prefix of every Google Sheet name and result CSV file name below.
# Per the original note, setting it to '' creates the files in your Google Drive root
# folder (NOTE(review): not verifiable from this notebook; confirm against the helper).
search_name = "sn_search5_test"

In [6]:
# (INPUT) Set the parent folder: the new folder is created inside this folder.
# The folder id is the last path component of the folder's Google Drive URL, e.g. in
# https://drive.google.com/drive/u/0/folders/1dQ9e-bNKkXwNvobQyRFbPwgEh1-VSf4R
# the id is "1dQ9e-bNKkXwNvobQyRFbPwgEh1-VSf4R" (the `linkedin_data` folder).
parent_folder_id = "1dQ9e-bNKkXwNvobQyRFbPwgEh1-VSf4R"

In [7]:
# Names of the Google Sheets to create: one per step suffix, each prefixed with
# `search_name`.
gsheets_name = [
    f"{search_name}.{sheet_suffix}"
    for sheet_suffix in (
        "step1.search_export",
        "step2.search_export_filtered",
        "step3.profile_export",
        "step3.search_export_filtered",
    )
]

In [8]:
# List all phantoms with their ids; the ids are needed for the input cell that follows.
phantom.get_all_phantoms()

Unnamed: 0,id,name,scriptId,lastEndMessage,lastEndStatus,queuedContainers,runningContainers
0,4074665361228041,Yiyun Emails Sales Navigator Profile Scraper,11108,,success,0,0
1,2862499141527492,Yiyun Search5 Sales Navigator Search Export,6988,,success,0,0
2,6986446216685907,Yiyun emails LinkedIn Profile Scraper,3112,,success,0,0
3,3593602419926765,Yiyun LinkedIn Profile Scraper,3112,,success,0,0
4,3933308360008191,GP LinkedIn Profile Scraper,3112,,success,0,0


In [9]:
# (INPUT) Set the phantom ids (choose them from the `id` column of the table above).
# In the captured table, this id belongs to "Yiyun Search5 Sales Navigator Search Export".
search_phantom_id = "2862499141527492"
# In the captured table, this id belongs to "Yiyun LinkedIn Profile Scraper".
profile_phantom_id = "3593602419926765"

In [10]:
# Local (temporary) locations of the downloaded result CSVs; both file names are
# prefixed with `search_name`.
result_dir = "../result_csv/"
search_result_csv_path = f"{result_dir}{search_name}_search_result.csv"
profile_result_csv_path = f"{result_dir}{search_name}_profile_result.csv"

# Create the empty Google Drive folder and Google sheets

In [11]:
# Create a Google Drive folder named `search_name` inside the parent folder and keep
# the returned value, which is passed as the target folder id when creating the sheets.
current_folder_id = hgofiapi.create_google_drive_folder(search_name, parent_folder_id)

INFO  /app/helpers/.google_credentials/client_secrets.json
INFO  Created a new Google Drive folder 'sn_search5_test'.
INFO  The new folder id is '1Juy0L6ZdD0uXmY8Dmf6HkICHeAifD2Ep'.


In [12]:
# Create one empty Google Sheet per entry of `gsheets_name` in the newly created folder.
for gsheet_name in gsheets_name:
    hgofiapi.create_empty_google_file(
        gfile_type="sheet",
        gfile_name=gsheet_name,
        gdrive_folder_id=current_folder_id,
        user="",
    )

INFO  Created a new Google sheet 'sn_search5_test.step1.search_export'.
INFO  Created a new Google sheet 'sn_search5_test.step2.search_export_filtered'.
INFO  Created a new Google sheet 'sn_search5_test.step3.profile_export'.
INFO  Created a new Google sheet 'sn_search5_test.step3.search_export_filtered'.


# Download result CSVs to local storage

In [13]:
# Download the result CSV of the search phantom to `search_result_csv_path`.
phantom.download_result_csv_by_phantom_id(search_phantom_id, search_result_csv_path)

INFO  Result CSV URL: https://phantombuster.s3.amazonaws.com/jqWbRHyznhM/pyjlATELCNt5qJDusZToQg/result.csv
INFO  Result CSV saved to ../result_csv/sn_search5_test_search_result.csv


In [14]:
# Download the result CSV of the profile phantom to `profile_result_csv_path`.
phantom.download_result_csv_by_phantom_id(profile_phantom_id, profile_result_csv_path)

INFO  Result CSV URL: https://phantombuster.s3.amazonaws.com/jqWbRHyznhM/EAS5IvYVZQiP2OFtv7KqjQ/result.csv
INFO  Result CSV saved to ../result_csv/sn_search5_test_profile_result.csv


# Upload result CSVs to Google sheets

In [15]:
import gspread_pandas
import pandas as pd

In [16]:
# Load the two downloaded result CSVs into dataframes for the upload step.
search_export_df = pd.read_csv(search_result_csv_path)
profile_export_df = pd.read_csv(profile_result_csv_path)

In [17]:
# Preview the first rows of the search export.
# NOTE(review): the rendered output contains personal data (names, LinkedIn profile
# URLs); clear the output before sharing or committing the notebook.
search_export_df.head()

Unnamed: 0,query,timestamp,error,profileUrl,fullName,firstName,lastName,companyName,title,companyId,...,vmid,linkedInProfileUrl,isPremium,isOpenLink,titleDescription,pastExperienceCompanyName,pastExperienceCompanyUrl,pastExperienceCompanyTitle,pastExperienceDate,pastExperienceDuration
0,Search5 SN,2023-07-19T18:00:39.744Z,No result found,,,,,,,,...,,,,,,,,,,
1,https://www.linkedin.com/sales/search/people?s...,2023-07-19T18:02:26.834Z,,https://www.linkedin.com/sales/lead/ACwAABgNPU...,Dr. Edgar Noumair,Dr. Edgar,Noumair,Talal and Madiha Zein AUB Innovation Park,Mentor,71418447.0,...,ACwAABgNPUkBxBAF3JxK0VPQmHkYRnrxXzAiHjY,https://www.linkedin.com/in/ACwAABgNPUkBxBAF3J...,False,False,,,,,,
2,https://www.linkedin.com/sales/search/people?s...,2023-07-19T18:02:26.834Z,,https://www.linkedin.com/sales/lead/ACwAAAHVj4...,Maureen Cusick Thomas,Maureen,Cusick Thomas,Bethesda Green,Innovation Lab Mentor,9287319.0,...,ACwAAAHVj44Brlumxgp-o7YP0oARKp_kKER4e6k,https://www.linkedin.com/in/ACwAAAHVj44Brlumxg...,True,True,The Innovation Lab Mentor Program is designed ...,,,,,
3,https://www.linkedin.com/sales/search/people?s...,2023-07-19T18:02:26.835Z,,https://www.linkedin.com/sales/lead/ACwAAAE1Yq...,Neil Davis,Neil,Davis,Innovation Works,Business Mentor,11787752.0,...,ACwAAAE1YqQBZoy53iiEXT0Ol-GrNh_0U0dhvqM,https://www.linkedin.com/in/ACwAAAE1YqQBZoy53i...,False,False,Innovation Works aims to reduce Baltimore’s ne...,,,,,
4,https://www.linkedin.com/sales/search/people?s...,2023-07-19T18:02:26.835Z,,https://www.linkedin.com/sales/lead/ACwAAACRHS...,Michael Hess,Michael,Hess,"ABS Advisory Services Group, Inc.",Chairman/CEO,,...,ACwAAACRHSQB6TCtuIxEce2hAb9lV3NM_GzNPc8,https://www.linkedin.com/in/ACwAAACRHSQB6TCtuI...,False,False,"Manage mergers & acquisitions, corporate and r...",,,,,


In [18]:
# Preview the first rows of the profile export.
# NOTE(review): the rendered output contains personal data (including an email
# address); clear the output before sharing or committing the notebook.
profile_export_df.head()

Unnamed: 0,error,baseUrl,timestamp,linkedinProfileUrl,email,linkedinProfile,description,headline,location,imgUrl,...,mail,companyUrl,companyUrl2,schoolUrl2,website,schoolDateRange,schoolDateRange2,birthday,facebookUrl,connectedOn
0,Not a LinkedIn Profile URL,profileUrl,2023-07-19T19:21:54.310Z,,,,,,,,...,,,,,,,,,,
1,,https://www.linkedin.com/sales/lead/ACwAAAHVj4...,2023-07-19T19:22:11.128Z,https://www.linkedin.com/in/maureenbcthomas/,,https://www.linkedin.com/in/maureenbcthomas/,"A visionary strategist, entrepreneur and innov...",Stakeholder Engagement I Innovative Coalition ...,Washington DC-Baltimore Area,https://media.licdn.com/dms/image/C5603AQGVCII...,...,,,,,,,,,,
2,,https://www.linkedin.com/sales/lead/ACwAABgNPU...,2023-07-19T19:22:32.106Z,https://www.linkedin.com/in/edgarnoumair/,ednoumair@hotmail.com,https://www.linkedin.com/in/edgarnoumair/,I Advise & Lead Boards & Organizations on Acti...,Data Scientist / Entrepreneur / Author / Artis...,Washington DC-Baltimore Area,https://media.licdn.com/dms/image/C5603AQFNGxE...,...,ednoumair@hotmail.com,https://www.linkedin.com/company/10522559/,https://www.linkedin.com/company/4794/,https://www.linkedin.com/company/3165/,gnoci.com,,,,,
3,,https://www.linkedin.com/sales/lead/ACwAAAE1Yq...,2023-07-19T19:22:51.105Z,https://www.linkedin.com/in/nrdavis/,,https://www.linkedin.com/in/nrdavis/,,Assisting Maryland's startup companies and ent...,"Towson, Maryland, United States",https://media.licdn.com/dms/image/D5603AQGPI--...,...,,https://www.linkedin.com/company/5337426/,https://www.linkedin.com/company/1734721/,https://www.linkedin.com/company/19308/,,1979 - 1982,1971 - 1975,,,
4,,https://www.linkedin.com/sales/lead/ACwAAACRHS...,2023-07-19T19:23:08.967Z,https://www.linkedin.com/in/discoveringmind/,,https://www.linkedin.com/in/discoveringmind/,I help corporate and real estate teams plan an...,"Finance/M&A Entrepreneur, Director & Advisor. ...","Annapolis, Maryland, United States",https://media.licdn.com/dms/image/C4D03AQF-DLN...,...,,,https://www.linkedin.com/company/59839/,https://www.linkedin.com/company/4477/,,1986 - 1988,Jul 2021 - Aug 2021,,,


In [19]:
def df_to_gsheet(gsheet_name: str, df: pd.DataFrame) -> None:
    """
    Write a dataframe to the Google spreadsheet with the given name.

    The spreadsheet is opened by name through `gspread_pandas.Spread` with
    `create_sheet=True`, using the credentials returned by
    `hgofiapi.get_credentials()`. The dataframe index is not written.

    :param gsheet_name: name of the target Google spreadsheet
    :param df: data to write to the spreadsheet
    """
    # Credentials are fetched on every call, right before opening the spreadsheet.
    spread = gspread_pandas.Spread(
        gsheet_name, create_sheet=True, creds=hgofiapi.get_credentials()
    )
    spread.df_to_sheet(df, index=False)
    _LOG.info("Save to gsheet %s", gsheet_name)

In [20]:
# Upload each export to its sheet; the sheet names must match entries of `gsheets_name`.
for sheet_suffix, export_df in (
    ("step1.search_export", search_export_df),
    ("step3.profile_export", profile_export_df),
):
    df_to_gsheet(f"{search_name}.{sheet_suffix}", export_df)

INFO  Save to gsheet sn_search5_test.step1.search_export
INFO  Save to gsheet sn_search5_test.step3.profile_export


# Delete temp result CSVs

In [21]:
# Remove the local copy of the search result CSV now that it has been uploaded.
hio.delete_file(search_result_csv_path)
_LOG.info("Delete file %s", search_result_csv_path)

INFO  Delete file ../result_csv/sn_search5_test_search_result.csv


In [22]:
# Remove the local copy of the profile result CSV now that it has been uploaded.
hio.delete_file(profile_result_csv_path)
_LOG.info("Delete file %s", profile_result_csv_path)

INFO  Delete file ../result_csv/sn_search5_test_profile_result.csv
