# Import

In [1]:
# Before running this notebook, please read the instructions here:
# https://gspread-pandas.readthedocs.io/en/latest/getting_started.html#client-credentials
# Follow the steps in `Client Credentials` until you have downloaded the JSON file.
# Save that JSON as `service.json` in the `helpers/.google_credentials` folder, and you are all set.
!sudo /bin/bash -c "(source /venv/bin/activate; pip install --upgrade google-api-python-client)"

Collecting google-api-python-client
  Downloading google_api_python_client-2.106.0-py2.py3-none-any.whl (12.6 MB)
[K     |████████████████████████████████| 12.6 MB 6.1 MB/s eta 0:00:01     |█████████████████████████████▏  | 11.5 MB 6.1 MB/s eta 0:00:01
[?25hCollecting httplib2<1.dev0,>=0.15.0
  Downloading httplib2-0.22.0-py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 3.9 MB/s  eta 0:00:01
[?25hCollecting google-auth-httplib2>=0.1.0
  Downloading google_auth_httplib2-0.1.1-py2.py3-none-any.whl (9.3 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5
  Downloading google_api_core-2.12.0-py3-none-any.whl (121 kB)
[K     |████████████████████████████████| 121 kB 24.8 MB/s eta 0:00:01
[?25hCollecting uritemplate<5,>=3.0.1
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl (10 kB)
Collecting protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0,>=3.19.5
  Downloading protobuf-4.24.4-cp37-a

In [2]:
import logging
import helpers.hdbg as hdbg
import helpers.hio as hio
import linkedin.phantom_api.phantombuster_api as lpphapia
import helpers.hgoogle_file_api as hgofiapi

In [3]:
# Notebook-level logger used by the cells below for progress messages.
_LOG = logging.getLogger(__name__)
# Initialize logging through the project helper.
# NOTE(review): the effect of `use_exec_path=True` is defined in `helpers.hdbg`, not here —
# presumably it derives the log file location from the executing script; confirm.
hdbg.init_logger(use_exec_path=True)

[0m[36mINFO[0m: > cmd='/venv/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-c0dca753-1fa1-4f42-a930-ccdb663a25c0.json'
[36mINFO[0m: Saving log to file '/app/linkedin/notebooks/none.log'


# Initialization

In [4]:
# Client object from the project's Phantombuster API module; used below to list
# the phantoms and to download their result CSVs.
phantom = lpphapia.Phantom()

# Input

In [5]:
# (INPUT) Set the search name; it is also used as the name of the new Google Drive folder
# and as the prefix of the sheet names and local CSV file names.
# Set it to '' to create the files in your Google Drive root folder instead.
search_name = "sn_search5_test"

In [6]:
# (INPUT) Set the parent folder: the new folder is created inside it.
# The folder id is the last path component of the folder's URL, e.g., in
# https://drive.google.com/drive/u/0/folders/1dQ9e-bNKkXwNvobQyRFbPwgEh1-VSf4R
# the id is "1dQ9e-bNKkXwNvobQyRFbPwgEh1-VSf4R" (the `linkedin_data` folder).
parent_folder_id = "1dQ9e-bNKkXwNvobQyRFbPwgEh1-VSf4R"

In [7]:
# Names of the Google Sheets to create for this search, each prefixed with the search name.
# NOTE(review): two entries share the `step3` prefix — confirm whether the last one
# should carry a different step number.
gsheets_name = [
    f"{search_name}.step1.search_export",
    f"{search_name}.step2.search_export_filtered",
    f"{search_name}.step3.profile_export",
    f"{search_name}.step3.search_export_filtered",
]

In [8]:
# List all phantoms together with their ids; the ids for the next input cell
# are picked from the `id` column of this table.
phantom.get_all_phantoms()

Unnamed: 0,id,name,scriptId,lastEndMessage,lastEndStatus,queuedContainers,runningContainers
0,3577001783753488,Search1.Quants_from GP_LIn LinkedIn Profile Sc...,3112,,success,0,0
1,5767607159170031,Search3.Fintech_VC_in_Washington_area.SalesNav...,2350589230697394,,success,0,0
2,5931245031101557,LinkedIn Auto Connect,2818,,success,0,0
3,2654119293639176,Sales Navigator Search Export,6988,,success,0,0
4,5950605628456845,LinkedIn Connections Export,12670,,,0,0


In [9]:
# (INPUT) Set the phantom IDs (choose them from the `id` column of the table above).
# Alternative IDs (commented out):
# search_phantom_id = "2862499141527492"
# profile_phantom_id = "3593602419926765"
# NOTE(review): confirm that each ID points to the intended phantom type
# (search export vs. profile export) — the phantom names listed for these two IDs
# suggest they may be swapped.
search_phantom_id = "3577001783753488"
profile_phantom_id = "5767607159170031"

In [10]:
# Local directory (relative to the notebook's working directory) and file paths
# for the downloaded result CSVs.
# These files are temporary: they are deleted in the last section of the notebook.
result_dir = "../result_csv/"
search_result_csv_path = result_dir + f"{search_name}_search_result.csv"
profile_result_csv_path = result_dir + f"{search_name}_profile_result.csv"

# Create the empty Google Drive folder and Google sheets

In [11]:
# Create a folder named `search_name` inside the parent folder and keep its id,
# which is passed to the sheet-creation cell below.
current_folder_id = hgofiapi.create_google_drive_folder(search_name, parent_folder_id)
# The helper may only log an API error (e.g., an HTTP 404 for an unknown or
# inaccessible parent folder) instead of raising — TODO confirm what it returns
# in that case. Surface a missing id explicitly so that the failure is not
# overlooked before the sheets are created.
if not current_folder_id:
    _LOG.warning(
        "No folder id returned for folder '%s' in parent '%s': "
        "check the parent folder id and the credentials before continuing",
        search_name,
        parent_folder_id,
    )

ERROR <HttpError 404 when requesting https://www.googleapis.com/drive/v3/files?fields=id&alt=json returned "File not found: 1dQ9e-bNKkXwNvobQyRFbPwgEh1-VSf4R.". Details: "[{'message': 'File not found: 1dQ9e-bNKkXwNvobQyRFbPwgEh1-VSf4R.', 'domain': 'global', 'reason': 'notFound', 'location': 'fileId', 'locationType': 'parameter'}]">


In [12]:
# Create one empty Google Sheet per entry of `gsheets_name`, passing the id of
# the folder created in the previous cell as the destination.
for gsheet_name in gsheets_name:
    hgofiapi.create_empty_google_file(
        gfile_type="sheet",
        gfile_name=gsheet_name,
        gdrive_folder_id=current_folder_id,
        user="",
    )

INFO  Created a new Google sheet 'sn_search5_test.step1.search_export'.
INFO  Created a new Google sheet 'sn_search5_test.step2.search_export_filtered'.
INFO  Created a new Google sheet 'sn_search5_test.step3.profile_export'.
INFO  Created a new Google sheet 'sn_search5_test.step3.search_export_filtered'.


# Download result CSVs to local storage

In [13]:
# Download the result CSV of the search phantom to `search_result_csv_path`.
phantom.download_result_csv_by_phantom_id(search_phantom_id, search_result_csv_path)

INFO  Result CSV URL: https://phantombuster.s3.amazonaws.com/jqWbRHyznhM/zaWD2c5JakmaeHfNukpY1g/result.csv
INFO  Result CSV saved to ../result_csv/sn_search5_test_search_result.csv


In [14]:
# Download the result CSV of the profile phantom to `profile_result_csv_path`.
phantom.download_result_csv_by_phantom_id(profile_phantom_id, profile_result_csv_path)

INFO  Result CSV URL: https://phantombuster.s3.amazonaws.com/jqWbRHyznhM/NK9VSKTfi2EMx2eUXcrfTg/chart.csv
INFO  Result CSV saved to ../result_csv/sn_search5_test_profile_result.csv


# Upload result CSVs to Google sheets

In [15]:
import gspread_pandas
import pandas as pd

In [16]:
# Load the two downloaded result CSVs into dataframes for inspection and upload.
search_export_df = pd.read_csv(search_result_csv_path)
profile_export_df = pd.read_csv(profile_result_csv_path)

In [17]:
# Preview the first rows of the search export.
# NOTE(review): the preview contains personal data (e.g., e-mail addresses and
# phone numbers); clear the cell output before sharing or committing the notebook.
search_export_df.head()

Unnamed: 0,error,baseUrl,timestamp,linkedinProfileUrl,email,linkedinProfile,headline,location,imgUrl,firstName,...,websiteFromDropContact,companyWebsite,jobDescription,schoolDescription,phoneNumber,description,schoolDescription2,twitter,twitterProfileUrl,birthday
0,Not a LinkedIn Profile URL,profileUrl,2023-10-31T10:43:32.852Z,,,,,,,,...,,,,,,,,,,
1,,https://www.linkedin.com/in/dsweet99,2023-10-31T10:51:16.966Z,https://www.linkedin.com/in/dsweet99/,,https://www.linkedin.com/in/dsweet99/,Experimenting engineer,"New York, New York, United States",https://media.licdn.com/dms/image/D4E03AQHu11c...,David,...,,,,,,,,,,
2,,https://www.linkedin.com/in/andrey-grinshpun-7...,2023-10-31T10:51:57.753Z,https://www.linkedin.com/in/andrey-grinshpun-7...,agrinshp@gmail.com,https://www.linkedin.com/in/andrey-grinshpun-7...,Math Ph.D. working in Buy-Side Quantitative Fi...,"Austin, Texas, United States",,Andrey,...,www.citadelsecurities.com,https://www.citadelsecurities.com,,,,,,,,
3,,https://www.linkedin.com/in/sophia-x-liu-60177427,2023-10-31T10:52:49.935Z,https://www.linkedin.com/in/sophia-x-liu-60177...,soph.liu@gmail.com,https://www.linkedin.com/in/sophia-x-liu-60177...,Portfolio Manager,"New York, New York, United States",https://media.licdn.com/dms/image/C4D03AQGxrCw...,Sophia X,...,www.point72.com,http://www.point72.com,I trade RV and global macro across major asset...,Concentration: Applied Probability and Statist...,,,,,,
4,,https://www.linkedin.com/in/akhuraskin,2023-10-31T10:53:55.780Z,https://www.linkedin.com/in/akhuraskin/,askhuraskin@gmail.com,https://www.linkedin.com/in/akhuraskin/,"Machine Learning Engineer, Autonomous Vehicles","Palo Alto, California, United States",https://media.licdn.com/dms/image/C4E03AQE3u6b...,Alexey,...,www.woven.toyota,https://woven.toyota/,,,+1(347)268-3613,,,,,


In [18]:
# Preview the first rows of the profile export.
profile_export_df.head()

Unnamed: 0,searchResultsTotal,searchResults,invitationSentTotal,invitationSent,requestAcceptedTotal,requestAccepted,notInvited,notInvitedTotal,timestamp
0,0,0,0,0,0,0,0,0,2023-10-31T18:19:14.992Z
1,30,30,0,0,0,0,0,0,2023-10-31T18:21:42.682Z
2,60,30,5,5,0,0,0,0,2023-10-31T18:57:42.601Z
3,90,30,5,0,0,0,0,0,2023-10-31T19:56:38.972Z
4,90,0,10,5,0,0,0,0,2023-10-31T19:57:41.397Z


In [19]:
def df_to_gsheet(gsheet_name: str, df: pd.DataFrame) -> None:
    """
    Write a dataframe to the Google spreadsheet with the given name.

    The spreadsheet is opened through `gspread_pandas.Spread` using the
    credentials returned by `hgofiapi.get_credentials()`.

    :param gsheet_name: name of the target Google spreadsheet
    :param df: data to upload; it is written with `index=False`
    """
    # Fetch the credentials first, then open the spreadsheet with them.
    spread_kwargs = {
        "create_sheet": True,
        "creds": hgofiapi.get_credentials(),
    }
    spread = gspread_pandas.Spread(gsheet_name, **spread_kwargs)
    spread.df_to_sheet(df, index=False)
    _LOG.info("Save to gsheet %s", gsheet_name)

In [20]:
# Upload the two exports to the spreadsheets whose names match the `step1.search_export`
# and `step3.profile_export` entries of `gsheets_name`.
df_to_gsheet(f"{search_name}.step1.search_export", search_export_df)
df_to_gsheet(f"{search_name}.step3.profile_export", profile_export_df)

INFO  Save to gsheet sn_search5_test.step1.search_export
INFO  Save to gsheet sn_search5_test.step3.profile_export


# Delete temp result CSVs

In [21]:
# Remove the temporary local copy of the search result CSV and log the deletion.
hio.delete_file(search_result_csv_path)
_LOG.info("Delete file %s", search_result_csv_path)

INFO  Delete file ../result_csv/sn_search5_test_search_result.csv


In [22]:
# Remove the temporary local copy of the profile result CSV and log the deletion.
hio.delete_file(profile_result_csv_path)
_LOG.info("Delete file %s", profile_result_csv_path)

INFO  Delete file ../result_csv/sn_search5_test_profile_result.csv
