### Installing the Required Libraries

In [1]:
# Selenium provides the `webdriver` API used to drive the browser in this notebook.
%pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [4]:
# webdriver_manager is only referenced by the commented-out Chrome setup in the import cell.
%pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting requests (from webdriver_manager)
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting charset-normalizer<4,>=2 (from requests->webdriver_manager)
  Using cached charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl.metadata (34 kB)
Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Using cached requests-2.31.0-py3-none-any.whl (62 kB)
Using cached charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl (100 kB)
Installing collected packages: python-dotenv, charset-normalizer, requests, webdriver_manager
Successfully installed charset-normalizer-3.3.2 python-dotenv-1.0.1 requests-2.31.0 webdriver_manager-4.0.1
Note: you may need to restart the kernel to use updated packages.

___

## Main Script

#### 1. Importing the required libraries

In [5]:
import re

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By

# For Microsoft Edge browser --------------------------------
# Default Edge options; this object is passed to webdriver.Edge() when the driver is created.
option = webdriver.EdgeOptions()

# Chrome Options (alternative setup, kept disabled) --------------------------------
# NOTE(review): Selenium 4 expects the driver path wrapped in a Service object instead of
# being passed positionally — confirm against the installed Selenium version.
# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

- `The cell below displays the Edge options object that the driver will be created with`

In [6]:
# Display the options object's repr to confirm it was created.
option

<selenium.webdriver.edge.options.Options at 0x1b5f21a99a0>

#### 2. Setting up the driver 

In [7]:
# Start an Edge browser session configured with the `option` object.
driver = webdriver.Edge(options=option)

- `The driver.get() method navigates to the page at the given URL`

In [8]:
# Address of the LinkedIn post whose comment section is scraped in the following cells.
URL = "https://www.linkedin.com/posts/jyoti-bhasin-7840991ba_resources-sop-cv-activity-7145768597116641281-lL2O/?utm_source=share&utm_medium=member_desktop"
# Load the post in the browser session controlled by `driver`.
driver.get(URL)

- `This will repeatedly search for the "load more comments" button and click it, stopping once the button is no longer found.`

In [37]:
# Upper bound on clicks so the cell always terminates, even if the button never disappears.
MAX_LOAD_MORE_CLICKS = 10000
LOAD_MORE_BUTTON_CLASS = "comments-comments-list__load-more-comments-button"

# Set the implicit wait once, before the first lookup, so that every search
# (including the very first) gives the page up to 3 seconds to render the button.
driver.implicitly_wait(3)

try:
    for _ in range(MAX_LOAD_MORE_CLICKS):
        button = driver.find_element(by=By.CLASS_NAME, value=LOAD_MORE_BUTTON_CLASS)
        button.click()
except NoSuchElementException:
    # Expected way out of the loop: the lookup timed out because no button is left
    # (presumably every comment has been loaded by then — verify on the page).
    print('The button is not found')
except WebDriverException as e:
    # Any other browser-side failure (e.g. the click being rejected) also ends the
    # loop, but is reported as what it is instead of as a missing button.
    print(f'Stopped loading more comments: {e!r}')


The button is not found


#### 3. Fetching the components from the comments section of the post

Look for the following elements in the webpage:
- Email
- Name 
- Headline
- Number of comments 

*Class names of the different elements:*

- emails: `comments-comment-item-content-body` 

- comments: `social-details-social-counts__comments`

- name: `comments-post-meta__name`

- headline: `comments-post-meta__headline`

In [26]:
# Locate the page elements by class name.
# `comments` is a single element (the comment counter); the other three are lists of matches.
emails = driver.find_elements(By.CLASS_NAME, 'comments-comment-item-content-body')
comments = driver.find_element(By.CLASS_NAME, 'social-details-social-counts__comments')
name = driver.find_elements(By.CLASS_NAME, "comments-post-meta__name")
headline = driver.find_elements(By.CLASS_NAME, 'comments-post-meta__headline')

#### 4. Checking if the data has been correctly fetched

In [27]:
# Spot-check: visible text of the second matched comment body.
emails[1].text

'atharvkumarsingh_bt21b2_13@dtu.ac.in'

In [28]:
# Spot-check: visible text of the comment-counter element.
comments.text

'536 comments'

In [29]:
# Spot-check: keep only the first line of the first name element's text.
name[0].text.partition('\n')[0]

'Krishan Walia'

In [30]:
# Spot-check: visible text of the first headline element.
headline[0].text

'Experienced in ML and AI | Full Stack Developer | Quantum Computing Enthusiast | Writer | Research Scholar | Working on a Utility Patent'

#### 5. Extracting emails from the selenium driver object

In [31]:
# The counter text starts with the number of comments (e.g. '536 comments').
# Thousands separators (e.g. '1,234 comments') are removed first so int() does not
# raise ValueError on larger counts.
count_text = comments.text.split(' ')[0]
no_of_comments = int(count_text.replace(',', ''))
no_of_comments

536

In [32]:
# Email-address pattern, written as a raw string so that `\.` reaches the regex engine
# unchanged instead of triggering Python's invalid-escape-sequence warning.
# Compiled once rather than on every iteration.
EMAIL_PATTERN = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")

# From here on the count is the number of comment bodies actually fetched,
# which replaces the number parsed from the page's counter text.
no_of_comments = len(emails)

# Print every address found in each comment body; an empty list means the comment
# contained no email address.
for comment_body in emails:
    print(EMAIL_PATTERN.findall(comment_body.text))

['krishanw30@gmail.com']
['atharvkumarsingh_bt21b2_13@dtu.ac.in']
['bovascherian@gmail.com']
['aviral.s@iitg.ac.in']
['dhruvkhare148@gmail.com']
['piero.ong11@gmail.com']
['aaryark19@gmail.com']
['11n44sourjeshmukherjee@gmail.com']
['trgtrong17@gmail.com']
['term101112@gmail.com']
['rutajit.dey@gmail.com']
['aaryan2004kumar@gmail.com']
['23cheds.bhat@ug.ictmumbai.edu.in']
['rathod.6@iitj.ac.in']
['muskankadian263@gmail.com']
['themahmed19651@outlook.com']
['roomofambition@gmail.com']
['aanshsamyani123@gmail.com']
['khamesra@wisc.edu']
['anish2023.tubai@gmail.com']
['mrflame5883@gmail.com']
['Ayushman22128@iiitd.ac.in']
['Siddhisharma4549@gmail.com']
['dassoumyadeep26@gmail.com']
[]
['sukriti.bansal17@gmail.com']
['manas.chopra.2022@msit.in']
['muskanag821@gmail.com']
['easwaramoorthysriram43815@gmail.com']
['frahman2330149@bsds.uiu.ac.bd']
['ushsenush@gmail.com']
['styshri10@gmail.com']
['prem.rathod.2715@gmail.com']
[]
['aniketnayak2u@gmail.com']
['722211017013@andhrauniversity.edu.in

#### 6. Appending the extracted data to different lists

In [33]:
nameList = []
emailList = []
headlineList = []

# Raw string so `\.` is passed to the regex engine as written.
email_regex = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")

# NOTE(review): rows are paired purely by position. If the three class-name queries do
# not each return exactly one match per comment, names and emails will drift apart —
# verify against the page before relying on the pairing.
# zip() stops at the shortest of the three lists, so no index can run out of range.
for position, (body_el, name_el, headline_el) in enumerate(zip(emails, name, headline)):
    try:
        # Read every field before appending anything, so a failure on one field
        # can never leave the three lists with different lengths.
        found = email_regex.findall(body_el.text)
        commenter = name_el.text.split("\n")[0]
        tagline = headline_el.text
    except WebDriverException as exc:
        # Reading an element failed in the browser; skip this row but say so.
        print(f"Skipping comment {position}: {exc!r}")
        continue

    if not found:
        # Comment without an email address: nothing to record for it.
        continue

    emailList.append(found[0])
    nameList.append(commenter)
    headlineList.append(tagline)



#### 7. Converting the list to pd.DataFrame 

In [34]:
# Map each output column label to the list that holds its values.
column_labels = ('Name', 'E-mails', 'Headline')
column_values = (nameList, emailList, headlineList)
data = dict(zip(column_labels, column_values))

# Build the table from the column mapping.
df = pd.DataFrame(data)

#### 8. Overview of the DataFrame

In [35]:
# Show the assembled table as the cell's output.
df

Unnamed: 0,Name,E-mails,Headline
0,Krishan Walia,krishanw30@gmail.com,Experienced in ML and AI | Full Stack Develope...
1,Atharv Kumar Singh,atharvkumarsingh_bt21b2_13@dtu.ac.in,Data Analyst | Web Developer | Volunteer Leade...
2,Bovas Abraham,bovascherian@gmail.com,Student at St. Berchmans College
3,Aviral Srivastava,aviral.s@iitg.ac.in,"Student at Indian Institute of Technology, Guw..."
4,Dhruv Khare,dhruvkhare148@gmail.com,CS Major at NSUT'Delhi | Entrepreneurship and ...
...,...,...,...
514,AABID IBRAHIM S,srinjoydev26@gmail.com,--
515,DHRUV PREMANI,mail2umarss@gmail.com,Economics Graduate
516,Ehraz Ashraf,pranjulnigam15@gmail.com,Developing with React
517,Ishita Moitra,rahulkushwaha2506@gmail.com,Student at MIT ADT University


__NOTE__: Kindly do not use this data for anything illegal.

#### 9. Saving the .csv file of the DataFrame

In [36]:
# Write the table to a CSV file at this relative path, opened in write mode.
output_csv = 'INTERNATIONAL_RESEARCH_LIST.csv'
df.to_csv(output_csv, mode='w')