# FIND USER'S POSTS

#### Import necessary libraries

In [1]:
# for scraping 
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from bs4 import BeautifulSoup as bs

# misc
import re as re # regex 
import time
import pandas as pd

# accessing the .env file
import os 
from dotenv import load_dotenv # to access the secret keys we've hidden in a separate file 
load_dotenv() # grab values inside env file

True

#### Selenium and LinkedIn Setup

In [2]:
# Read the chromedriver location and the LinkedIn credentials from the
# environment (filled in from the .env file by load_dotenv()).
# Each value is None when the corresponding variable is not set.
PATH, USERNAME, PASSWORD = (
    os.environ.get(key) for key in ("WEBDRIVER_PATH", "LI_USERNAME", "LI_PASS")
)


In [3]:
# Start Chrome and sign in to LinkedIn.
# `driver` is left open on purpose: the scraping cells below reuse it.
LOGIN_URL = "https://www.linkedin.com/uas/login"
PAUSE_SECONDS = 3      # deliberate pause to avoid getting marked as a bot
ELEMENT_TIMEOUT = 15   # max seconds to wait for the login form to render

# Fail before opening a browser if the credentials were never loaded;
# otherwise send_keys(None) fails later with a far less obvious error.
if not USERNAME or not PASSWORD:
    raise RuntimeError(
        "LI_USERNAME and LI_PASS must be set in the environment (.env file)."
    )

# initialize the web driver that controls the browser
ser = Service(PATH)
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ser, options=op)

# open the login page
driver.get(LOGIN_URL)
time.sleep(PAUSE_SECONDS)

# Wait for the form explicitly instead of assuming the fixed pause was
# long enough; find_element alone raises if the page is still loading.
wait = WebDriverWait(driver, ELEMENT_TIMEOUT)
email = wait.until(EC.presence_of_element_located((By.ID, "username")))
email.send_keys(USERNAME)
password = wait.until(EC.presence_of_element_located((By.ID, "password")))
password.send_keys(PASSWORD)
time.sleep(PAUSE_SECONDS)
password.send_keys(Keys.RETURN)  # submit the form

#### Web Scraping Process

In [4]:
# Profiles to scrape (sample LinkedIn accounts).
account_links = [
    "https://www.linkedin.com/in/dkjapan/",
    "https://www.linkedin.com/in/vidsrinivasan/",
]

# Parallel accumulators filled by the scraper: post_texts[i] was written
# by the account named in post_names[i].
post_texts, post_names = [], []

def user_post_scrape(account_link, max_posts=1, scroll_timeout=20):
    """Scrape the text of the most recent posts of one LinkedIn profile.

    Opens the profile's recent-activity "shares" page with the module-level
    Selenium ``driver``, scrolls down so more updates get loaded, parses the
    rendered HTML and appends one entry per collected post to the
    module-level ``post_texts`` and ``post_names`` lists.

    Parameters
    ----------
    account_link : str
        Profile URL such as ``https://www.linkedin.com/in/<handle>/``.
        The trailing slash is optional.
    max_posts : int or None, default 1
        Maximum number of posts to collect; ``None`` collects every post
        found on the loaded page.
    scroll_timeout : float, default 20
        Approximate number of seconds to keep scrolling before giving up.

    Returns
    -------
    None
        Results are delivered through ``post_texts`` / ``post_names``.
    """
    # Normalise the URL so both ".../in/handle" and ".../in/handle/" work,
    # and take the handle from the last path segment instead of relying on
    # a fixed character offset.
    profile_url = account_link.rstrip("/")
    name = profile_url.rsplit("/", 1)[-1]
    time.sleep(10)  # pause between profiles

    driver.get(profile_url + "/detail/recent-activity/shares/")

    # Scroll until the page stops growing or the time budget is spent.
    started = time.time()
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # give newly requested updates time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        if round(time.time() - started) > scroll_timeout:
            break

    linkedin_soup = bs(driver.page_source, "html")
    # NOTE(review): these class names mirror LinkedIn's markup when the
    # notebook was written; confirm they still match the live page.
    containers = linkedin_soup.find_all("div", {"class": "occludable-update ember-view"})
    print("Fetching data from account: " + name)

    collected = 0
    skipped = 0
    for container in containers:
        if max_posts is not None and collected >= max_posts:
            break

        text_box = container.find("div", {"class": "feed-shared-update-v2__commentary"})
        text = text_box.find("span", {"dir": "ltr"}) if text_box is not None else None
        if text is None:
            # An update without text commentary is skipped. The shared
            # browser session is NOT closed here: other accounts still
            # need it, and the caller decides when to quit.
            skipped += 1
            continue

        post_texts.append(text.get_text())
        post_names.append(name)
        collected += 1

    if skipped:
        print("Skipped " + str(skipped) + " update(s) without text for account: " + name)

            

#### Execute scraping

In [5]:
# Scrape every profile in account_links, then close the browser.
# The quit lives in `finally` so Chrome is shut down even when a scrape
# raises part-way through and does not linger as an orphaned process.
try:
    for account_link in account_links:
        user_post_scrape(account_link)
finally:
    driver.quit()

Fetching data from account: dkjapan
Fetching data from account: vidsrinivasan


#### Storing scraped data into specified file format

In [6]:
# Build a table with one row per scraped post.
data = {
    "Usernames": post_names,
    "Posts": post_texts,
}
df = pd.DataFrame(data)

# save to csv
df.to_csv("user_posts.csv", encoding='utf-8', index=False)

# save to excel (optional; ExcelWriter.save() was removed in pandas 2.0,
# so write through DataFrame.to_excel directly)
# df.to_excel("user_posts.xlsx", engine='xlsxwriter', index=False)

# Preview the result; it must be the last expression in the cell to be shown.
df.head(5)