In [None]:
import getpass
import os
import re as re
import sys
import time

import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#updating local webdriver to PATH and starting driver
# Directory that holds the chromedriver binary. This is machine-specific:
# change it to wherever chromedriver lives on the machine running the notebook.
chromedriver_dir = r"C:/Users/yuvay/Documents/Internships/UniCult/chromedriver-win64"

# Bind the name up front so the error path can tell whether a browser was
# actually started before trying to close it.
driver = None
try:
    # PATH entries are separated by os.pathsep; appending the directory
    # without it would fuse it onto the last existing entry. The membership
    # check keeps repeated runs of this cell from growing PATH.
    current_path = os.environ.get("PATH", "")
    if chromedriver_dir not in current_path.split(os.pathsep):
        os.environ["PATH"] = current_path + os.pathsep + chromedriver_dir
    driver = webdriver.Chrome()
except Exception as exc:
    print("Location not found or chromedriver not present please verify driver path and try again")
    print(f"details: {exc}")
    # Only quit a browser that was really created; otherwise there is nothing
    # to clean up and calling quit() would itself fail.
    if driver is not None:
        driver.quit()
    sys.exit("quitting")

In [None]:
#login details
usr = input("enter linkedin email id:")
# getpass hides the typed password instead of echoing it into the notebook
# output, where it would be saved along with the file.
passw = getpass.getpass("enter linkedin password")
try:
    driver.get("https://www.linkedin.com/uas/login")
    # Let the element lookups below poll for up to 5 seconds.
    driver.implicitly_wait(5)

    username = driver.find_element(By.ID, "username")
    username.send_keys(usr)

    psw = driver.find_element(By.ID, "password")
    psw.send_keys(passw)

    driver.find_element(By.XPATH, "//button[@type='submit']").click()
except Exception as exc:
    # Catch Exception (not a bare except) so Ctrl-C still interrupts, and
    # report the cause so a failed login can be diagnosed.
    print("error logging in")
    print(f"details: {exc}")
    driver.quit()
    sys.exit("quitting")

In [None]:
#scraping code
def scrape_posts(link, post_texts, post_names, num_posts, scroll_timeout=100, scroll_pause=1):
    """Scrape post texts from one LinkedIn profile into the given lists.

    Uses the module-level ``driver``, which must be an open browser session
    (presumably already logged in). The driver is left open so the function
    can be called for several accounts in a row; the caller is responsible
    for quitting it once all accounts have been processed.

    Args:
        link: Profile URL whose last path segment is the account name,
            e.g. ``https://www.linkedin.com/in/<name>/``. A trailing slash
            is optional.
        post_texts: List extended in place with one string per post
            ("" when a post has no description text).
        post_names: List extended in place with the account name, kept
            parallel to ``post_texts``.
        num_posts: Maximum number of posts to take from this account.
        scroll_timeout: Seconds to keep scrolling the activity page.
        scroll_pause: Seconds to wait after each scroll step; increase on
            slower connections.

    Errors during scraping are printed rather than raised, so one failing
    account does not stop the others.
    """
    # Normalise so the name and URL are built the same way with or without
    # a trailing slash (or stray whitespace from a pasted link).
    base_link = link.strip().rstrip('/')
    name = base_link.split('/')[-1]
    print("Fetching data from account:", name)

    try:
        #loading link
        driver.get(base_link + '/detail/recent-activity/shares/')
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "occludable-update")))

        #scroll down step by step so more content gets loaded
        start = time.monotonic()
        scroll_to = 1000
        while True:
            # scrollTo takes (x, y): keep the horizontal position at 0 and
            # move only the vertical offset.
            driver.execute_script(f"window.scrollTo(0, {scroll_to})")
            scroll_to += 1000
            #wait time for data to load increase if internet is slower
            time.sleep(scroll_pause)
            if time.monotonic() - start > scroll_timeout:
                break

        #beautifulsoup to parse the page
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        containers = soup.find_all("div", {"class": "occludable-update"})

        #extracting posts (max() guards against a negative num_posts slicing from the end)
        for container in containers[:max(num_posts, 0)]:
            text_box = container.find("div", {"class": "feed-shared-update-v2__description-wrapper"})
            # Either the wrapper or the inner span may be missing; treat both
            # as "no text" instead of aborting the rest of the account.
            span = text_box.find("span", {"dir": "ltr"}) if text_box is not None else None
            text = span.text.strip() if span is not None else ""
            post_texts.append(text)
            post_names.append(name)

        # Running total across every account scraped into post_texts so far.
        print("posts fetched:", len(post_texts))
    except Exception as e:
        print(f"An error occurred while scraping {link}: {e}")

In [None]:
#displaying posts
n = int(input("enter the number of entries: "))
# Strip whitespace picked up when pasting, since scrape_posts builds the
# account name and the activity URL directly from this string.
post_links = [input("enter the link: ").strip() for _ in range(n)]
post_texts = []
post_names = []

post_count = int(input("enter number of posts per account: "))
try:
    for link in post_links:
        scrape_posts(link, post_texts, post_names, post_count)
finally:
    # Nothing below needs the browser, so close it once every account has
    # been attempted, even if the loop is interrupted part-way.
    driver.quit()

#printing scraped data
for text, name in zip(post_texts, post_names):
    print(f"Name: {name}\nPost: {text}\n\n")

In [None]:
#lsa initialization for data clustering
#initialize regex tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Fail early with a clear message when scraping produced no usable text,
# instead of an opaque vectoriser error further down.
if not any(text.strip() for text in post_texts):
    raise ValueError("no post text available for topic modelling; scrape some posts first")

#vectorize document using TF-IDF
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range = (1,1),
                        tokenizer = tokenizer.tokenize,
                        token_pattern = None)

#fit and transform the documents (rows = posts, columns = terms)
train_data = tfidf.fit_transform(post_texts)

#define the number of topics or components
# Capped by the matrix dimensions: asking for more components than there are
# posts or terms cannot yield extra topics and can make the SVD raise.
num_components = min(3, *train_data.shape)

#create SVD object
lsa = TruncatedSVD(n_components = num_components, n_iter = 100, random_state = 42)

#fit SVD model on data (the transformed matrix itself is not used here)
lsa.fit(train_data)

#getting singular values and components
Sigma = lsa.singular_values_
V_transpose = lsa.components_.T

#displaying identified topics
terms = tfidf.get_feature_names_out()

topic_keywords = []

for index, component in enumerate(lsa.components_):
    # Rank terms by their weight in this component and keep the five
    # highest-weighted ones as the topic's keywords.
    ranked_terms = sorted(zip(terms, component), key = lambda pair: pair[1], reverse=True)
    top_terms_list = [term for term, _ in ranked_terms[:5]]
    topic_keywords.append(top_terms_list)
    print("topic "+str(index)+": ",top_terms_list)

In [None]:
#calculating best fitting topic and displaying it
# Sets give direct membership checks for each topic's keywords.
topic_keyword_sets = [set(topic) for topic in topic_keywords]

cluster_topic = []
for post in post_texts:
    # Tokenise like the TF-IDF step (lowercase, \w+ tokens). Splitting on
    # single spaces would leave case and punctuation attached, so words such
    # as "Python," would never match the lowercase keyword "python".
    post_tokens = re.findall(r'\w+', post.lower())
    # One score per topic: how many of the post's tokens are topic keywords.
    topic_match_score = [
        sum(1 for word in post_tokens if word in keywords)
        for keywords in topic_keyword_sets
    ]
    # Ties (including an all-zero score) resolve to the lowest topic index.
    cluster_topic.append(topic_match_score.index(max(topic_match_score)))

for post_index, topic_index in enumerate(cluster_topic, start=1):
    print(f"post:{post_index}\ntopic {topic_index}: {topic_keywords[topic_index]}")