In [None]:
from playwright.sync_api import sync_playwright
from playwright_stealth import stealth_sync
import time
import random
import pandas as pd
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Load every account record from the reference CSV.
accounts_df = pd.read_csv('../reference/accounts.csv')

# Keep rows where both status columns are empty and a user agent is recorded.
# NOTE(review): process_accounts() writes 1 into 'email_acc_status' after a
# sign-up, which removes that row from this selection — confirm that is intended.
active_accounts = accounts_df[
    accounts_df['email_acc_status'].isnull()
    & accounts_df['acc_status'].isnull()
    & accounts_df['user_agent'].notnull()
]

# DataFrame.sample() raises an opaque error on an empty frame; fail with a
# message that names the actual problem instead.
if active_accounts.empty:
    raise ValueError(
        "No rows in ../reference/accounts.csv have empty status columns and a user agent"
    )

# Pick one of the remaining rows at random.
random_account = active_accounts.sample(1).iloc[0]

bot_username = random_account['username']
user_agent = random_account['user_agent']
bot_password = random_account['password']

In [None]:
from playwright.sync_api import sync_playwright

def setup(user_agent):
    """Start Playwright and open a visible Chromium page.

    Args:
        user_agent: User-agent string applied to the new browser context.

    Returns:
        A ``(playwright, browser, page)`` tuple. The caller owns all three and
        is responsible for closing the browser and stopping Playwright.

    Raises:
        Exception: Whatever Playwright or the stealth hook raises; the
            Playwright driver is stopped before the error propagates.
    """
    p = sync_playwright().start()
    try:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context(user_agent=user_agent)
        page = context.new_page()
        stealth_sync(page)
    except Exception:
        # Without this the driver started above would be left running.
        p.stop()
        raise
    return p, browser, page

# Open the shared browser session using the user agent of the randomly selected account.
p, browser, page = setup(user_agent)

In [None]:
def process_accounts(page):
    """Submit the Proton Mail sign-up form for every account without an email status.

    Reads ``../reference/accounts.csv``, and for each row whose
    ``email_acc_status`` is null: opens the sign-up page, fills the form with
    the part of ``username`` before ``@`` and the row's ``password``, submits
    it, then blocks on ``input()``. After Enter is pressed the row's
    ``email_acc_status`` is set to 1 and the whole CSV is written back, so
    progress is saved after each account.

    Args:
        page: Playwright page used for navigation and form filling.

    Returns:
        None.
    """
    csv_path = '../reference/accounts.csv'
    signup_url = 'https://account.proton.me/mail/signup'

    accounts_df = pd.read_csv(csv_path)

    for row_label, row in accounts_df.iterrows():
        # Rows that already carry an email status are left untouched.
        if not pd.isnull(row['email_acc_status']):
            continue

        page.goto(signup_url)
        time.sleep(3+3*random.random())

        # Only the local part of the address goes into the '#email' field.
        local_part = row['username'].split('@')[0]
        page.fill('#email', local_part)

        secret = row['password']
        page.fill('#password', secret)
        time.sleep(3+2*random.random())
        page.fill('#repeat-password', secret)
        time.sleep(2*random.random())
        page.click('button[type="submit"]')

        # Hand control to the operator; the row is marked regardless of what
        # happened in the browser in the meantime.
        input("Press Enter to continue to next iteration...")

        accounts_df.loc[row_label, 'email_acc_status'] = 1
        accounts_df.to_csv(csv_path, index=False)

# Runs over every row with an empty 'email_acc_status'; blocks on input() after each form submission.
process_accounts(page)


In [None]:
def create_linkedin_account(page, username, password):
    """Open LinkedIn's join page in a new tab and submit the registration form.

    Args:
        page: An existing Playwright page; the new tab is opened in the
            browser context that owns this page.
        username: Value typed into the ``email-address`` field.
        password: Value typed into the ``password`` field.

    Returns:
        The newly opened tab (a Playwright page), left on whatever page the
        site shows after the form is submitted.
    """
    # Use the context that owns `page` instead of a module-level `context`
    # name, which no cell in the notebook defines.
    new_tab = page.context.new_page()

    # Navigate to LinkedIn's registration page
    new_tab.goto('https://www.linkedin.com/start/join')

    # Fill in the registration form
    new_tab.fill('input[name="email-address"]', username)
    new_tab.fill('input[name="password"]', password)

    # Submit the form
    new_tab.click('button[type="submit"]')

    # Pause for a random whole number of seconds (5-10) so the next page can load
    time.sleep(random.randint(5, 10))

    return new_tab

# The selected account's credentials are stored as bot_username / bot_password;
# plain `username` / `password` are not defined anywhere in the notebook.
new_tab = create_linkedin_account(page, bot_username, bot_password)


In [None]:
def login_linkedin(page, username, password):
    """Sign in to LinkedIn on the given page.

    Opens the login page, types the username, pauses for a random whole number
    of seconds between 5 and 10, types the password and submits the form.

    Args:
        page: Playwright page to drive.
        username: Value typed into the ``session_key`` field.
        password: Value typed into the ``session_password`` field.

    Returns:
        None.
    """
    login_url = 'https://www.linkedin.com/login'
    user_field = 'input[name="session_key"]'
    secret_field = 'input[name="session_password"]'
    submit_button = 'button[type="submit"]'

    page.goto(login_url)
    page.fill(user_field, username)

    pause_seconds = random.randint(5, 10)
    time.sleep(pause_seconds)

    page.fill(secret_field, password)
    page.click(submit_button)

# Sign in on the shared page with the credentials of the randomly selected account.
login_linkedin(page, bot_username, bot_password)

In [None]:
# Profiles that have already been scraped, one per line.
with open('../reference/scraped_profiles.txt', 'r') as file:
    scraped_profiles = file.read().split('\n')

# Candidate profile links, one per line.
with open('../reference/links_to_scrape.txt', 'r') as file:
    raw_links = file.read().splitlines()

# A set makes each "already scraped?" check O(1) instead of a scan of the list.
already_scraped = {entry.strip() for entry in scraped_profiles}

# dict.fromkeys() removes duplicates while keeping first-seen order, so the
# rewritten file is stable from run to run. Blank lines (for example the one
# produced by a trailing newline) are dropped rather than counted as links.
links_to_scrape = [
    link
    for link in dict.fromkeys(line.strip() for line in raw_links)
    if link and link not in already_scraped
]

print(len(links_to_scrape))

# Persist the cleaned list so the next run starts from it.
with open('../reference/links_to_scrape.txt', 'w') as file:
    file.writelines(f"{link}\n" for link in links_to_scrape)

# Same content as the file just written; no need to read it back.
collected_links = list(links_to_scrape)

In [None]:
def scrape_all(page, linkedin_urls, num_to_scrape=None):
    """Visit profile URLs in a fresh browser context and save each page's HTML.

    A new context is created on the browser that owns ``page``, with a
    ``Referer`` header pointing at a randomly numbered search-results page.
    Each URL is opened in a new page of that context; after a random pause of
    10-30 seconds the HTML is written to
    ``../data/<last URL path segment>_page_source.txt``. The loop stops as
    soon as a page title contains "sign in", and that page is not saved.

    Args:
        page: An existing Playwright page; only used to reach its browser.
            It is not navigated or closed.
        linkedin_urls: Sequence of profile URLs to visit, in order.
        num_to_scrape: Maximum number of URLs to visit. ``None`` means all.

    Returns:
        None.

    Raises:
        RuntimeError: If the context owning ``page`` exposes no browser.
    """
    if num_to_scrape is None:
        num_to_scrape = len(linkedin_urls)

    # Reach the browser through the argument instead of a module-level name.
    browser_handle = page.context.browser
    if browser_handle is None:
        raise RuntimeError("The context that owns `page` has no browser to create a new context on")

    # The referrer is a random page from the search results for "data analyst"
    random_page = random.randint(2, 100)
    context = browser_handle.new_context(extra_http_headers={
        'Referer': f'https://www.linkedin.com/search/results/people/?keywords=data%20analyst&page={random_page}&sid=3Ue'
    })

    scrape_page = None
    try:
        scrape_page = context.new_page()

        for url in linkedin_urls[:num_to_scrape]:
            try:
                scrape_page.goto(url)
                print('scraping' + url)
                time.sleep(random.randint(10, 30))

                # Check for the sign-in wall before saving, so its HTML is
                # never stored under a profile's file name.
                if 'sign in' in scrape_page.title().lower():
                    break

                page_source = scrape_page.content()
                # The last path segment of the URL names the output file
                profile_name = url.rstrip('/').split('/')[-1]
                # Explicit UTF-8: the platform default encoding may not be
                # able to represent every character in the page.
                with open(f'../data/{profile_name}_page_source.txt', 'w', encoding='utf-8') as f:
                    f.write(page_source)

            except Exception as e:
                # Best effort: report the failure and move on to the next URL.
                print(f"Error: {e}")
    finally:
        # Runs even on KeyboardInterrupt, so the context is never left open.
        if scrape_page is not None:
            scrape_page.close()
        context.close()



# `linkedin_urls` is not defined anywhere in the notebook; the link list built
# by the preceding cell is `collected_links`.
scrape_all(page, collected_links, num_to_scrape=50)