# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

def scrape_word_data(word, timeout=10, delay=1):
    """Fetch the first definition and up to three example sentences for a word.

    The entry page for ``word`` on oxfordlearnersdictionaries.com is downloaded
    and the first ``<li class="sense">`` element is inspected: its
    ``<span class="def">`` supplies the meaning and its first three
    ``<span class="x">`` elements supply the examples.

    Args:
        word: The headword to look up.
        timeout: Seconds to wait for the server before giving up on the request.
        delay: Seconds to sleep after the request, to throttle successive calls.

    Returns:
        A ``(meaning, examples)`` tuple. ``meaning`` is a string and
        ``examples`` is always a list of exactly three strings. Any value that
        could not be obtained (missing element, HTTP error, network failure)
        is the placeholder ``"NOT FOUND"``.
    """
    from urllib.parse import quote

    not_found = "NOT FOUND"
    meaning = not_found
    examples = [not_found, not_found, not_found]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    # Percent-encode the word so characters such as '/', '?' or '#' cannot
    # change the structure of the URL.
    slug = quote(word, safe='')
    url = f'https://www.oxfordlearnersdictionaries.com/definition/english/{slug}'

    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One failed lookup must not abort a long batch; report it and fall
        # through with the placeholders.
        print(f"Warning: could not fetch '{word}': {exc}")
    else:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the parent element that contains the definition
        parent_element = soup.find('li', class_='sense')

        if parent_element:
            # Extract meaning
            meaning_element = parent_element.find('span', class_='def')
            if meaning_element:
                meaning = meaning_element.text

            # Extract example sentences within the parent element; slots
            # beyond the number of examples found keep the placeholder.
            examples_elements = parent_element.find_all('span', class_='x')
            for i, example_element in enumerate(examples_elements[:3]):
                examples[i] = example_element.text
    finally:
        # Throttle after every attempt, successful or not.
        time.sleep(delay)

    return meaning, examples


# Function to parse words from the given text data
def parse_words(text_data):
    """Extract the headwords from raw word-list text.

    Periods and commas are removed, parenthesised notes are discarded, the
    remainder is split on whitespace, and tokens that match one of the labels
    in ``ignore_set`` (such as ``'adj'``, ``'v'`` or ``'B2'``) are dropped.

    Args:
        text_data: The full text of the word list.

    Returns:
        A list of the remaining tokens, in their original order. Duplicates
        are kept.
    """
    text_data = text_data.replace('.', '').replace(',', '')

    # Remove whole parenthesised notes up front. Filtering token by token only
    # catches the first and last word of a note, so the interior words of
    # e.g. "(deal with something)" would otherwise be returned as headwords.
    text_data = re.sub(r'\([^()]*\)', ' ', text_data)

    ignore_set = {'adj', 'v', 'n', 'B2', 'C1', 'prep', 'adv', 'conj', 'B1', 'C2', 'A1', 'A2'}  # Add more if necessary

    # The per-token parenthesis checks still apply to unbalanced fragments
    # that the pattern above could not match.
    return [
        part
        for part in text_data.split()
        if part not in ignore_set
        and not part.startswith('(')
        and not part.endswith(')')
    ]


# Read text data from file
file_name = 'oxfordwords5000.txt'  # Replace with your text file name
# Decode explicitly instead of relying on the platform default encoding.
# NOTE(review): assumes the word list is stored as UTF-8 - adjust if it is not.
with open(file_name, 'r', encoding='utf-8') as file:
    text_data = file.read()

# Parse words
words = parse_words(text_data)
total_words = len(words)

# Rows of [word, meaning, example1, example2, example3] collected so far
word_data = []

# Time tracking
start_time = time.time()

try:
    for index, word in enumerate(words, start=1):
        meaning, examples = scrape_word_data(word)
        word_data.append([word, meaning, *examples])

        # Display progress: extrapolate the total run time from the average
        # time per word so far.
        elapsed_time = time.time() - start_time
        progress_percentage = (index / total_words) * 100
        estimated_total_time = elapsed_time / (index) * total_words
        estimated_time_remaining = estimated_total_time - elapsed_time

        # Convert estimated time remaining to hours, minutes and seconds
        hours_remaining, seconds_remaining = divmod(estimated_time_remaining, 3600)
        minutes_remaining, seconds_remaining = divmod(seconds_remaining, 60)

        print(f"Progress: {progress_percentage:.2f}%, Estimated Time Remaining: {int(hours_remaining):02d}:{int(minutes_remaining):02d}:{int(seconds_remaining):02d}")
finally:
    # Save to CSV even when the loop is interrupted or fails, so the rows
    # gathered during a long run are not lost. The save is skipped only when
    # a non-empty run failed before collecting anything, to avoid replacing
    # an existing output file with an empty one.
    if word_data or not words:
        df = pd.DataFrame(word_data, columns=['Word', 'Meaning', 'Example1', 'Example2', 'Example3'])
        df.to_csv('word_data.csv', index=False)
