In [1]:
import requests
from bs4 import BeautifulSoup
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding
import spacy.matcher
from sklearn.model_selection import KFold
import random

# Step 1: Data Collection
def collect_data(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of every ``<p>`` tag on the page, joined with single spaces.

    Raises:
        requests.RequestException: On connection problems, a timeout, or an
            HTTP error status (4xx/5xx).
    """
    # requests applies no timeout by default, so a stalled server would
    # otherwise block the whole pipeline indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping an error page as training text.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return " ".join(p.get_text() for p in soup.find_all("p"))

# Step 2: Data Preprocessing
def preprocess_data(text_data):
    """Return *text_data* lower-cased with punctuation tokens removed.

    The text is tokenized with a blank English spaCy pipeline; every token
    that is not punctuation is lower-cased and the survivors are re-joined
    with single spaces.
    """
    tokenizer = spacy.blank("en")
    kept_tokens = []
    for tok in tokenizer(text_data):
        if tok.is_punct:
            continue
        kept_tokens.append(tok.text.lower())
    return " ".join(kept_tokens)

def annotate_data(nlp, text_data, named_entities):
    """Build a training ``Example`` by phrase-matching known terms in the text.

    Args:
        nlp: spaCy pipeline used to tokenize both the terms and the text.
        text_data: Text to annotate.
        named_entities: Iterable of term strings; matched on the ``LOWER``
            token attribute, i.e. case-insensitively.

    Returns:
        An ``Example`` whose reference entities are the matched spans, all
        labelled ``"TERMINOLOGY"``.
    """
    phrase_matcher = spacy.matcher.PhraseMatcher(nlp.vocab, attr="LOWER")
    phrase_matcher.add(
        "TerminologyList",
        [nlp.make_doc(term) for term in named_entities],
    )

    doc = nlp(text_data)
    candidates = []
    for _match_id, start, end in phrase_matcher(doc):
        candidates.append(doc[start:end])

    # filter_spans resolves overlapping matches so the entity offsets
    # handed to Example.from_dict never overlap.
    entities = []
    for span in spacy.util.filter_spans(candidates):
        entities.append((span.start_char, span.end_char, "TERMINOLOGY"))

    return Example.from_dict(doc, {"entities": entities})

# Model Training
def train_model(annotated_data, iterations):
    """Train a blank English NER pipeline with a single ``TERMINOLOGY`` label.

    Args:
        annotated_data: Iterable of ``spacy.training.Example`` objects.
        iterations: Number of passes over the training data.

    Returns:
        The trained spaCy ``Language`` pipeline.
    """
    nlp = spacy.blank("en")
    # The pipeline is created blank right above, so "ner" is always added
    # here; binding it unconditionally avoids an unbound ``ner`` name.
    ner = nlp.add_pipe("ner")
    ner.add_label("TERMINOLOGY")

    # Work on a private copy so shuffling never reorders the caller's list.
    examples = list(annotated_data)

    # ``initialize`` replaces the deprecated ``begin_training``. Passing the
    # real examples lets components size themselves from actual data; with
    # no data, fall back to spaCy's built-in dummy examples.
    optimizer = nlp.initialize(lambda: examples) if examples else nlp.initialize()

    for itn in range(iterations):
        losses = {}
        # Reshuffle and feed small batches so each pass performs several
        # differently ordered updates instead of one full-batch step.
        random.shuffle(examples)
        for batch in minibatch(examples, size=compounding(4.0, 32.0, 1.001)):
            nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
        # ``.get`` keeps the log line from raising when nothing was updated.
        print(f"Iteration {itn + 1}: Loss = {losses.get('ner', 0.0):.4f}")
    return nlp

# Model Evaluation
def evaluate_model(nlp, test_data):
    """Score *nlp* on *test_data* and return the result of ``nlp.evaluate``."""
    return nlp.evaluate(test_data)

def tag_raw_data(model, raw_data):
    """Run *model* on *raw_data* and list its entities.

    Returns:
        A list of ``(entity text, entity label)`` tuples, one per item in
        the resulting document's ``ents``, in order.
    """
    tagged = []
    for entity in model(raw_data).ents:
        tagged.append((entity.text, entity.label_))
    return tagged

def main():
    """Scrape pages, auto-annotate them, cross-validate an NER model and tag a sample.

    Each training URL is downloaded, preprocessed and annotated by phrase
    matching a fixed term list. The resulting examples are shuffled and
    split with K-fold cross-validation; one model is trained per fold, and
    whenever a fold beats the best F1 seen so far its model is written to
    ``best_model_fold_<n>``. The best model is finally reloaded from disk
    and used to tag a sample sentence.
    """
    nlp = spacy.blank("en")
    training_urls = [
        "https://developers.google.com/machine-learning/crash-course/first-steps-with-tensorflow/toolkit",
        "https://en.wikipedia.org/wiki/TensorFlow",
        "https://www.tensorflow.org/learn",
        "https://en.wikipedia.org/wiki/Google_Brain",
        "https://en.wikipedia.org/wiki/Python_(programming_language)",
        # Add more URLs with relevant content
    ]
    named_entities = ["TensorFlow", "Google", "Python", "Machine Learning"]

    annotated_data = []
    for url in training_urls:
        text_data = collect_data(url)
        preprocessed_data = preprocess_data(text_data)
        annotated_data.append(annotate_data(nlp, preprocessed_data, named_entities))

    # Shuffle and prepare for cross-validation.
    random.shuffle(annotated_data)
    # KFold rejects n_splits larger than the number of samples, so cap it
    # in case the URL list above is shortened.
    kf = KFold(n_splits=min(5, len(annotated_data)))

    best_f1_score = -1
    best_model_path = ""

    for fold, (train_index, test_index) in enumerate(kf.split(annotated_data), start=1):
        train_data = [annotated_data[i] for i in train_index]
        test_data = [annotated_data[i] for i in test_index]

        # Train model on this fold's training data
        model = train_model(train_data, iterations=100)

        # Evaluate model on this fold's test data
        evaluation_results = evaluate_model(model, test_data)
        print(f"Fold {fold} Evaluation Results: {evaluation_results}")

        # The scorer can report ents_f as None when the fold has no
        # entities to score; treat that as 0 so the comparison below
        # cannot raise TypeError.
        fold_f1_score = evaluation_results.get("ents_f") or 0.0

        # Check if this model is the best so far
        if fold_f1_score > best_f1_score:
            best_f1_score = fold_f1_score
            best_model_path = f"best_model_fold_{fold}"
            model.to_disk(best_model_path)

    print(f"Best F1 Score: {best_f1_score}")

    # Load the best model from disk
    best_model = spacy.load(best_model_path)

    # Model Deployment and Tagging with the best model
    # NOTE(review): training text is lower-cased and stripped of punctuation
    # by preprocess_data, but this sample is tagged raw — confirm whether
    # inference input should be preprocessed the same way.
    sample_text = "TensorFlow, developed by Google, is widely used in Python programming for machine learning projects, often requiring GPU acceleration."
    tagged_data = tag_raw_data(best_model, sample_text)
    print("Tagged Data:", tagged_data)

# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


Iteration 1: Loss = 7464.9994
Iteration 2: Loss = 7059.8011
Iteration 3: Loss = 6612.6086
Iteration 4: Loss = 5997.3302
Iteration 5: Loss = 5230.9092
Iteration 6: Loss = 4494.6776
Iteration 7: Loss = 3462.4101
Iteration 8: Loss = 2488.3178
Iteration 9: Loss = 1726.7647
Iteration 10: Loss = 1083.8068
Iteration 11: Loss = 752.4566
Iteration 12: Loss = 709.0855
Iteration 13: Loss = 725.9492
Iteration 14: Loss = 728.5426
Iteration 15: Loss = 728.5735
Iteration 16: Loss = 726.5783
Iteration 17: Loss = 718.6747
Iteration 18: Loss = 713.4544
Iteration 19: Loss = 680.8307
Iteration 20: Loss = 637.0411
Iteration 21: Loss = 633.8739
Iteration 22: Loss = 557.0887
Iteration 23: Loss = 525.3083
Iteration 24: Loss = 543.0165
Iteration 25: Loss = 522.1714
Iteration 26: Loss = 494.0352
Iteration 27: Loss = 458.0407
Iteration 28: Loss = 479.7428
Iteration 29: Loss = 457.8049
Iteration 30: Loss = 458.6211
Iteration 31: Loss = 406.3428
Iteration 32: Loss = 391.3163
Iteration 33: Loss = 325.8084
Iteration