### Import Necessary Dependencies

In [227]:
import pandas as pd
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import requests
from collections import defaultdict
import random

### Set Global Vars

In [179]:
root_url = "https://en.wikipedia.org"
iterations = 1
# With 2 I had the first set of links link to 100,717 articles...
links_set = set()
starting_page = requests.get("https://en.wikipedia.org/wiki/Main_Page")
soup = BeautifulSoup(starting_page.content, 'html.parser')
initial_links = set([a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith("/wiki") and all(["Wikipedia" not in a['href'], "Special" not in a['href'], "Template" not in a['href'], "File" not in a['href'], "Talk" not in a['href'], "Help" not in a['href'], "Portal" not in a['href'], "Main_Page" not in a['href'], "Category" not in a['href']])])
all_texts = []
all_grams = []
n = 3

In [162]:
def extract_text(iteration, new_links):
    new_links_copy = new_links.copy()
    if iteration == 0:
        return
    if iteration == iterations:
        for link in initial_links:   
            # might have to for loop with new_links here
            get_links(link, new_links)
        extract_text(iteration - 1, new_links)
    else:
        for link in new_links_copy:
            get_links(link, set())
        extract_text(iteration - 1, new_links)            

In [163]:
def get_links(link, new_links):
    print(f"extracting {link}")
    page = requests.get(f"{root_url}{link}")
    soup = BeautifulSoup(page.content, 'html.parser')
    for p in soup.find_all('p'):
        text = p.get_text(strip=True)
        if text and text != '\n' and len(text.split()) > 5:
            if text.startswith("This is an accepted version"):
                continue
            all_texts.append(text)
    links_set.add(link)
    count = 0
    first_p = soup.find('p')
    if first_p:
        for a in first_p.find_all('a', href=True):
            new_link = a['href']
            if new_link.startswith("/wiki") and all(x not in new_link for x in ["Wikipedia", "Special", "Template", "File", "Talk", "Help", "Portal", "Main_Page", "Category"]):
                if new_link not in links_set:
                    new_links.add(new_link)
                    count += 1
    print(f"Added {count} links")

In [None]:
extract_text(iterations, set())

2
extracting /wiki/Gustav_Holst
Added 403 links
extracting /wiki/JJ_(singer)
Added 379 links
extracting /wiki/Holy_Roman_Emperor
Added 635 links
extracting /wiki/Treaty_of_Campo_Formio
Added 576 links
extracting /wiki/Ng_Sui
Added 69 links
extracting /wiki/George_Floyd
Added 847 links
extracting /wiki/Dunkirk_evacuation
Added 505 links
extracting /wiki/George_Band
Added 103 links
extracting /wiki/M23_campaign_(2022%E2%80%93present)
Added 1493 links
extracting /wiki/Griffith_Park_Zoo
Added 214 links
extracting /wiki/Cillian_Murphy
Added 856 links
extracting /wiki/Belgian_Resistance
Added 259 links
extracting /wiki/Kangchenjunga
Added 812 links
extracting /wiki/President_of_Romania
Added 682 links
extracting /wiki/Heart_Lamp:_Selected_Stories
Added 29 links
extracting /wiki/Frank_Schofield
Added 86 links
extracting /wiki/Banu_Mushtaq
Added 36 links
extracting /wiki/Army_of_Italy_(France)
Added 67 links
extracting /wiki/Anna_Maria_R%C3%BCckersch%C3%B6ld
Added 25 links
extracting /wiki/Rus

In [166]:
len(all_texts)

290879

In [167]:
len(set(all_texts))

261820

In [174]:
from collections import Counter
for paragraph, count in Counter(all_texts).most_common(10):
    print(f"{count}x: {paragraph[:80]}...")

85x: ThisPas-de-Calaisgeographical article is astub. You can help Wikipedia byexpandi...
63x: ThisArras arrondissement, Pas-de-Calaisgeographical article is astub. You can he...
50x: Note: Flags indicate national team as defined underFIFA eligibility rules; some ...
24x: ThisArrondissement of Lillegeographical article is astub. You can help Wikipedia...
22x: ThisArrondissement of Avesnes-sur-Helpegeographical article is astub. You can he...
18x: This article about a place inBuenos Aires Province,Argentinais astub. You can he...
17x: Aircraft of comparable role, configuration, and era...
16x: ThisArrondissement of Cambraigeographical article is astub. You can help Wikiped...
16x: Annual Goals Progress onSafety & IntegritySee also blogs:Global Advocacy blog·Gl...
15x: ThisArrondissement of Dunkirkgeographical article is astub. You can help Wikiped...


In [255]:
model = defaultdict(lambda: defaultdict(lambda: 0))

for l in all_texts['text']:
    tokens = word_tokenize(l)
    for gram in ngrams(tokens, n):
        context = gram[:-1]
        next_word = gram[-1]
        model[context][next_word] += 1

for context in model:
    total = sum(model[context].values())
    for next_word in model[context]:
        model[context][next_word] /= total


In [None]:
def generate_text(seed, max_words=100):
    if len(seed) != n - 1:
        raise ValueError(f"Seed must be of length {n-1} for an n-gram model with n={n}.")

    output = list(seed)
    current_context = tuple(seed)

    for _ in range(max_words):
        next_words = model.get(current_context, None)
        if not next_words:
            break

        words, probs = zip(*next_words.items())
        next_word = random.choices(words, probs)[0]
        output.append(next_word)

        current_context = (*current_context[1:], next_word)

    return ' '.join(output)


In [254]:
generate_text(["armies", "in", "the"], 100)

'armies in the'

In [240]:
generate_text(["the", "thing", "is"], 100)

"the thing named by the conservative-ledConstitutional Council . [ 137 ] Charles 's daughter , `` I ’ m Dan Patrick Show , which required allegiance to theFederal Reserve Board chairmanAlan Greenspan , a steady partner . [ 140 ] Morton La Kretz Bridge has remained controversial to this date , the Design and Development Promotion Law . [ 80 ] The town grew into a castle . [ 2 ] [ 24 ] Hyundai planned to be covered with putrefying bed sores , but what constitutes courtesy , and other parts of India . [ 218 ] most notably playing Doctor"

In [248]:
generate_text(["it", "can", "be", ""], 100)

'it can be said'

In [234]:
import csv

def save_model_to_csv(model, filename=f'{n}gram_model.csv'):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['context', 'next_word', 'probability'])

        for context, next_words in model.items():
            context_str = ' '.join(context)
            for next_word, prob in next_words.items():
                writer.writerow([context_str, next_word, prob])
                
save_model_to_csv(model)

In [None]:
def load_model_from_csv(filename=f'{n}gram_model.csv'):
    model = defaultdict(lambda: defaultdict(float))
    with open(filename, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            context = tuple(row['context'].split())
            next_word = row['next_word']
            prob = float(row['probability'])
            model[context][next_word] = prob
    return model
model = load_model_from_csv()