In [None]:
#from fastai.text.all import *

In [None]:
#txt="Hey, hey! It's almost a year since my last blog post. But it's a good thing it was the second to last and not the last. It's been a while since I called myself a beauty blogger (although I never really lived up to it), but it makes me feel a little bit happy to be able to write again about products I could actually recommend. "
#txt="Hey, Hey! Today was a great day!"
#spacy = WordTokenizer()

#toks = first(spacy([txt]))

#print(coll_repr(toks))

In [None]:
#tkn = Tokenizer(spacy)
#print(coll_repr(tkn(txt), 31))

In [None]:
import requests
from bs4 import BeautifulSoup
import time


# Scrape the text of every post linked from the blog's front page and save it
# to 'my_blog_content.txt', with posts separated by a blank line.
blog_url = "https://lienyastyle.blogspot.com/"

# Upper bound (seconds) for each HTTP request. Without a timeout,
# requests.get() can block forever on an unresponsive server.
REQUEST_TIMEOUT = 15

print(f"--- Stage 1: Finding all post links on {blog_url} ---")

try:
    # Fetch the main page to find links to the posts it lists.
    response = requests.get(blog_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # Raises for bad responses (like 404)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Post titles are expected in <h3 class="post-title"> wrapping an <a>.
    # `a[href]` only matches anchors that actually carry an href, so the
    # lookup below cannot raise KeyError on a bare <a> tag.
    post_links = [a['href'] for a in soup.select('h3.post-title a[href]')]

    if not post_links:
        print("\nWARNING: Could not find any post links.")
        print("This might be because the HTML structure of your blog's theme is different.")
        print("Try inspecting your blog's homepage with your browser's developer tools to find the correct CSS selector for post links.")
    else:
        print(f"Found {len(post_links)} post links. Starting Stage 2...")

        # Collect post bodies in a list and join once at the end instead of
        # repeatedly concatenating onto a growing string.
        post_texts = []

        # --- Stage 2: Visiting each link and scraping its content ---
        for i, link in enumerate(post_links):
            try:
                print(f"  ({i+1}/{len(post_links)}) Scraping: {link}")
                post_response = requests.get(link, timeout=REQUEST_TIMEOUT)
                post_response.raise_for_status()
                post_soup = BeautifulSoup(post_response.content, 'html.parser')

                # The post content is looked up by the 'post-body' class.
                content_div = post_soup.select_one('.post-body')

                if content_div:
                    # get_text() flattens the div and its children to text;
                    # separator=' ' keeps words from running together and
                    # strip=True trims whitespace around each fragment.
                    content = content_div.get_text(separator=' ', strip=True)
                    if content:
                        post_texts.append(content)
                    else:
                        # Skip empty bodies so the corpus never consists of
                        # separators only.
                        print(f"    -> Warning: '.post-body' was empty on this page.")
                else:
                    print(f"    -> Warning: Could not find '.post-body' content on this page.")

            except requests.exceptions.RequestException as e:
                print(f"    -> Failed to fetch {link} due to: {e}")
            except Exception as e:
                # Best-effort: one broken page must not abort the whole run.
                print(f"    -> An unexpected error occurred for {link}: {e}")
            finally:
                # Be polite to the server: pause between requests even when
                # the previous one failed.
                time.sleep(1)

        # Each post is followed by two newlines so posts can be split apart
        # again when the file is read back.
        all_text = "".join(post + "\n\n" for post in post_texts)

        # --- Stage 3: Save the final text ---
        if all_text:
            with open("my_blog_content.txt", "w", encoding="utf-8") as f:
                f.write(all_text)
            print("\nScraping complete! All full post content saved to 'my_blog_content.txt'")
        else:
            print("\nScraping finished, but no text was extracted.")

except requests.exceptions.RequestException as e:
    print(f"FATAL ERROR: Could not fetch the main blog URL: {e}")

--- Stage 1: Finding all post links on https://lienyastyle.blogspot.com/ ---
Found 26 post links. Starting Stage 2...
  (1/26) Scraping: https://lienyastyle.blogspot.com/2023/08/life-update.html
  (2/26) Scraping: https://lienyastyle.blogspot.com/2021/01/bezglutena-citronu-tarte-ar-meringu.html
  (3/26) Scraping: https://lienyastyle.blogspot.com/2021/01/kadel-ikvienam-vajadzetu-pastradat-par.html
  (4/26) Scraping: https://lienyastyle.blogspot.com/2018/12/ko-neviens-neteica-pirms-ieiesanas.html
  (5/26) Scraping: https://lienyastyle.blogspot.com/2020/01/zw-menstruala-piltuve-must-have.html
  (6/26) Scraping: https://lienyastyle.blogspot.com/2020/02/visbiezak-lietotie-produkti-gada.html
  (7/26) Scraping: https://lienyastyle.blogspot.com/2019/06/chapter-7-life-update.html
  (8/26) Scraping: https://lienyastyle.blogspot.com/2018/12/visbiezak-lietotie-produkti-gada.html
  (9/26) Scraping: https://lienyastyle.blogspot.com/2018/10/5-iecienitakie-lupu-balzami.html
  (10/26) Scraping: https:/

In [None]:
# Cell 1: Setup and Data Loading

from fastai.text.all import *

# Load all the scraped blog text into a single string and wrap it in a
# one-row DataFrame.
# `text` stays None when the file is missing, so later code can test for it.
text = None
try:
    with open('my_blog_content.txt', 'r', encoding='utf-8') as f:
        text = f.read()
except FileNotFoundError:
    print("ERROR: 'my_blog_content.txt' not found. Please make sure you have run the scraper and the file exists.")
    # You might need to upload the file manually if the scraper failed.
else:
    print(f"Successfully loaded {len(text.split())} words from your blog.")

    # Only build the DataFrame when the text was actually read; previously a
    # missing file fell through to this line and raised NameError on `text`.
    df = pd.DataFrame({'text': [text]})

Successfully loaded 21024 words from your blog.


In [None]:
# Read the scraped corpus back in and split it into individual posts.
try:
    with open('my_blog_content.txt', 'r', encoding='utf-8') as f:
        # Posts are separated by a blank line ('\n\n'); whitespace-only
        # fragments produced by the split are dropped.
        all_posts = [post for post in f.read().split('\n\n') if post.strip()]
    print(f"Successfully loaded {len(all_posts)} posts.")
except FileNotFoundError:
    print("ERROR: 'my_blog_content.txt' not found. Please make sure the file exists.")
    all_posts = []

if not all_posts:
    print("No posts were loaded. Cannot proceed.")
else:
    # One row per post.
    df = pd.DataFrame({'text': all_posts})

    # Hold out the trailing 20% of posts (never fewer than one) for
    # validation, using a fixed tail slice rather than a random split.
    num_valid = max(1, int(len(df) * 0.2))
    print(f"Using {num_valid} posts for validation.")

    # True for the last `num_valid` rows, False for everything before them.
    df['is_valid'] = df.index >= len(df) - num_valid

    print("\nData prepared. Here's a look at the DataFrame:")
    print(df.tail())  # The tail rows are the ones flagged for validation

Successfully loaded 26 posts.
Using 5 posts for validation.

Data prepared. Here's a look at the DataFrame:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       text  \
21  Čau, čau! Sveiciens Jūlijā! Man jau ir tā kā ir ar to tukšo toverīšu krāšanu, negribās lai aizņem brīvo vietu un metu uzreiz ārā. Šoreiz saņēmos un sakrāju, vienīgi vienu lietu gan pus stundu meklēju, man gan bija prāta aptumsums, jo, nu nejau es izmetu... atradās zem gultas! :D P

In [None]:
# Build language-model DataLoaders from the per-post DataFrame.
# The guard checks for the 'is_valid' column as well as for `df` itself: a
# DataFrame without that column (such as the one-row frame built by the
# earlier loading cell) would otherwise pass the check and fail in the splitter.
if 'df' in locals() and 'is_valid' in df.columns:
    # Create the DataBlock "recipe"
    dls = DataBlock(
        blocks=TextBlock.from_df('text', is_lm=True),
        get_x=ColReader('text'),
        splitter=ColSplitter('is_valid')  # Train/valid split comes from the 'is_valid' flag
    ).dataloaders(df, bs=32, seq_len=80)  # Using a smaller batch size for the small dataset

    print("\nDataLoaders created successfully.")
    print("Training set size:", len(dls.train_ds))
    print("Validation set size:", len(dls.valid_ds))
    dls.show_batch(max_n=4)
else:
    print("DataFrame not created. Please run the cell above first.")



DataLoaders created successfully.
Training set size: 21
Validation set size: 5


Unnamed: 0,text,text_
0,"xxbos čau , čau ! xxmaj vasaras sākumā kā visiem , tā arī man šķita , ka tā ies ļoti ilgi un xxup tik daudz xxunk kaut ko izdarīt . xxmaj xxunk , xxunk , dažāda xxunk , xxunk no xxunk tā arī es xxunk xxunk xxunk , bet nekā , xxunk . : d xxmaj xxunk rakstīt par xxunk lietām , jā , skola ir pēc 12 dienām un šajā laika xxunk ir tik daudz kas xxunk , lai","čau , čau ! xxmaj vasaras sākumā kā visiem , tā arī man šķita , ka tā ies ļoti ilgi un xxup tik daudz xxunk kaut ko izdarīt . xxmaj xxunk , xxunk , dažāda xxunk , xxunk no xxunk tā arī es xxunk xxunk xxunk , bet nekā , xxunk . : d xxmaj xxunk rakstīt par xxunk lietām , jā , skola ir pēc 12 dienām un šajā laika xxunk ir tik daudz kas xxunk , lai xxunk"
1,"! xxunk , cik xxunk un xxunk xxunk , bet tajā pat laikā xxunk no manas xxunk . ūdens ir xxunk dienas xxunk , katru dienu xxunk vismaz 2 - xxunk , lai xxunk būtu pietiekami xxunk . xxmaj ir jau daudz xxunk , cik xxunk ir xxunk xxunk xxunk xxunk xxunk , tāpēc vislabāk ir lietot xxunk xxunk , kā piemēram xxunk , my xxunk , vai xxunk ( tā ir xxunk un xxunk ) . xxmaj piemēram ,","xxunk , cik xxunk un xxunk xxunk , bet tajā pat laikā xxunk no manas xxunk . ūdens ir xxunk dienas xxunk , katru dienu xxunk vismaz 2 - xxunk , lai xxunk būtu pietiekami xxunk . xxmaj ir jau daudz xxunk , cik xxunk ir xxunk xxunk xxunk xxunk xxunk , tāpēc vislabāk ir lietot xxunk xxunk , kā piemēram xxunk , my xxunk , vai xxunk ( tā ir xxunk un xxunk ) . xxmaj piemēram , manā"
2,"nosaukums , tomēr xxunk kā man tās patīk saukt … xxmaj gandrīz xxunk xxmaj xxunk ir tas , ka šai xxunk xxunk no vienas xxunk xxunk - viņai mamma bija xxunk bizītes kā 3 . xxunk , un xxunk savās xxunk izdomāju pamēģināt .. xxmaj tātad sākumā mati xxunk 4 . xxunk - uz pusēm un xxunk uz pusēm . xxup tā kā frizūra tiek xxunk tikai xxunk , xxunk , xxunk matus xxunk lai xxunk . xxmaj katrā xxunk",", tomēr xxunk kā man tās patīk saukt … xxmaj gandrīz xxunk xxmaj xxunk ir tas , ka šai xxunk xxunk no vienas xxunk xxunk - viņai mamma bija xxunk bizītes kā 3 . xxunk , un xxunk savās xxunk izdomāju pamēģināt .. xxmaj tātad sākumā mati xxunk 4 . xxunk - uz pusēm un xxunk uz pusēm . xxup tā kā frizūra tiek xxunk tikai xxunk , xxunk , xxunk matus xxunk lai xxunk . xxmaj katrā xxunk xxunk"
3,"xxmaj xxunk xxmaj xxunk xxunk ir tā pati , kas nivea , tikai ar šo xxunk to produktus xxunk xxmaj vācijā . šim ir xxunk , patīkama xxunk , salda smarža un laba xxunk kādreiz xxunk xxunk ) . xxmaj es parasti ar xxunk xxunk xxunk , lieki xxunk xxunk . xxmaj lai gan xxunk , cik tas ir pret visām xxunk xxunk , xxunk problēmu to darīt , ja esmu vienīgā xxunk . xxmaj lūpu xxunk xxunk pie xxunk","xxunk xxmaj xxunk xxunk ir tā pati , kas nivea , tikai ar šo xxunk to produktus xxunk xxmaj vācijā . šim ir xxunk , patīkama xxunk , salda smarža un laba xxunk kādreiz xxunk xxunk ) . xxmaj es parasti ar xxunk xxunk xxunk , lieki xxunk xxunk . xxmaj lai gan xxunk , cik tas ir pret visām xxunk xxunk , xxunk problēmu to darīt , ja esmu vienīgā xxunk . xxmaj lūpu xxunk xxunk pie xxunk xxunk"


In [None]:
# Create a language-model learner on top of the DataLoaders and fine-tune it.
if 'dls' not in locals():
    print("DataLoaders not created. Please run the cells above first.")
else:
    learn = language_model_learner(
        dls,
        AWD_LSTM,
        metrics=[accuracy, Perplexity()],
        wd=0.1,
    )
    learn = learn.to_fp16()

    print("\nLearner created. Starting the fine-tuning process...")
    # NOTE(review): 3e-1 looks like a very high learning rate for fine-tuning,
    # and the recorded run shows valid_loss rising after epoch 1 while
    # train_loss keeps falling. Confirm that both arguments are intended.
    learn.fine_tune(30, 3e-1)



Learner created. Starting the fine-tuning process...


epoch,train_loss,valid_loss,accuracy,perplexity,time
0,5.502203,4.991991,0.180997,147.229279,00:00


epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.863141,4.180919,0.284892,65.425934,00:00
1,4.340847,4.095274,0.310133,60.05579,00:00
2,4.045149,4.17441,0.310375,65.001472,00:00
3,3.830926,4.246478,0.304779,69.85894,00:00
4,3.676864,4.282403,0.30031,72.414246,00:00
5,3.547622,4.423741,0.29352,83.407761,00:00
6,3.432133,4.430027,0.290338,83.933685,00:00
7,3.325464,4.473835,0.295887,87.692337,00:00
8,3.209441,4.60501,0.284961,99.983986,00:00
9,3.09588,4.639285,0.2831,103.470352,00:00


In [None]:
# Cell 4: Generate New Text

# Persist the fine-tuned learner under a fixed name for later use.
learn.save('my_blog_style_model')

print("\n--- Model is ready to generate text! ---")

# Silence FutureWarning messages before calling predict().
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Short seed text and the number of words to generate after it.
prompt = "Man tiešām ļoti patīk"
num_words = 50

# Sample twice from the same prompt: once at a higher temperature and once at
# a lower one.
for heading, temperature in (
    ("- higher temperature (more creative)", 0.9),
    ("\n- lower temperature (more focused)", 0.6),
):
    print(heading)
    print(learn.predict(prompt, n_words=num_words, temperature=temperature))


--- Model is ready to generate text! ---
- higher temperature (more creative)


Man tiešām ļoti patīk joprojām , ka man jau sāka 15 žēl , jo visu laiku meklē patiešām labu balzāmu no mīļākās . No šiem vislabāk man patika Paralēli , jo tā kā man  ▁ patīk šis skropstu produktiem , kā rezultātā kosmētika nonāca ātrāk un vieglāk . Taču nu esmu

- lower temperature (more focused)


Man tiešām ļoti patīk joprojām , ka man patīk matemātika , nedaudz tīra . JĀ , es ikdienā patīkams , un man pašai ir savs dzīves . Taču , lai arī ko es vienmēr ir viena no precēm , kura ir vienkārši lieliska par Jēkabpili , ja arī mani jauka ilgāk


In [None]:
# Keep a handle on the DataLoaders' vocabulary and report how many tokens it holds.
vocab = dls.vocab
vocab_size = len(vocab)
print(f"The vocabulary contains {vocab_size} unique tokens.")

The vocabulary contains 1240 unique tokens.


In [None]:

# Show the first 30 vocabulary entries under a heading line.
print("First 30 tokens in the vocabulary:", vocab[:30], sep="\n")

First 30 tokens in the vocabulary:
['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxrep', 'xxwrep', 'xxup', 'xxmaj', ',', '.', 'un', 'ir', 'es', 'ka', 'ar', 'bet', 'man', '-', 'kā', 'par', 'to', ')', 'no', 'arī', 'jo', 'tā', 'uz', 'tas', 'ja']


In [None]:
# Look up a single vocabulary entry by position. The index lives in one
# variable so the printed message always names the index actually used
# (the message previously said 50 while the lookup used 170).
token_index = 170
word = vocab[token_index]
print(f"The word at index {token_index} is: '{word}'")

The word at index 50 is: 'vietā'
