In [None]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

In [None]:
from fastai.text.all import *

In [None]:
# Apple M-series Metal Performance Shaders (MPS) backend:
# first report whether this PyTorch build includes it, then whether it is usable right now.
for mps_check in (torch.backends.mps.is_built, torch.backends.mps.is_available):
    print(mps_check())

# Device that fastai picks by default; printed so it can be checked against the MPS report above.
mps_device = default_device()
print(mps_device)

In [None]:
# Fetch the IMDB dataset through fastai's untar_data; `path` is the dataset root
# (the cells below expect it to contain train/, test/ and unsup/).
path = untar_data(URLs.IMDB) # https://docs.fast.ai/data.external.html

In [None]:
# A notebook cell only renders the value of its LAST expression, so a bare
# `path.ls()` on the first line would be computed and then thrown away.
# Display both listings explicitly: the dataset root and its train/ folder.
display(path.ls(), (path/'train').ls())

<img src="imdb_finderview.png" alt="IMDB dataset on disk" width="200"/>
<img src="imdb_observationexample.png" alt="IMDB dataset example" height="200"/>

In [None]:
import shutil
#from pathlib import Path

def create_subset(src, dest, num_samples=256):
    """Copy up to ``num_samples`` regular files from ``src`` into ``dest``.

    Files are picked in sorted (by path) order so the same subset is selected
    on every run and on every machine; ``Path.glob`` alone yields entries in
    an arbitrary, filesystem-dependent order. Sub-directories and other
    non-file entries are skipped because ``shutil.copy`` cannot copy them.

    Args:
        src: Directory (``pathlib.Path``-like) to take files from.
        dest: Target directory; created together with missing parents.
        num_samples: Maximum number of files to copy. ``None`` copies all files.

    Returns:
        None. Existing files in ``dest`` with the same name are overwritten.
    """
    dest.mkdir(parents=True, exist_ok=True)
    # Only direct children of `src` are considered ('*' does not recurse).
    candidates = sorted(entry for entry in src.glob('*') if entry.is_file())
    for file in candidates[:num_samples]:
        shutil.copy(file, dest/file.name)


# Source folders inside the full IMDB download.
train_unsup = path/'unsup'
train_pos = path/'train'/'pos'
train_neg = path/'train'/'neg'
test_pos = path/'test'/'pos'
test_neg = path/'test'/'neg'

# The subset is written next to the full dataset folder.
top_datapath = path.parent
subset_path = top_datapath/'subset'

# (source folder, mirrored folder inside the subset), listed in copy order.
subset_plan = [
    (train_unsup, subset_path/'unsup'),
    (train_pos, subset_path/'train'/'pos'),
    (train_neg, subset_path/'train'/'neg'),
    (test_pos, subset_path/'test'/'pos'),
    (test_neg, subset_path/'test'/'neg'),
]

# Create every target folder first ...
for source_dir, target_dir in subset_plan:
    target_dir.mkdir(parents=True, exist_ok=True)

# ... then copy a sample of files (create_subset's default sample size) into each one.
for source_dir, target_dir in subset_plan:
    create_subset(source_dir, target_dir)

<img src="imdb_subset_finderview.png" alt="IMDB subset on disk" width="200"/>

In [None]:
# Prepare the dataset. Both the training set and the validation set.
#
# - blocks:    input is text (tokenized from the subset folder), output is a
#              category (the sentiment folder name, see get_y).
# - get_items: collect text files from the labeled train/ and test/ folders only.
#              Without the `folders` filter, every text file under subset_path
#              (e.g. the unlabeled unsup/ reviews) is collected as well and only
#              dropped later because the splitter does not match its grandparent.
# - splitter:  items whose grandparent folder is 'test' form the validation set;
#              the splitter's default training folder name is used for the rest.
# - get_y:     label each item with the name of its parent folder.
datablock = DataBlock(
    blocks=(TextBlock.from_folder(subset_path), CategoryBlock),
    get_items=partial(get_text_files, folders=['train', 'test']),
    splitter=GrandparentSplitter(valid_name='test'),
    get_y=parent_label,
)

# Batches of 16 items, placed on the device selected earlier in the notebook.
dataloaders = datablock.dataloaders(subset_path, bs=16, device=mps_device) # https://docs.fast.ai/data.transforms.html

In [None]:
# Debugging aid: have fastai print how this DataBlock builds its data from subset_path.
datablock.summary(subset_path)

In [None]:
# Sanity check: show at most three samples from one batch.
dataloaders.show_batch(max_n=3)

In [None]:
# Basic facts about the DataLoaders object: its type and its len(),
# followed by the sizes of the training and validation datasets.
print(type(dataloaders), len(dataloaders), sep='\n')
print(len(dataloaders.train_ds), len(dataloaders.valid_ds))

# Peek at the first three training samples; zip() with range(3) stops the
# iteration after the third item without fetching a fourth.
for i, sample in zip(range(3), dataloaders.train_ds):
    print(sample)

## Train and tune our model

In [None]:
# Train and tune our model.
# Text classifier learner using the AWD_LSTM architecture on the DataLoaders built above;
# accuracy is reported as the metric, and drop_mult=0.5 is passed as the dropout multiplier.
learn = text_classifier_learner(dataloaders, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

In [None]:
# Fine-tune the classifier: 4 epochs with a base learning rate of 1e-2.
learn.fine_tune(4, 1e-2)

In [None]:
# Show a handful of the model's predictions (fastai's show_results).
learn.show_results()

In [None]:
# Use our model by passing it a review.
# One clearly positive and one clearly negative example; probs[1] is reported
# as the probability of the positive class.
for review in ("I really liked that movie",
               "I did not like that movie, it was awful"):
    category, _, probs = learn.predict(review)

    print(f"This is a: {category}.")
    print(f"Probability it's a positive: {probs[1]:.4f}")

<img src="cpu_gpu_belastning.png" alt="CPU and GPU load" width="800"/>

## ULMFiT

<img src="ulmfit.png" alt="ULMFiT process" width="800"/>

In [None]:
# Language-model DataLoaders (is_lm=True) built only from the subset's unsup/ folder,
# with 10% of the items held out for validation (valid_pct=0.1).
# NOTE(review): no `seed` is passed, so the random split presumably differs between runs — confirm.
dataloaders_lm = TextDataLoaders.from_folder(subset_path/'unsup', is_lm=True, valid_pct=0.1)

In [None]:
# Sanity check: show at most three samples from one language-model batch.
dataloaders_lm.show_batch(max_n=3)

In [None]:
# Language-model learner (AWD_LSTM) reporting accuracy and perplexity, with wd=0.1.
# path=subset_path/'unsup' is the learner's working folder; the classifier section
# later loads the saved encoder from subset_path/'unsup/models'.
llm_learn = language_model_learner(dataloaders_lm, AWD_LSTM, metrics=[accuracy, Perplexity()], path=subset_path/'unsup', wd=0.1)

In [None]:
# First training pass: 4 epochs with the 1cycle policy at a maximum learning rate of 1e-2.
llm_learn.fit_one_cycle(4, 1e-2) # 0.01 | https://iconof.com/1cycle-learning-rate-policy/

In [None]:
# Checkpoint the language model after the first 4 epochs (before unfreezing).
llm_learn.save('4epoch')
# To resume from this checkpoint instead of retraining, uncomment:
# llm_learn = llm_learn.load('4epoch')

In [None]:
# Unfreeze the whole model, then train 10 more epochs at a lower maximum learning rate (1e-3).
llm_learn.unfreeze()
llm_learn.fit_one_cycle(10, 1e-3) # 0.001 | https://iconof.com/1cycle-learning-rate-policy/

In [None]:
# Save the fine-tuned encoder; the classifier section loads it by this name ('10epoch_finetuned').
llm_learn.save_encoder('10epoch_finetuned')

In [None]:
# Let the language model continue the prompt; the second argument (1) is presumably
# the number of words to generate, sampled with temperature 0.75.
print(llm_learn.predict("The man is a good", 1, temperature=0.75))

In [None]:
# Prompt and sampling settings for the generated reviews.
the_best_review_starts_with = "I liked this movie because: "
n_sentences = 2
n_words = 40

# Generate `n_sentences` separate continuations of the prompt; each call passes
# `n_words` as the requested length and samples with temperature 0.75.
preds = [
    llm_learn.predict(the_best_review_starts_with, n_words, temperature=0.75)
    for _attempt in range(n_sentences)
]

In [None]:
# Print the list of generated continuations.
print(preds)

<img src="ulmfit.png" alt="ULMFiT process" width="800"/>

## Optional: fine-tune the classifier with the language-model encoder

<img src="nevralt_nettverk.png" alt="Neural network" width="600"/>

In [None]:
# Classifier DataLoaders from the subset folder, using 'test' as the validation folder.
# text_vocab reuses the language model's vocabulary, i.e. the one the fine-tuned encoder was trained with.
dataloaders_classifier = TextDataLoaders.from_folder(subset_path, valid='test', text_vocab=dataloaders_lm.vocab)

In [None]:
# Second classifier: same architecture and settings as `learn`, but built on the
# DataLoaders that share the language model's vocabulary.
learn_2pass = text_classifier_learner(dataloaders_classifier, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

In [None]:
# Folder under the language-model learner's path that holds its saved models.
encoder_path = subset_path/'unsup'/'models'

# Swap the classifier's encoder for the fine-tuned language-model encoder saved earlier.
encoder_file = encoder_path/'10epoch_finetuned'
learn_2pass = learn_2pass.load_encoder(encoder_file)

In [None]:
# First pass: one epoch at a maximum learning rate of 2e-2, before more of the model is unfrozen below.
learn_2pass.fit_one_cycle(1, 2e-2) # 0.02 | https://iconof.com/1cycle-learning-rate-policy/

In [None]:
# Optional: uncomment to inspect the learning-rate slices used by the three
# gradual-unfreezing steps that follow (upper bound divided by 2.6**4 gives the lower bound).
#print(slice(1e-2/(2.6**4),1e-2))
#print(slice(5e-3/(2.6**4),5e-3))
#print(slice(1e-3/(2.6**4),1e-3))

In [None]:
# Gradual unfreezing, step 1.
learn_2pass.freeze_to(-2) # Keep only the last two parameter groups trainable (fastai freezes groups, not single layers)
learn_2pass.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2)) # 1 epoch; slice(lr for the earliest group, lr for the last group)

In [None]:
# Gradual unfreezing, step 2.
learn_2pass.freeze_to(-3) # Keep only the last three parameter groups trainable
learn_2pass.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3)) # 1 epoch; slice(lr for the earliest group, lr for the last group)

In [None]:
# Gradual unfreezing, step 3.
learn_2pass.unfreeze() # All parameter groups trainable
learn_2pass.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3)) # 2 epochs; slice(lr for the earliest group, lr for the last group)

In [None]:
# Use our model by passing it a review.
# Same two example reviews as for the first classifier, now scored by the
# classifier that uses the fine-tuned language-model encoder.
for review in ("I really liked that movie",
               "I did not like that movie, it was awful"):
    category, _, probs = learn_2pass.predict(review)

    print(f"This is a: {category}.")
    print(f"Probability it's a positive: {probs[1]:.4f}")