In [None]:
pip install sacremoses


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import random
import nltk

papuga_model_name = 'flax-community/papuGaPT2'
device = 'cpu'

papuga_tokenizer = AutoTokenizer.from_pretrained(papuga_model_name)
papuga_model = AutoModelForCausalLM.from_pretrained(papuga_model_name).to(device)

def log_probs_from_logits(logits, labels):
    logp = torch.nn.functional.log_softmax(logits, dim=-1)
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label

def sentence_prob(sentence_txt):
    input_ids = papuga_tokenizer(sentence_txt, return_tensors='pt')['input_ids'].to(device)
    with torch.no_grad():
        output = papuga_model(input_ids=input_ids)
        log_probs = log_probs_from_logits(output.logits[:, :-1, :], input_ids[:, 1:])
        seq_log_probs = torch.sum(log_probs)
    return seq_log_probs.cpu().numpy()

def positive_or_negative(sentence_text):
    pos_prob = sentence_prob(sentence_text + " bardzo polecam")
    neg_prob = sentence_prob(sentence_text + " nie polecam")
    return 1 if neg_prob > pos_prob else 0


herbert_name = "allegro/herbert-base-cased"
herbert_tokenizer = AutoTokenizer.from_pretrained(herbert_name)
herbert_model = AutoModel.from_pretrained(herbert_name)


def extract_features_bert(text):
    inputs = herbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = herbert_model(**inputs)
    cls_embedding = outputs.last_hidden_state[0, 0, :].detach().numpy()
    return cls_embedding.flatten()


with open('/content/drive/My Drive/chiny_kicza/reviews_for_task3.txt', 'r') as f:
    lines = f.readlines()

positive_lines_list = []
negative_lines_list = []

for line in lines:
    words = line.split()
    if words[0] == 'GOOD':
        positive_lines_list.append(' '.join(words[1:]))
    else:
        negative_lines_list.append(' '.join(words[1:]))


positive_samples = [(line, 1) for line in positive_lines_list]
negative_samples = [(line, 0) for line in negative_lines_list]

all_samples = positive_samples + negative_samples
random.shuffle(all_samples)

texts, labels = zip(*all_samples)


features = []
for text in texts:
    bert_embedding = extract_features_bert(text)
    papuga_output = positive_or_negative(text)
    combined_features = np.concatenate((bert_embedding, [papuga_output]))
    features.append(combined_features)

features = np.array(features)
labels = np.array(labels)


train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)


clf = LogisticRegression(max_iter=10000)
clf.fit(train_features, train_labels)


accuracy = clf.score(test_features, test_labels)
print(f"Combined Classifier Accuracy: {accuracy}")


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Combined Classifier Accuracy: 0.9


In [None]:

nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import random
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

papuga_model_name = 'flax-community/papuGaPT2'
device = 'cpu'

papuga_tokenizer = AutoTokenizer.from_pretrained(papuga_model_name)
papuga_model = AutoModelForCausalLM.from_pretrained(papuga_model_name).to(device)

def log_probs_from_logits(logits, labels):
    logp = torch.nn.functional.log_softmax(logits, dim=-1)
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label

def sentence_prob(sentence_txt):
    input_ids = papuga_tokenizer(sentence_txt, return_tensors='pt')['input_ids'].to(device)
    with torch.no_grad():
        output = papuga_model(input_ids=input_ids)
        log_probs = log_probs_from_logits(output.logits[:, :-1, :], input_ids[:, 1:])
        seq_log_probs = torch.sum(log_probs)
    return seq_log_probs.cpu().numpy()

def perplexity(sentence_txt):
    input_ids = papuga_tokenizer(sentence_txt, return_tensors='pt')['input_ids'].to(device)
    with torch.no_grad():
        output = papuga_model(input_ids=input_ids)
        logits = output.logits
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        nll = -log_probs.mean()
        return torch.exp(nll).item()

def positive_or_negative(sentence_text):
    pos_prob = sentence_prob(sentence_text + " bardzo polecam")
    neg_prob = sentence_prob(sentence_text + " nie polecam")
    return 1 if neg_prob > pos_prob else 0


herbert_name = "allegro/herbert-base-cased"
herbert_tokenizer = AutoTokenizer.from_pretrained(herbert_name)
herbert_model = AutoModel.from_pretrained(herbert_name)


def extract_features_bert(text):
    inputs = herbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = herbert_model(**inputs)
    cls_embedding = outputs.last_hidden_state[0, 0, :].detach().numpy()
    return cls_embedding.flatten()


def spoil(L):
    res = []
    for w in L.split():
        if random.random() < 0.85:
            res.append(w)
        else:
            res.append(w.upper())
    return ' '.join(res)


def synonym_replacement(text):
    words = text.split()
    new_text = []
    for word in words:
        if random.random() < 0.3:
            synonyms = wordnet.synsets(word)
            if synonyms:
                synonym = synonyms[0].lemmas()[0].name().replace('_', ' ')
                new_text.append(synonym)
            else:
                new_text.append(word)
        else:
            new_text.append(word)
    return ' '.join(new_text)


with open('/content/drive/My Drive/chiny_kicza/reviews_for_task3.txt', 'r') as f:
    lines = f.readlines()

positive_lines_list = [line.split(maxsplit=1)[1] for line in lines if line.startswith('GOOD')]
negative_lines_list = [line.split(maxsplit=1)[1] for line in lines if line.startswith('BAD')]


positive_samples = [(line.strip(), 1) for line in positive_lines_list]
negative_samples = [(line.strip(), 0) for line in negative_lines_list]


augmented_positive_samples = [
    (spoil(synonym_replacement(line)), 1) for line in positive_lines_list
]
augmented_negative_samples = [
    (spoil(synonym_replacement(line)), 0) for line in negative_lines_list
]


all_samples = positive_samples + negative_samples + augmented_positive_samples + augmented_negative_samples
random.shuffle(all_samples)

texts, labels = zip(*all_samples)


features = []
for text in texts:
    bert_embedding = extract_features_bert(text)
    papuga_output = positive_or_negative(text)
    papuga_perplexity = perplexity(text)
    combined_features = np.concatenate((bert_embedding, [papuga_output, papuga_perplexity]))
    features.append(combined_features)

features = np.array(features)
labels = np.array(labels)


train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)


clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(train_features, train_labels)


predictions = clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
print(f"Random Forest Classifier Accuracy: {accuracy}")


print("\nClassification Report:")
print(classification_report(test_labels, predictions))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expe

Random Forest Classifier Accuracy: 0.86875

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.87        80
           1       0.88      0.85      0.87        80

    accuracy                           0.87       160
   macro avg       0.87      0.87      0.87       160
weighted avg       0.87      0.87      0.87       160



# **ZADANIE 3**


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import random

herbert_name = "allegro/herbert-base-cased"
device = 'cpu'

herbert_tokenizer = AutoTokenizer.from_pretrained(herbert_name)
herbert_model = AutoModel.from_pretrained(herbert_name).to(device)

if herbert_tokenizer.pad_token is None:
    herbert_tokenizer.pad_token = herbert_tokenizer.eos_token

def extract_features_bert(text):
    inputs = herbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    outputs = herbert_model(**inputs)
    cls_embedding = outputs.last_hidden_state[0, 0, :].detach().cpu().numpy()
    return cls_embedding.flatten()

with open('/content/drive/My Drive/chiny_kicza/reviews_for_task3.txt', 'r') as f:
    lines = f.readlines()

positive_lines_list = [line.split(maxsplit=1)[1].strip() for line in lines if line.startswith('GOOD')]
negative_lines_list = [line.split(maxsplit=1)[1].strip() for line in lines if line.startswith('BAD')]

positive_samples = [(line, 1) for line in positive_lines_list]
negative_samples = [(line, 0) for line in negative_lines_list]

all_samples = positive_samples + negative_samples
random.shuffle(all_samples)

texts, labels = zip(*all_samples)

features = []
for text in texts:
    bert_embedding = extract_features_bert(text)
    features.append(bert_embedding)

features = np.array(features)
labels = np.array(labels)

train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

clf = LogisticRegression(max_iter=10000)
clf.fit(train_features, train_labels)

accuracy = clf.score(test_features, test_labels)
print(f"Classifier Accuracy: {accuracy}")


pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Classifier Accuracy: 0.675


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import random

papuga_model_name = 'flax-community/papuGaPT2'
device = 'cpu'

papuga_tokenizer = AutoTokenizer.from_pretrained(papuga_model_name)
papuga_model = AutoModelForCausalLM.from_pretrained(papuga_model_name).to(device)

def generate_augmented_reviews(base_review, k):
    augmented_reviews = []
    input_ids = papuga_tokenizer(base_review, return_tensors="pt")["input_ids"].to(device)
    for _ in range(k):
        with torch.no_grad():
            generated_ids = papuga_model.generate(
                input_ids,
                max_length=len(input_ids[0]) + 1,
                num_return_sequences=1,
                do_sample=True,
                top_k=50,
                temperature=0.7,
                pad_token_id=papuga_tokenizer.eos_token_id
            )
        generated_text = papuga_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        augmented_reviews.append(generated_text[len(base_review):].strip())
    return augmented_reviews

herbert_name = "allegro/herbert-base-cased"
herbert_tokenizer = AutoTokenizer.from_pretrained(herbert_name)
herbert_model = AutoModel.from_pretrained(herbert_name)

if herbert_tokenizer.pad_token is None:
    herbert_tokenizer.pad_token = herbert_tokenizer.eos_token

def extract_features_bert(text):
    inputs = herbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = herbert_model(**inputs)
    cls_embedding = outputs.last_hidden_state[0, 0, :].detach().numpy()
    return cls_embedding.flatten()

with open('/content/drive/My Drive/chiny_kicza/reviews_for_task3.txt', 'r') as f:
    lines = f.readlines()

positive_lines_list = [line.split(maxsplit=1)[1].strip() for line in lines if line.startswith('GOOD')]
negative_lines_list = [line.split(maxsplit=1)[1].strip() for line in lines if line.startswith('BAD')]

k = 1

positive_samples = [(line, 1) for line in positive_lines_list]
negative_samples = [(line, 0) for line in negative_lines_list]

augmented_positive_samples = []
for line in positive_lines_list:
    augmented_positive_samples.extend([(aug_review, 1) for aug_review in generate_augmented_reviews(line, k)])

augmented_negative_samples = []
for line in negative_lines_list:
    augmented_negative_samples.extend([(aug_review, 0) for aug_review in generate_augmented_reviews(line, k)])

all_samples = positive_samples + negative_samples + augmented_positive_samples + augmented_negative_samples
random.shuffle(all_samples)

texts, labels = zip(*all_samples)

features = []
for text in texts:
    bert_embedding = extract_features_bert(text)
    combined_features = np.concatenate((bert_embedding, [0]))
    features.append(combined_features)

features = np.array(features)
labels = np.array(labels)

train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

clf = LogisticRegression(max_iter=10000)
clf.fit(train_features, train_labels)

accuracy = clf.score(test_features, test_labels)
print(f"Combined Classifier Accuracy: {accuracy}")


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Combined Classifier Accuracy: 0.64375


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import random

herbert_name = "allegro/herbert-base-cased"
herbert_tokenizer = AutoTokenizer.from_pretrained(herbert_name)
herbert_model = AutoModel.from_pretrained(herbert_name)

def extract_features_bert(text):
    inputs = herbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = herbert_model(**inputs)
    cls_embedding = outputs.last_hidden_state[0, 0, :].detach().numpy()
    return cls_embedding.flatten()

def introduce_typos(text, typo_prob=0.3):
    characters = list(text)
    for i in range(len(characters)):
        if random.random() < typo_prob:
            if random.random() < 0.5:
                if i > 0:
                    characters[i], characters[i-1] = characters[i-1], characters[i]
            else:
                characters[i] = random.choice('abcdefghijklmnopqrstuvwxyz')
    return ''.join(characters)

def generate_typo_augmented_reviews(base_review, k):
    return [introduce_typos(base_review) for _ in range(k)]

with open('/content/drive/My Drive/chiny_kicza/reviews_for_task3.txt', 'r') as f:
    lines = f.readlines()

positive_lines_list = [line.split(maxsplit=1)[1].strip() for line in lines if line.startswith('GOOD')]
negative_lines_list = [line.split(maxsplit=1)[1].strip() for line in lines if line.startswith('BAD')]

k = 3

positive_samples = [(line, 1) for line in positive_lines_list]
negative_samples = [(line, 0) for line in negative_lines_list]

augmented_positive_samples = []
for line in positive_lines_list:
    augmented_positive_samples.extend([(aug_review, 1) for aug_review in generate_typo_augmented_reviews(line, k)])

augmented_negative_samples = []
for line in negative_lines_list:
    augmented_negative_samples.extend([(aug_review, 0) for aug_review in generate_typo_augmented_reviews(line, k)])

all_samples = positive_samples + negative_samples + augmented_positive_samples + augmented_negative_samples
random.shuffle(all_samples)

texts, labels = zip(*all_samples)

features = []
for text in texts:
    bert_embedding = extract_features_bert(text)
    combined_features = np.concatenate((bert_embedding, [0]))  # No Papuga output
    features.append(combined_features)

features = np.array(features)
labels = np.array(labels)

train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

clf = LogisticRegression(max_iter=10000)
clf.fit(train_features, train_labels)

accuracy = clf.score(test_features, test_labels)
print(f"Combined Classifier Accuracy: {accuracy}")


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Combined Classifier Accuracy: 0.66875


In [None]:
import random
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from transformers import AutoTokenizer, AutoModel
import nltk
import gensim.downloader as api

nltk.download('punkt')

model_w2v = api.load("word2vec-google-news-300")

def get_synonyms(word):
    try:
        similar_words = model_w2v.most_similar(word, topn=5)
        synonyms = [similar_word[0] for similar_word in similar_words]
        return synonyms
    except KeyError:
        return []

def replace_with_synonyms(text):
    words = word_tokenize(text.lower())
    new_words = []

    for word in words:
        if len(word) > 3 and word.isalpha():
            synonyms = get_synonyms(word)
            if synonyms and random.random() < 0.5:
                new_word = random.choice(synonyms)
                new_words.append(new_word)
            else:
                new_words.append(word)
        else:
            new_words.append(word)

    return ' '.join(new_words)

herbert_name = "allegro/herbert-base-cased"
herbert_tokenizer = AutoTokenizer.from_pretrained(herbert_name)
herbert_model = AutoModel.from_pretrained(herbert_name)

if herbert_tokenizer.pad_token is None:
    herbert_tokenizer.pad_token = herbert_tokenizer.eos_token

def extract_features_bert(text):
    inputs = herbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = herbert_model(**inputs)
    cls_embedding = outputs.last_hidden_state[0, 0, :].detach().numpy()
    return cls_embedding.flatten()

with open('/content/drive/My Drive/chiny_kicza/reviews_for_task3.txt', 'r') as f:
    lines = f.readlines()

positive_lines_list = [line.split(maxsplit=1)[1].strip() for line in lines if line.startswith('GOOD')]
negative_lines_list = [line.split(maxsplit=1)[1].strip() for line in lines if line.startswith('BAD')]

positive_samples = [(line, 1) for line in positive_lines_list]
negative_samples = [(line, 0) for line in negative_lines_list]

augmented_positive_samples = [(replace_with_synonyms(line), 1) for line in positive_lines_list]
augmented_negative_samples = [(replace_with_synonyms(line), 0) for line in negative_lines_list]

all_samples = positive_samples + negative_samples + augmented_positive_samples + augmented_negative_samples
random.shuffle(all_samples)

texts, labels = zip(*all_samples)

features = []
for text in texts:
    bert_embedding = extract_features_bert(text)
    combined_features = np.concatenate((bert_embedding, [0]))
    features.append(combined_features)

features = np.array(features)
labels = np.array(labels)

train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

clf = LogisticRegression(max_iter=10000)
clf.fit(train_features, train_labels)

accuracy = clf.score(test_features, test_labels)
print(f"Combined Classifier Accuracy: {accuracy}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Combined Classifier Accuracy: 0.86875


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

text = "This is a test sentence."
tokens = tokenizer.tokenize(text)
print(tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['this', 'is', 'a', 'test', 'sentence', '.']


In [None]:
pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
import nltk

# Pobranie zasobów punkt i tagera
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
import spacy
from spacy.lang.pl.examples import sentences

nlp = spacy.load("pl_core_news_sm")
doc = nlp("Ala ma kota")
print(doc.text)
for token in doc:
    print(token.text, token.pos_, token.dep_)

Ala ma kota
Ala PROPN nsubj
ma VERB ROOT
kota NOUN iobj


In [None]:
!python -m spacy download pl_core_news_sm

Collecting pl-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.7.0/pl_core_news_sm-3.7.0-py3-none-any.whl (20.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pl-core-news-sm
Successfully installed pl-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pl_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
pip install gensim



In [None]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.tokenize import sent_tokenize
import nltk

import nltk
nltk.download('punkt_tab')

nltk.download('punkt')


polish_text = """
W Polsce mieszka wiele osób. Język polski jest jednym z najtrudniejszych na świecie.
Programowanie w Pythonie jest popularne także w Polsce.
"""


sentences = [simple_preprocess(sentence) for sentence in sent_tokenize(polish_text, language='polish')]

model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)


print("Podobieństwo między 'Polsce' a 'Polsce':", model.wv.similarity("polsce", "polsce"))

print("Słowa podobne do 'Polska':", model.wv.most_similar("polsce"))



Podobieństwo między 'Polsce' a 'Polsce': 0.99999994
Słowa podobne do 'Polska': [('mieszka', 0.13725273311138153), ('popularne', 0.06797593832015991), ('jednym', 0.03364057466387749), ('programowanie', 0.00939116906374693), ('osób', 0.008315935730934143), ('pythonie', 0.004503016360104084), ('wiele', -0.0036444442812353373), ('jest', -0.010839177295565605), ('także', -0.023671656847000122), ('polski', -0.09575343877077103)]


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'data/w2v_polish_lemmas.model'