In [None]:
!pip install transformers

In [None]:
from transformers import pipeline
# Fill-mask smoke test with the stock roberta-base checkpoint. The pipeline
# call on the last line is left bare so the notebook displays its return value.
unmasker = pipeline(model='roberta-base', task='fill-mask')
unmasker("Hello I'm a <mask> model.")

In [None]:
from transformers import RobertaTokenizer, RobertaModel
# Extract features for a sample text with the PyTorch RoBERTa encoder.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
text = '''Sanoat, deb xom ashyo, material, yoqilgʻi, energiya va boshqa mahsulotlar ishlab chiqaruvchi yoki aholiga xizmat koʻrsatuvchi korxonalar (zavod, fabrika, kon, shaxta, elektr stansiya, ferma va hk) majmuasiga aytiladi. Sanoat xalq xoʻjaligining muhim sohasidir.
Chet ellarda Sanoatning rivojlanishi sanoat jihatidan rivojlangan mamlakatlarning paydo boʻlishiga olib keldi. Bu mamlakatlarda iqtisodiyot taraqqiyoti katta hajmda jamgʻarilgan texnika jihatidan ilgʻor kapital asosida va mavjud yuqori malakali ishchi kuchi bilan taʼminlanadi. Ularga AQSH, Kanada, Yaponiya, koʻpgina Gʻarbiy Yevropa mamlakatlari kiradi.[1]'''
# truncation=True clips the encoding to the tokenizer's model_max_length, so a
# longer text cannot overflow the model's position embeddings and crash the
# forward pass. Texts that already fit are encoded exactly as before.
encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
output = model(**encoded_input)

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel
# Same feature extraction with the TensorFlow RoBERTa encoder.
# `text` is the sample string defined in the previous cell.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaModel.from_pretrained('roberta-base')
# truncation=True clips the encoding to the tokenizer's model_max_length, so a
# longer text cannot overflow the model's position embeddings and crash the
# forward pass. Texts that already fit are encoded exactly as before.
encoded_input = tokenizer(text, return_tensors='tf', truncation=True)
output = model(encoded_input)

In [None]:
from transformers import pipeline
# Rebuild the fill-mask pipeline on the stock roberta-base checkpoint and probe
# it with an Uzbek prompt; the bare call on the last line is the cell's output.
unmasker = pipeline(task='fill-mask', model='roberta-base')
unmasker("axoliga xizmat ko'rsatadigan  <mask> joylashgan.")

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM
from torch.utils.data import Dataset, DataLoader

# Step 1: Preprocess the dataset

# The raw corpus is read from 'text data.txt' in the working directory.
# The encoding is given explicitly so non-ASCII characters are decoded the same
# way on every platform instead of depending on the locale default.
# NOTE(review): assumes the file was saved as UTF-8 — confirm.
with open('text data.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# The whole corpus is tokenized as one long stream of token strings.
tokenized_text = tokenizer.tokenize(text_data)

# Step 2: Prepare the training data

# Split the tokenized_text into training, validation, and test sets as needed

# Step 3: Install the required libraries (already assumed installed)

# Step 4: Load the RoBERTa model
model = RobertaForMaskedLM.from_pretrained('roberta-base')

# Step 5: Fine-tune the model

BLOCK_SIZE = 32         # content tokens per training sequence
MLM_PROBABILITY = 0.15  # share of tokens the model has to predict per sequence
SEED = 42               # makes shuffling and masking repeatable across runs


# Define a custom dataset class
class TextDataset(Dataset):
    """Contiguous, fixed-length windows of token ids cut from one token stream.

    Args:
        tokenized_text: list of token strings; converted to ids with the
            notebook-level ``tokenizer``.
        block_size: number of content tokens per window. The last window may
            be shorter.

    Each item is a 1-D ``torch.long`` tensor holding one window wrapped in the
    tokenizer's special tokens. Keeping windows contiguous preserves the word
    order the model needs as context.
    """

    def __init__(self, tokenized_text, block_size=BLOCK_SIZE):
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        self.inputs = [
            torch.tensor(
                tokenizer.build_inputs_with_special_tokens(token_ids[start:start + block_size]),
                dtype=torch.long,
            )
            for start in range(0, len(token_ids), block_size)
        ]

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx]


def mask_tokens(token_ids, mlm_probability=MLM_PROBABILITY):
    """Build (inputs, labels) for one masked-language-modelling step.

    Args:
        token_ids: integer tensor of token ids (any shape).
        mlm_probability: chance for each non-special token to be selected.

    Returns:
        inputs: copy of ``token_ids`` where selected positions were corrupted
            (80% mask token, 10% random token, 10% left unchanged).
        labels: copy of ``token_ids`` with every non-selected position set to
            -100, the index the model's loss ignores.
    """
    inputs = token_ids.clone()
    labels = token_ids.clone()

    is_special = torch.isin(token_ids, torch.tensor(tokenizer.all_special_ids))
    selected = (torch.rand(token_ids.shape) < mlm_probability) & ~is_special
    labels[~selected] = -100

    roll = torch.rand(token_ids.shape)
    inputs[selected & (roll < 0.8)] = tokenizer.mask_token_id
    randomize = selected & (roll >= 0.8) & (roll < 0.9)
    random_ids = torch.randint(tokenizer.vocab_size, token_ids.shape, dtype=torch.long)
    inputs[randomize] = random_ids[randomize]
    return inputs, labels


torch.manual_seed(SEED)

# Create instances of the custom dataset class
full_dataset = TextDataset(tokenized_text)
num_blocks = len(full_dataset)

# Hold out the last ~10% + ~10% of the blocks for validation and testing.
# A contiguous split keeps neighbouring text from leaking across the sets.
num_held_out = max(1, num_blocks // 10) if num_blocks >= 3 else 0
train_end = num_blocks - 2 * num_held_out
validation_end = num_blocks - num_held_out

train_dataset = torch.utils.data.Subset(full_dataset, range(0, train_end))
validation_dataset = torch.utils.data.Subset(full_dataset, range(train_end, validation_end))
test_dataset = torch.utils.data.Subset(full_dataset, range(validation_end, num_blocks))

if len(train_dataset) == 0:
    raise ValueError("tokenized_text is empty; there is nothing to train on")

# batch_size=None turns automatic batching off, so every loader item is one
# 1-D sequence of token ids; shuffling reorders whole sequences, never the
# tokens inside a sequence.
train_loader = DataLoader(train_dataset, batch_size=None, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=None, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=None, shuffle=False)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# Not used by the loop below (the model returns its own loss when it is given
# labels); the name is kept defined for any code that still refers to it.
loss_fn = torch.nn.CrossEntropyLoss()
num_epochs = 20

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        # Add the batch dimension, then hide a random subset of tokens.
        inputs, labels = mask_tokens(batch.unsqueeze(dim=0))
        if (labels == -100).all():
            # No token was selected: the loss would be NaN and would corrupt
            # the weights on backward(), so skip this sequence.
            continue

        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print('Epoch >',epoch)

# NOTE: the fine-tuned weights only live in memory here; call
# model.save_pretrained(<directory>) to load them again from disk later.

In [None]:
# Step 6: Evaluate the model
#
# Masked-language-model evaluation: a fixed share of the tokens in every
# sequence is replaced by the mask token, and loss / accuracy are measured on
# those hidden positions only. Feeding the unmasked sequence as both input and
# label would only measure how well the model copies tokens it can see.

model.eval()  # Set the model to evaluation mode

validation_loss = 0.0
total_predictions = 0
correct_predictions = 0

mask_probability = 0.15
mask_rng = torch.Generator().manual_seed(0)  # same masked positions on every run
special_ids = torch.tensor(tokenizer.all_special_ids)

with torch.no_grad():
    for batch in validation_loader:  # Assuming you have a DataLoader for the validation set
        # Accept one 1-D sequence of token ids or an already batched 2-D tensor.
        token_ids = batch if batch.dim() == 2 else batch.unsqueeze(dim=0)

        selected = torch.rand(token_ids.shape, generator=mask_rng) < mask_probability
        selected &= ~torch.isin(token_ids, special_ids)
        num_selected = int(selected.sum())
        if num_selected == 0:
            continue  # nothing to predict; the loss would be NaN

        inputs = token_ids.masked_fill(selected, tokenizer.mask_token_id)
        labels = token_ids.masked_fill(~selected, -100)  # -100 is ignored by the loss

        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        # outputs.loss is a mean over the selected tokens; weight it so the
        # final figure is a per-token average across all sequences.
        validation_loss += loss.item() * num_selected

        predicted_indices = outputs.logits.argmax(dim=-1)
        total_predictions += num_selected
        correct_predictions += (predicted_indices[selected] == token_ids[selected]).sum().item()

if total_predictions == 0:
    raise ValueError("validation_loader produced no tokens to evaluate")

validation_loss /= total_predictions
accuracy = correct_predictions / total_predictions

print(f"Validation Loss: {validation_loss:.4f}")
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Step 7: Test the model
#
# Masked-language-model evaluation on the test set: a fixed share of the tokens
# in every sequence is replaced by the mask token, and loss / accuracy are
# measured on those hidden positions only. Feeding the unmasked sequence as
# both input and label would only measure how well the model copies tokens it
# can see.

model.eval()  # Set the model to evaluation mode

test_loss = 0.0
total_predictions = 0
correct_predictions = 0

mask_probability = 0.15
mask_rng = torch.Generator().manual_seed(0)  # same masked positions on every run
special_ids = torch.tensor(tokenizer.all_special_ids)

with torch.no_grad():
    for batch in test_loader:  # Assuming you have a DataLoader for the test set
        # Accept one 1-D sequence of token ids or an already batched 2-D tensor.
        token_ids = batch if batch.dim() == 2 else batch.unsqueeze(dim=0)

        selected = torch.rand(token_ids.shape, generator=mask_rng) < mask_probability
        selected &= ~torch.isin(token_ids, special_ids)
        num_selected = int(selected.sum())
        if num_selected == 0:
            continue  # nothing to predict; the loss would be NaN

        inputs = token_ids.masked_fill(selected, tokenizer.mask_token_id)
        labels = token_ids.masked_fill(~selected, -100)  # -100 is ignored by the loss

        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        # outputs.loss is a mean over the selected tokens; weight it so the
        # final figure is a per-token average across all sequences.
        test_loss += loss.item() * num_selected

        predicted_indices = outputs.logits.argmax(dim=-1)
        total_predictions += num_selected
        correct_predictions += (predicted_indices[selected] == token_ids[selected]).sum().item()

if total_predictions == 0:
    raise ValueError("test_loader produced no tokens to evaluate")

test_loss /= total_predictions
accuracy = correct_predictions / total_predictions

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Use the fine-tuned model to predict a masked token
from transformers import RobertaForMaskedLM, RobertaTokenizer

# NOTE(review): 'path_to_fine_tuned_model' looks like a placeholder — presumably
# it should be a directory written by model.save_pretrained(); replace before running.
model = RobertaForMaskedLM.from_pretrained('path_to_fine_tuned_model')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# RoBERTa's mask token is "<mask>", not BERT's "[MASK]". Taking it from the
# tokenizer guarantees the prompt really contains a position to predict.
text = f"Amir temur  {tokenizer.mask_token} bilan jang qilgan."

encoded_input = tokenizer.encode_plus(text, return_tensors='pt')
input_ids = encoded_input['input_ids']

with torch.no_grad():
    predictions = model(input_ids)[0]  # logits, one row of vocabulary scores per token

# Column indices of every mask token in the (single) input sequence.
masked_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
if masked_index.numel() == 0:
    raise ValueError(f"the prompt contains no {tokenizer.mask_token} token to predict")

predicted_token_ids = torch.argmax(predictions[0, masked_index], dim=-1)
# Hand plain Python ints to the tokenizer rather than a tensor.
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids.tolist())

print(f"Predicted token: {predicted_tokens[0]}")

In [None]:
# RobertaForMaskedLM is an encoder-only model: it scores tokens for masked
# positions but cannot continue a text with generate(). The five candidates are
# therefore produced by filling the mask with its five highest-scoring tokens.
NUM_CANDIDATES = 5

# RoBERTa's mask token is "<mask>", not BERT's "[MASK]"; take it from the tokenizer.
text = f"Aynan Timur davrida Samarqand {tokenizer.mask_token} bo’lib qolgan "

encoded_input = tokenizer.encode_plus(text, return_tensors='pt')
input_ids = encoded_input['input_ids']

with torch.no_grad():
    logits = model(input_ids).logits

mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1]
if mask_positions.numel() == 0:
    raise ValueError(f"the prompt contains no {tokenizer.mask_token} token to predict")
mask_position = int(mask_positions[0])

# One copy of the prompt per candidate, best candidate first.
top_token_ids = logits[0, mask_position].topk(NUM_CANDIDATES).indices
output = input_ids.repeat(NUM_CANDIDATES, 1)
output[:, mask_position] = top_token_ids

# output[0] is the most likely completion; the remaining rows hold the runners-up.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Generated text: {generated_text}")
