In [None]:
%pip install pandas

import pandas as pd
%pip install torch
%pip install transformers
%pip install scikit-learn

In [None]:
%pip install datasets
from datasets import load_dataset



In [None]:
# Download the dataset by its repository id
DATASET_ID = "Glavin001/startup-interviews"
dataset = load_dataset(DATASET_ID)


In [3]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [6]:
# Show which splits the loaded dataset exposes
available_splits = dataset.keys()
print(available_splits)

dict_keys(['train'])


In [None]:
# Build the DataFrame from the split that load_dataset already fetched, rather than
# reading a machine-specific absolute path (the dataset id is not a CSV file on disk).
df = dataset['train'].to_pandas()


In [8]:


# Access the dataset splits
train_dataset = dataset['train']

# Split the train dataset into train (80%) and test (remaining rows) sets
train_size = int(0.8 * len(train_dataset))
test_size = len(train_dataset) - train_size

# Seed the generator so the partition is identical after a kernel restart
split_generator = torch.Generator().manual_seed(42)
train_split, test_split = torch.utils.data.random_split(
    train_dataset, [train_size, test_size], generator=split_generator
)

# `Subset.dataset` points back at the *whole* parent dataset, so reading it would make
# both variables identical and unsplit. Select the rows through each subset's indices.
train_data = train_dataset.select(train_split.indices)
test_data = train_dataset.select(test_split.indices)


In [None]:
# Pull the question and answer columns out of the DataFrame as plain Python lists
question_column = df.loc[:, 'instruction']
response_column = df.loc[:, 'output']
interview_questions = question_column.to_list()
candidate_responses = response_column.to_list()

In [None]:
# Build one two-line transcript per question/response pair
qa_pairs = zip(interview_questions, candidate_responses)
transcripts = [
    f"Interviewer: {question}\nCandidate: {response}"
    for question, response in qa_pairs
]

# NOTE(review): placeholder labels. This is five hard-coded scores, not one score per
# transcript; the later train/validation split pairs each transcript with a confidence,
# so replace this with the real per-transcript labels before training.
confidences = [1, 2, 3, 4, 5]

# Persist the generated transcripts (single 'Transcripts' column, no index column)
transcripts_df = pd.DataFrame(data={'Transcripts': transcripts})
transcripts_df.to_csv('generated_transcripts.csv', index=False)

In [None]:
# Hold out 20% of the (transcript, confidence) pairs for validation; fixed seed for repeatability
split_parts = train_test_split(transcripts, confidences, test_size=0.2, random_state=42)
transcripts_train, transcripts_val, confidences_train, confidences_val = split_parts

In [None]:

# Load the checkpoint's tokenizer, then encode both partitions with identical settings
tokenizer = RobertaTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
tokenize_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(transcripts_train, **tokenize_options)
val_encodings = tokenizer(transcripts_val, **tokenize_options)

In [None]:
# Convert confidences to float tensors.
# The model is created with num_labels=1 (a regression head), whose MSE loss needs
# floating-point targets; integer scores would otherwise become int64 tensors.
# `as_tensor` also keeps this cell safe to re-run once the names already hold tensors.
confidences_train = torch.as_tensor(confidences_train, dtype=torch.float)
confidences_val = torch.as_tensor(confidences_val, dtype=torch.float)


In [None]:
import dataclasses

# Define the model.
# num_labels=1 requests a single-output (regression) head, while the sentiment checkpoint
# was saved with a classification head of a different size (three labels per its model
# card -- confirm). ignore_mismatched_sizes lets the encoder weights load and the
# mismatched head be freshly initialised instead of raising a size-mismatch error.
model = RobertaForSequenceClassification.from_pretrained(
    'cardiffnlp/twitter-roberta-base-sentiment',
    num_labels=1,
    ignore_mismatched_sizes=True,
)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`; since the
# install at the top of the notebook is unpinned, use whichever name this version exposes.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

# Create the training arguments
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save the model and evaluation results
    num_train_epochs=5,      # Number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Weight decay coefficient
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=100,               # Log every 100 steps
    **{_eval_strategy_key: 'epoch'}  # Evaluate model at the end of each epoch
)

In [None]:
# Metric callback: mean squared error between flattened predictions and labels
def compute_mse(eval_pred):
    """Return the mean squared error for one evaluation pass.

    Args:
        eval_pred: object exposing `predictions` (model outputs) and `label_ids`
            (reference values).

    Returns:
        A dict with a single key, "mse".
    """
    raw_outputs, targets = eval_pred.predictions, eval_pred.label_ids
    flat_outputs = raw_outputs.flatten()
    mse_value = mean_squared_error(targets, flat_outputs)
    return {"mse": mse_value}

class TranscriptDataset(torch.utils.data.Dataset):
    """Pair tokenizer output with confidence scores, one example per index.

    The raw tokenizer output is a mapping of feature name -> list of per-example
    values and carries no labels, so it cannot be handed to Trainer directly.
    This wrapper exposes `len()` as the number of examples and returns a
    per-example dict of tensors that includes a float 'labels' entry.
    """

    def __init__(self, encodings, labels):
        """Store the encoded features and their per-example labels."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features and float label of example `idx` as tensors."""
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        # Float target regardless of whether labels arrive as a list or a tensor
        item['labels'] = torch.as_tensor(self.labels[idx], dtype=torch.float)
        return item


# Wrap encodings + confidences so every example the Trainer sees carries its label
train_torch_dataset = TranscriptDataset(train_encodings, confidences_train)
val_torch_dataset = TranscriptDataset(val_encodings, confidences_val)

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_torch_dataset,
    eval_dataset=val_torch_dataset,
    compute_metrics=compute_mse
)


In [None]:
# Fine-tune the model
trainer.train()

# Evaluate on the validation dataset the Trainer was constructed with. Passing the raw
# tokenizer output (`val_encodings`) here would hand evaluate() data that has no labels.
eval_result = trainer.evaluate()
print(eval_result)

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same directory
fine_tuned_dir = './fine-tuned-model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(fine_tuned_dir)
