In [1]:
import torch

After importing torch we can check our dependencies:

In [2]:
# Check if CUDA is available
cuda_available = torch.cuda.is_available()

# Print whether CUDA is available
if cuda_available:
    print("CUDA is available.")
    print(f"PyTorch is using: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. Using CPU.")

# Print the device PyTorch is currently using
device = torch.device("cuda" if cuda_available else "cpu")
print(f"PyTorch is set to use: {device}")

CUDA is not available. Using CPU.
PyTorch is set to use: cpu


### Now we start to train our model
For this we created our own Dataset created in: Dataset/Data/final_dataset.csv

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Load your dataset
data = pd.read_csv("Dataset/Data/final_dataset.csv")  # Replace with your actual dataset path
data['Score'] = data['Score'].astype(float)

# Split into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the ideas
def tokenize_data(data):
    return tokenizer(
        data['Idea'].tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

train_encodings = tokenize_data(train_data)
val_encodings = tokenize_data(val_data)


### Now we have to create our own Dataset class:

In [4]:
import torch

class IdeaDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, scores):
        self.encodings = encodings
        self.scores = scores

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.scores[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.scores)

train_dataset = IdeaDataset(train_encodings, train_data['Score'].tolist())
val_dataset = IdeaDataset(val_encodings, val_data['Score'].tolist())


#### Now we train:

In [10]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load BERT model with a regression head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,22.195419
2,21.545000,21.273499
3,21.545000,19.431385
4,19.042700,16.102837
5,19.042700,11.652586
6,13.019100,9.455506
7,13.019100,8.910786
8,9.075100,8.601482
9,9.075100,8.125866
10,9.695200,7.61668


TrainOutput(global_step=50, training_loss=14.4754150390625, metrics={'train_runtime': 418.1556, 'train_samples_per_second': 1.674, 'train_steps_per_second': 0.12, 'total_flos': 46044021273600.0, 'train_loss': 14.4754150390625, 'epoch': 10.0})

#### Now its time for evaluation:

In [13]:
eval_results = trainer.evaluate()
print(f"Validation Loss: {eval_results['eval_loss']}")

# Custom tests
def score_idea(idea_text):
    inputs = tokenizer(idea_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    outputs = model(**inputs)
    score = outputs.logits.item()  # Get the predicted score
    return score

# Example usage
new_idea = "An app that delivers fresh, healthy meals for busy professionals"
print(f"Predicted score for the idea: {score_idea(new_idea)}")


Validation Loss: 7.616679668426514
Predicted score for the idea: 1.3295389413833618


In [11]:
new_idea = "A company that provides sustainable, biodegradable packaging solutions specifically designed for e-commerce businesses. The packaging is made from recycled materials, is compostable, and can be customized for branding. This would address growing concerns over environmental impact in the packaging industry"

print(f"Predicted score for the idea: {score_idea(new_idea)}")

new_idea = "A fitness platform that offers live, interactive classes, where trainers can adjust workout routines in real-time based on user data from wearables (like heart rate, calories burned, etc.). This approach combines virtual fitness with personalized coaching, offering a unique value proposition for fitness enthusiasts."

print(f"Predicted score for the idea: {score_idea(new_idea)}")

new_idea = "A pop-up coffee shop that operates within co-working spaces, catering to freelancers and remote workers. The business can establish partnerships with co-working spaces to share profits and offer exclusive deals to members. This could work in larger cities with a high concentration of co-working spaces."

print(f"Predicted score for the idea: {score_idea(new_idea)}")

new_idea = "A helmet designed to give users a personal sauna experience on the go. It has a built-in heating system that simulates the steam and heat of a sauna, which users can wear during daily tasks. This concept is impractical due to safety concerns and a lack of realistic use cases."


print(f"Predicted score for the idea: {score_idea(new_idea)}")


Predicted score for the idea: 1.7300198078155518
Predicted score for the idea: 1.5746402740478516
Predicted score for the idea: 1.7313016653060913
Predicted score for the idea: 1.6058305501937866
