In [11]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

data = {
    'description': [
        'Eco-friendly bamboo toothbrush',
        'Sustainable organic cotton t-shirt',
        'Reusable stainless steel water bottle',
        'Biodegradable natural dish soap',
        'Recyclable paper shopping bag',
        'Solar-powered LED garden lights',
        'Electric vehicle home charging station',
        'Organic vegan protein powder',
        'Upcycled woolen sweater',
        'Plant-based compostable cutlery',
        'Single-use plastic straws',
        'Disposable plastic water bottles',
        'Chemical-based household cleaners',
        'Non-recyclable Styrofoam cups',
        'Gasoline-powered lawn mower',
        'Traditional incandescent light bulbs',
        'Leather shoes from non-sustainable sources',
        'Plastic wrap',
        'Battery-operated plastic toys',
        'Fast fashion clothing',
        'Hand-powered coffee grinder',
        'LED energy-saving lightbulbs',
        'Wind turbine home power kit',
        'Solar-powered smartphone charger',
        'Bicycle with recycled aluminum frame',
        'Furniture made from reclaimed wood',
        'Biodegradable plant pots',
        'Natural rubber yoga mat',
        'Eco-friendly laundry detergent',
        'Recycled glass food storage containers',
        'Hemp-based biodegradable phone case',
        'Rainwater collection and filtration system',
        'Compost bin for kitchen waste',
        'Natural fiber area rugs',
        'Thermal insulating window treatments',
        'Drought-resistant garden seeds',
        'Electric commuter bicycle',
        'Solar oven for outdoor cooking',
        'Rechargeable AA and AAA batteries',
        'Eco-friendly insulation material for homes',
        'Standard non-LED holiday lights',
        'Battery-powered disposable plastic toothbrush',
        'Plastic non-rechargeable flashlight',
        'Conventionally grown cotton jeans',
        'Non-biodegradable foam yoga block',
        'Single-use paper coffee cups',
        'Plastic-coated fast food packaging',
        'Disposable diapers',
        'High VOC interior paint',
        'Plastic garden hose',
        'Plastic adhesive bandages',
        'Non-recyclable coated paper plates',
        'Single-use synthetic party decorations',
        'Acrylic plastic exterior house paint',
        'Disposable plastic razors',
        'One-time use vinyl gloves',
        'Polyester shower curtain',
        'Foam mattress with non-organic materials',
        'Artificial leather handbag',
        'Inflatable PVC pool toys',

    ],
    'score': [
        85, 80, 95, 75, 70, 90, 100, 80, 85, 88,
        30, 25, 40, 15, 20, 10, 45, 35, 50, 25,
        92, 90, 93, 89, 91, 86, 84, 81, 79, 87,
        82, 94, 83, 76, 78, 77, 96, 97, 88, 85,
        42, 34, 30, 48, 36, 39, 33, 29, 44, 37,
        45, 41, 40, 43, 38, 46, 47, 49, 50, 31,

    ]
}

# Complete the dataset to have a total number of 100 entries, adjusting the description and scores appropriately

df = pd.DataFrame(data)

In [2]:
# Split the data
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [3]:
!pip install datasets
!pip install transformers
!pip upgrade accelerate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [4]:
from datasets import Dataset

# Create a Dataset object from the DataFrame
hf_dataset = Dataset.from_pandas(df)


In [5]:
# Split the Dataset into training and testing sets
hf_dataset = hf_dataset.train_test_split(test_size=0.2)

In [12]:
from datasets import Dataset
from transformers import DistilBertTokenizer

# Create a Dataset object from the DataFrame
hf_dataset = Dataset.from_pandas(df)

# Split the Dataset into training and testing sets
hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42) # Added seed for reproducibility

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Define the tokenize function
def tokenize_function(examples):
    # Truncate at max_length to ensure consistent sequence lengths
    return tokenizer(examples['description'], padding="max_length", truncation=True)

# Tokenize the data
tokenized_train_dataset = hf_dataset['train'].map(tokenize_function, batched=True)
tokenized_test_dataset = hf_dataset['test'].map(tokenize_function, batched=True)

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [14]:
def format_labels(example):
    example['labels'] = [float(example['score'])]
    return example

# Apply label formatting to train and test dataset separately
train_dataset = tokenized_train_dataset.map(format_labels)
test_dataset = tokenized_test_dataset.map(format_labels)

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [17]:
from transformers import DistilBertForSequenceClassification

# Load the pre-trained model configured for regression (num_labels=1)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
!pip install accelerate -U




In [2]:
!pip install --upgrade transformers
!pip install --upgrade accelerate

Collecting transformers
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.48.2-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.1
    Uninstalling transformers-4.47.1:
      Successfully uninstalled transformers-4.47.1
Successfully installed transformers-4.48.2


In [18]:
from transformers import TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
)

In [19]:
from sklearn.metrics import mean_squared_error
from transformers import Trainer

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.squeeze(predictions)  # remove batch dimension if necessary
    return {"mse": mean_squared_error(labels, predictions)}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [20]:
# Train the model on the training data
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkarthicksaai197[0m ([33mkarthicksaai197-vellore-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Mse
1,No log,4396.780762,4396.780273
2,4415.669500,4395.492676,4395.493164
3,4415.669500,4393.438965,4393.437988


TrainOutput(global_step=18, training_loss=4424.210286458333, metrics={'train_runtime': 660.2849, 'train_samples_per_second': 0.218, 'train_steps_per_second': 0.027, 'total_flos': 19074965225472.0, 'train_loss': 4424.210286458333, 'epoch': 3.0})

In [21]:
# Evaluate the model on the testing data
trainer.evaluate()

{'eval_loss': 4393.43896484375,
 'eval_mse': 4393.43798828125,
 'eval_runtime': 20.3679,
 'eval_samples_per_second': 0.589,
 'eval_steps_per_second': 0.098,
 'epoch': 3.0}

In [22]:
# Assuming you have a Trainer object named 'trainer' and a tokenizer named 'tokenizer'
# Define the path to save the model and tokenizer
model_path = './my_trained_model'

# Save the trained model
trainer.model.save_pretrained(model_path)

# Save the corresponding tokenizer used during training
tokenizer.save_pretrained(model_path)

('./my_trained_model/tokenizer_config.json',
 './my_trained_model/special_tokens_map.json',
 './my_trained_model/vocab.txt',
 './my_trained_model/added_tokens.json')

In [23]:
from transformers import pipeline, DistilBertTokenizer, DistilBertForSequenceClassification

# Load the tokenizer and model
model_path = './my_trained_model'
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Initialize the pipeline
sustainability_scorer = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True
)

# Define your query (new product description)
query = "plastic"

# Use the scorer pipeline to predict the sustainability score
result = sustainability_scorer(query)

# Extract the score from the result
# Here we're taking the first element from the result since `return_all_scores` is True
# and then accessing the regression value with the 'score' key.
sustainability_score = result[0][0]['score']

print(f"Sustainability score for '{query}': {sustainability_score}")

Device set to use cpu


Sustainability score for 'plastic': 0.14047245681285858


