In [9]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import pandas as pd
import numpy as np
import gzip
import json

# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Function to load JSON data from a GZ file (one JSON per line)
def load_json_from_gz(file_path):
    reviews = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            review = json.loads(line)
            reviews.append(review)
    return reviews

# Load your data (replace with your actual file path)
file_path = '/content/Magazine_Subscriptions_5.json.gz'
reviews = load_json_from_gz(file_path)

# Reduce the dataset to 800 lines
reviews = reviews[:150]

# Extract 'reviewText' and 'overall' 
df = pd.DataFrame(reviews)
df = df[['reviewText', 'overall']]

# Label the sentiment as in the previous example
df['sentiment'] = np.where(df['overall'] >= 4, 1, 0)
df = df[['reviewText', 'sentiment']]

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# Split data into train and test sets
train_size = int(len(df) * 0.8)
train_df = df[:train_size]
test_df = df[train_size:]

# Prepare data function (same as before)
def prepare_data(df):
    texts = df['reviewText'].tolist()
    labels = df['sentiment'].tolist()

    flattened_texts = []
    for text in texts:
        # Check if the text is a string
        if not isinstance(text, str):
            # Convert the text to a string if it is not already
            text = str(text)

        # Recursive function to flatten nested lists
        def flatten(nested_list):
            for item in nested_list:
                if isinstance(item, list):
                    yield from flatten(item)  # Recursively flatten
                elif isinstance(item, (str, int, float)):  # Handle strings, ints, and floats
                    yield str(item)  # Convert to string if needed
                else:
                    # If it's not a list, string, int, or float, handle it appropriately
                    # (e.g., ignore or raise an error depending on your data)
                    print(f"Warning: Encountered unexpected type: {type(item)}")
                    # yield str(item)  # You might need to convert other types to strings

        flattened_texts.append(' '.join(flatten(text)))

    encoded_texts = tokenizer(flattened_texts, truncation=True, padding='max_length', max_length=512, return_tensors='tf')

    return encoded_texts, np.array(labels)

# Prepare data for training and testing
train_texts, train_labels = prepare_data(train_df)
test_texts, test_labels = prepare_data(test_df)


In [11]:
# Define optimizer and loss function
optimizer = tf.keras.optimizers.Adam(epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile the model
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model
history = model.fit(
    train_texts,
    train_labels,
    validation_data=(test_texts, test_labels),
    epochs=1,  # Adjusting epochs as needed
    batch_size=8  # Adjusting batch size as needed
)

# Evaluate the model
loss, accuracy = model.evaluate(test_texts, test_labels)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)



ValueError: Cannot generate a hashable key for IteratorSpec(({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None)),) because the _serialize() method returned an unsupproted value of type <class 'transformers.tokenization_utils_base.BatchEncoding'>