In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Assuming 'df_selected' is prepared as in your provided code

# Load your dataset
df = pd.read_csv('data/short_amazon_reviews.csv')

# Preprocess your dataset as per your initial steps (not shown here for brevity)
# Step 1: Keep only the required columns
df_selected = df[['verified', 'category', 'price', 'rating', 'reviewText']]
# Step 2: Convert 'verified' from boolean to 0 and 1
df_selected['verified'] = df_selected['verified'].astype(int)
# Step 3: Convert 'category' to numbers and save the dictionary for conversion
df_selected['category'] = df_selected['category'].astype('category')
category_mapping = dict(enumerate(df_selected['category'].cat.categories))
df_selected['category'] = df_selected['category'].cat.codes
# More thorough cleaning of the 'price' column to ensure it only contains valid numeric strings
df_selected['price'] = df_selected['price'].str.extract('(\d+\.\d+|\d+)').astype(float)
# Fill nulls in 'price' with the mean of the column, now that it's properly cleaned
df_selected['price'].fillna(df_selected['price'].mean(), inplace=True)
# Attempt to convert 'rating' to int again
df_selected['rating'] = df_selected['rating'].astype(int)
# Remove any remaining nulls from the dataframe
df_selected.dropna(inplace=True)
df_selected.head()

# Splitting the dataset into training and testing sets
# Assuming df_selected is your final preprocessed DataFrame ready for modeling
X = df_selected.drop('rating', axis=1)
y = df_selected['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to encode texts and average BERT outputs
def encode_texts(texts):
    model.eval()  # Set the model to evaluation mode
    encoded_features = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=64,  # Ensure we do not exceed BERT's max length
            truncation=True,  # Truncate texts that exceed the max length
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        with torch.no_grad():  # Ensure no gradients are calculated
            output = model(encoded_dict['input_ids'], encoded_dict['attention_mask'])
            # Take the mean of the embeddings for the sequence
            feature = output.last_hidden_state.mean(dim=1).squeeze().detach().cpu().numpy()
            encoded_features.append(feature)
    return np.array(encoded_features)


# Encode the review texts
X_text_train = encode_texts(X_train['reviewText'].tolist())
X_text_test = encode_texts(X_test['reviewText'].tolist())

# Prepare non-text features, normalize as necessary
# Here, add code to preprocess and prepare non-text features from X_train and X_test

# Combine BERT-encoded text features with other features
# Assuming non-text features are prepared and stored in X_train_non_text and X_test_non_text
X_train_combined = np.hstack([X_text_train, X_train_non_text])
X_test_combined = np.hstack([X_text_test, X_test_non_text])

# Train a Random Forest classifier
clf = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))
clf.fit(X_train_combined, y_train)

# Evaluate the model
print("Training accuracy:", clf.score(X_train_combined, y_train))
print("Test accuracy:", clf.score(X_test_combined, y_test))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['verified'] = df_selected['verified'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['category'] = df_selected['category'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['category'] = df_selected['category'].cat.codes
A value is try

NameError: name 'X_train_non_text' is not defined