In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
!pip install pandas numpy scikit-learn catboost transformers torch joblib tqdm




In [17]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import re
import warnings
warnings.filterwarnings('ignore')

DATA_PATH = "/content/drive/MyDrive/Reviews.csv"
RANDOM_STATE = 42
TEST_SIZE = 0.05
POSITIVE_THRESHOLD = 0.7

dtypes = {
    'Id': 'int32',
    'ProductId': 'category',
    'UserId': 'category',
    'Score': 'int8',
    'Time': 'int32',
    'HelpfulnessNumerator': 'int16',
    'HelpfulnessDenominator': 'int16'
}
df = pd.read_csv(
    DATA_PATH,
    dtype=dtypes,
    usecols=['Score', 'Text'],
    nrows=568000
)

def clean_text(text):
    text = re.sub(r'<[^>]+>', '', str(text))
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower().strip()

print("Cleaning text data...")
df['Text'] = df['Text'].apply(clean_text)
df = df[df['Text'].str.len() > 5]



df['sentiment'] = np.where(
    df['Score'] <= 2, 'negative',
    np.where(df['Score'] == 3, 'neutral', 'positive')
)


class_counts = df['sentiment'].value_counts()
print("Original class distribution:\n", class_counts)

pos_samples = class_counts['positive']
target_pos = int((class_counts.sum() - pos_samples) * POSITIVE_THRESHOLD / (1 - POSITIVE_THRESHOLD))
if target_pos < pos_samples:
    df_pos = df[df['sentiment'] == 'positive'].sample(n=target_pos, random_state=RANDOM_STATE)
    df_balanced = pd.concat([
        df[df['sentiment'] != 'positive'],
        df_pos
    ])
else:
    df_balanced = df


class_weights = {
    'negative': 1 / (df_balanced['sentiment'] == 'negative').mean(),
    'neutral': 1 / (df_balanced['sentiment'] == 'neutral').mean(),
    'positive': 1 / (df_balanced['sentiment'] == 'positive').mean()
}

total = sum(class_weights.values())
class_weights = {k: v * 3 / total for k, v in class_weights.items()}
print("Class weights:", class_weights)



Cleaning text data...
Original class distribution:
 sentiment
positive    443434
negative     81956
neutral      42609
Name: count, dtype: int64
Class weights: {'negative': np.float64(0.9359155774904834), 'neutral': np.float64(1.800180644202165), 'positive': np.float64(0.2639037783073516)}


In [18]:
X_train_df1 = pd.DataFrame({'review_text': X_train.reset_index(drop=True)})
X_test_df = pd.DataFrame({'review_text': X_test.reset_index(drop=True)})

train_pool = Pool(
    data=X_train_df1,
    label=y_train,
    text_features=['review_text']
)

test_pool = Pool(
    data=X_test_df,
    label=y_test,
    text_features=['review_text']
)

class_names = sorted(y_train.unique())
print(f"CatBoost class order: {class_names}")

weight_map = {
    'negative': class_weights['negative'],
    'neutral': class_weights['neutral'],
    'positive': class_weights['positive']
}
class_weights_ordered = [weight_map[c] for c in class_names]

print(f"Ordered class weights: {dict(zip(class_names, class_weights_ordered))}")


NameError: name 'X_train' is not defined

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import Pool
import re

print("Loading data")

DATA_PATH = "/content/drive/MyDrive/Reviews.csv"

df = pd.read_csv(
    DATA_PATH,
    usecols=['Score', 'Text'],
    nrows=568000
)

def clean_text(text):
    text = re.sub(r'<[^>]+>', '', str(text))
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower().strip()

print("Cleaning text")
df['Text'] = df['Text'].apply(clean_text)
df = df[df['Text'].str.len() > 5]

print("Creating labels")
df['sentiment'] = np.where(
    df['Score'] <= 2, 'negative',
    np.where(df['Score'] == 3, 'neutral', 'positive')
)

print("\nClass distribution:")
print(df['sentiment'].value_counts())


from collections import Counter
counts = Counter(df['sentiment'])
target_size = max(counts['negative'], counts['neutral']) * 3

df_balanced = pd.concat([
    df[df['sentiment'] == 'negative'],
    df[df['sentiment'] == 'neutral'],
    df[df['sentiment'] == 'positive'].sample(n=target_size, random_state=42)
]).reset_index(drop=True)

print(f"\nAfter balancing: {len(df_balanced)} samples")
print(df_balanced['sentiment'].value_counts())

print("\nSplitting into train/test sets...")

X = df_balanced['Text']
y = df_balanced['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.05,
    random_state=42,
    stratify=y
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

In [None]:
print("\nConverting text data to DataFrame format for CatBoost...")
X_train_df1 = pd.DataFrame({'review_text': X_train.reset_index(drop=True)})
X_test_df = pd.DataFrame({'review_text': X_test.reset_index(drop=True)})

train_pool = Pool(
    data=X_train_df1,
    label=y_train,
    text_features=['review_text']
)

test_pool = Pool(
    data=X_test_df,
    label=y_test,
    text_features=['review_text']
)

class_names = sorted(y_train.unique())
print(f"CatBoost class order: {class_names}")

weight_map = {
    'negative': class_weights['negative'],
    'neutral': class_weights['neutral'],
    'positive': class_weights['positive']
}
class_weights_ordered = [weight_map[c] for c in class_names]

print(f"Ordered class weights: {dict(zip(class_names, class_weights_ordered))}")


In [None]:
text_processing = (
    "TextProcessing:type=BM25;"
    "tokenizers=[{"
        "tokenizer_id='tok1',"
        "delimiter=' ',"
        "token_types='Word,Number,Url,Email',"
        "lowercasing=true"
    "}],"
    "dictionaries=[{"
        "dictionary_id='dict1',"
        "gram_order=1,"
        "max_dictionary_size=50000"
    "},{"
        "dictionary_id='dict2',"
        "gram_order=2"
    "}]"
)

import catboost
print(f"CatBoost version: {catboost.__version__}")
if catboost.__version__ < '1.2.0':
    !pip install -U catboost


task_type = "GPU" if 'COLAB_GPU' in globals() else "CPU"
print(f"Using task type: {task_type}")

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='TotalF1:average=Macro',
    class_weights=class_weights_ordered,
    task_type=task_type,
    devices='0' if task_type == "GPU" else None,
    thread_count=-1,
    early_stopping_rounds=50,
    verbose=100,
    random_seed=RANDOM_STATE,
    class_names=class_names,
    text_processing=text_processing
)

try:
    print("\nStarting model training ")
    model.fit(
        train_pool,
        eval_set=test_pool,
        plot=True,
        use_best_model=True
    )
    print("\nTraining completed successfully!")
    print(f"Final model has {model.tree_count_} trees")


    if not model.is_fitted():
        raise RuntimeError("Model not fitted")

except Exception as e:
    print(f"\nTRAINING FAILED: {str(e)}")
    print("Common fixes:")
    raise

y_pred = model.predict(X_test_df)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=3))

macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f"\n F1-Score: {macro_f1:.3f}")

model.save_model("review_pred.cbm")
print("\nModel saved")

In [None]:
from catboost import CatBoostClassifier
import pandas as pd

model = CatBoostClassifier()
model.load_model("review_pred.cbm")

print("Model loaded")
print(f"Model has {model.tree_count_} trees")

In [None]:
def predict_sentiment(text):

    import re
    def clean_text(txt):
        txt = re.sub(r'<[^>]+>', '', str(txt))
        txt = re.sub(r'http\S+', '', txt)
        txt = re.sub(r'[^a-zA-Z\s]', '', txt)
        return txt.lower().strip()

    cleaned = clean_text(text)

    sample_df = pd.DataFrame({'review_text': [cleaned]})

    pred_class = model.predict(sample_df)[0]

    pred_proba = model.predict_proba(sample_df)[0]
    classes = model.classes_
    proba_dict = dict(zip(classes, pred_proba))

    return pred_class, proba_dict


In [None]:

text1 = "This product broke after one day. Complete waste of money. Do not buy!"
pred1, proba1 = predict_sentiment(text1)
print(f"Review: {text1}")
print(f"Predicted: {pred1}")
print(f"Probabilities: {proba1}\n")

text2 = "It works fine. Not amazing, not terrible. Does what it should."
pred2, proba2 = predict_sentiment(text2)
print(f"Review: {text2}")
print(f"Predicted: {pred2}")
print(f"Probabilities: {proba2}\n")

text3 = "Absolutely love this! Best purchase I’ve made all year. Fast shipping and perfect quality!"
pred3, proba3 = predict_sentiment(text3)
print(f"Review: {text3}")
print(f"Predicted: {pred3}")
print(f"Probabilities: {proba3}\n")

In [None]:
test_reviews = [
    "Worst thing I ever bought",
    "Okay for the price",
    "Highly recommend!",
    "Not worth it",
    "Just average",
    "love it! the best broom i've ever used",
    "nothing good about this product",
    "good for the price, but would recommend other options for long term usage",
    "the best product there is on the market",
    "shit",
    "the worst, dont buy, its a replica",
    "meh",
    "fine if you're a broke ass who cant afford anything else",
    "i dont even know where to start with the issues, they are everywhere",
    "fast shipping, good retailer!",
    'good, but with caveats',
    'can recommend',
    'can\'t recommend this product',
    "seller was rude in chat, but other than that i'm satisfied with product overall",
    "okayish, nothing special to be expected for the price",
    "wonderful! solves the problem well",
    "fake advertisement",
    "great",
    "bad",
    "overpriced, look elsewhere",
    "time to market reduced thanks to it!",
    "buggy software, wait a few months for a fix if you want to evade the headache",
    "user experience is lacking",
    "UX is hit or miss, but ok"
]

results = []
for text in test_reviews:
    pred, proba = predict_sentiment(text)
    results.append({
        'text': text,
        'prediction': pred,
        'conf_negative': proba['negative'],
        'conf_neutral': proba['neutral'],
        'conf_positive': proba['positive']
    })


results_df = pd.DataFrame(results)
print(results_df[['text', 'prediction']].to_string(index=False))