# Classifying positive/negative comments

In [48]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
import spacy
#!python3 -m spacy download pt_core_news_sm

In [49]:
# Reading files
path = '/Users/irina/Desktop/Brazilian_E-Commerce_by_Olist/'
files = ['order_items', 'order_reviews']
for file in files:
    file_path = path + file + '.csv' 
    globals()[file] = pd.read_csv(file_path)
# Leaving the necessary columns
order_items=order_items[['order_id', 'product_id', 'seller_id']]
order_scores = order_reviews[['order_id', 'review_score']]
# Merging tables
merged = pd.merge(order_items, order_scores, how = 'left', left_on = 'order_id', right_on = 'order_id' )

In [50]:
# Leaving the necessary columns
reviews = order_reviews[['review_comment_message', 'review_score']]
# Deleting reviews with no text comment
reviews = reviews.dropna()

As the data is not labled, I'll try using score to lable it to be able to train classification model.

In [51]:
# Creating lables for review data depending on the score. 1-2: negative, 3: neutral, 4-5: positive
def label(row):
        if row['review_score'] > 3:
            return 'positive'
        elif row['review_score'] == 3:
            return 'neutral'
        else: 
            return 'negative'
reviews['label'] = reviews.apply(label, axis = 1 )

In [52]:
# Encoding lables
label_encoder = LabelEncoder()
reviews['enc_label'] = label_encoder.fit_transform(reviews['label'])

In [53]:
reviews.head()

Unnamed: 0,review_comment_message,review_score,label,enc_label
3,Recebi bem antes do prazo estipulado.,5,positive,2
4,Parabéns lojas lannister adorei comprar pela I...,5,positive,2
9,aparelho eficiente. no site a marca do aparelh...,4,positive,2
12,"Mas um pouco ,travando...pelo valor ta Boa.\r\n",4,positive,2
15,"Vendedor confiável, produto ok e entrega antes...",5,positive,2


In [54]:
# Splitting dataset
X, y = reviews['review_comment_message'], reviews['enc_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
X_train

83597              entrega rapida atendimento maravilhoso 
32774             Sempre comprei e fui muito bem atendido.
74444    Ótimo meu produto foi entregue antes do prazo ...
7788     Facilidade para uso do app, facilidade de paga...
95842    Recebi o produto correto e antes mesmo da data...
                               ...                        
15109           Super interessante, chegou antes do prazo.
27007                           Ótima loja, ótimo produto.
92436    Achei que não receberia a tempo de mudar mas c...
2000     Primeira vez que compro pela loja virtual e nã...
38110    Na embalagem não vieram os parafusos de fixaçã...
Name: review_comment_message, Length: 32781, dtype: object

In [56]:
# Tokenizing the text, converting to lowercase, lemmatizing, removing stopwords
stop_words = set(stopwords.words('portuguese'))

def preprocess_text(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if token not in stop_words]
    return ' '.join(lemmas) 

X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

TF-IDF is good for text classification tasks because it emphasizes important words that are rare, helping classifiers focus on meaningful features for distinguishing between classes.

In [57]:
tfidf_vectorizer = TfidfVectorizer()

Let's try different models (Logistic Regression, Decision Tree, Random Forest) and use GridSearchCV to find better parameters.

In [35]:
# Initialiing classifiers, creating pipelines
lr_model = LogisticRegression(max_iter=1000)
dt_model = DecisionTreeClassifier()
rf_model = RandomForestClassifier()
lr_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', lr_model)
])

dt_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', dt_model)
])

rf_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', rf_model)
])

In [36]:
# Parameter grids for GridSearchCV
lr_param_grid = {
    'tfidf__max_features': [1000, 2000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [0.1, 1.0, 10.0]
}

dt_param_grid = {
    'tfidf__max_features': [1000, 2000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__max_depth': [None, 10, 20]
}

rf_param_grid = {
    'tfidf__max_features': [1000, 2000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20]
}

In [37]:
lr_grid_search = GridSearchCV(lr_pipeline, lr_param_grid, cv=5, verbose=2)
dt_grid_search = GridSearchCV(dt_pipeline, dt_param_grid, cv=5, verbose=2)
rf_grid_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, verbose=2)

In [38]:
# Fiting the models
lr_grid_search.fit(X_train, y_train)
dt_grid_search.fit(X_train, y_train)
rf_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 1); total time=   2.5s
[CV] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 1); total time=   0.5s
[CV] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   0.7s
[CV] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   0.7s
[CV] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   0.7s
[CV] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   0.7s
[CV] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   0.7s
[CV] END 

[CV] END clf__max_depth=10, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   0.6s
[CV] END clf__max_depth=10, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   0.6s
[CV] END clf__max_depth=10, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   0.6s
[CV] END clf__max_depth=10, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   0.6s
[CV] END clf__max_depth=10, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   0.6s
[CV] END clf__max_depth=10, tfidf__max_features=2000, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END clf__max_depth=10, tfidf__max_features=2000, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END clf__max_depth=10, tfidf__max_features=2000, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END clf__max_depth=10, tfidf__max_features=2000, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END clf__max_depth=10, tfidf__max_features=2000, tfidf__ngram_range=(1, 1); total time=   0.4s


[CV] END clf__max_depth=None, clf__n_estimators=200, tfidf__max_features=2000, tfidf__ngram_range=(1, 2); total time=  30.2s
[CV] END clf__max_depth=None, clf__n_estimators=200, tfidf__max_features=2000, tfidf__ngram_range=(1, 2); total time=  30.5s
[CV] END clf__max_depth=10, clf__n_estimators=100, tfidf__max_features=1000, tfidf__ngram_range=(1, 1); total time=   0.9s
[CV] END clf__max_depth=10, clf__n_estimators=100, tfidf__max_features=1000, tfidf__ngram_range=(1, 1); total time=   0.9s
[CV] END clf__max_depth=10, clf__n_estimators=100, tfidf__max_features=1000, tfidf__ngram_range=(1, 1); total time=   0.8s
[CV] END clf__max_depth=10, clf__n_estimators=100, tfidf__max_features=1000, tfidf__ngram_range=(1, 1); total time=   0.8s
[CV] END clf__max_depth=10, clf__n_estimators=100, tfidf__max_features=1000, tfidf__ngram_range=(1, 1); total time=   0.9s
[CV] END clf__max_depth=10, clf__n_estimators=100, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   1.1s
[CV] END clf

[CV] END clf__max_depth=20, clf__n_estimators=200, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   3.5s
[CV] END clf__max_depth=20, clf__n_estimators=200, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   3.6s
[CV] END clf__max_depth=20, clf__n_estimators=200, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   3.6s
[CV] END clf__max_depth=20, clf__n_estimators=200, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   3.5s
[CV] END clf__max_depth=20, clf__n_estimators=200, tfidf__max_features=1000, tfidf__ngram_range=(1, 2); total time=   3.6s
[CV] END clf__max_depth=20, clf__n_estimators=200, tfidf__max_features=2000, tfidf__ngram_range=(1, 1); total time=   2.5s
[CV] END clf__max_depth=20, clf__n_estimators=200, tfidf__max_features=2000, tfidf__ngram_range=(1, 1); total time=   2.5s
[CV] END clf__max_depth=20, clf__n_estimators=200, tfidf__max_features=2000, tfidf__ngram_range=(1, 1); total time=   2.5s
[CV] END clf__ma

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf', RandomForestClassifier())]),
             param_grid={'clf__max_depth': [None, 10, 20],
                         'clf__n_estimators': [100, 200],
                         'tfidf__max_features': [1000, 2000],
                         'tfidf__ngram_range': [(1, 1), (1, 2)]},
             verbose=2)

In [41]:
print("\nLogistic Regression - Best parameters:")
print(lr_grid_search.best_params_)
best_lr_model = lr_grid_search.best_estimator_

print("\nDecision Tree - Best parameters:")
print(dt_grid_search.best_params_)
best_dt_model = dt_grid_search.best_estimator_

print("\nRandom Forest - Best parameters:")
print(rf_grid_search.best_params_)
best_rf_model = rf_grid_search.best_estimator_


Logistic Regression - Best parameters:
{'clf__C': 1.0, 'tfidf__max_features': 2000, 'tfidf__ngram_range': (1, 2)}

Decision Tree - Best parameters:
{'clf__max_depth': 20, 'tfidf__max_features': 2000, 'tfidf__ngram_range': (1, 2)}

Random Forest - Best parameters:
{'clf__max_depth': None, 'clf__n_estimators': 200, 'tfidf__max_features': 2000, 'tfidf__ngram_range': (1, 1)}


In [42]:
# Predicting and evaluating
lr_y_pred = best_lr_model.predict(X_test)
lr_y_pred_proba = best_lr_model.predict_proba(X_test)
dt_y_pred = best_dt_model.predict(X_test)
dt_y_pred_proba = best_dt_model.predict_proba(X_test)
rf_y_pred = best_rf_model.predict(X_test)
rf_y_pred_proba = best_rf_model.predict_proba(X_test)

accuracy_lr = accuracy_score(y_test, lr_y_pred)
accuracy_dt = accuracy_score(y_test, dt_y_pred)
accuracy_rf = accuracy_score(y_test, rf_y_pred)

lr_roc_auc = roc_auc_score(y_test, lr_y_pred_proba, multi_class='ovr', average='macro')
dt_roc_auc = roc_auc_score(y_test, dt_y_pred_proba, multi_class='ovr', average='macro')
rf_roc_auc = roc_auc_score(y_test, rf_y_pred_proba, multi_class='ovr', average='macro')

print(f"\nLogistic Regression ROC-AUC Score:{lr_roc_auc:.4f}")
print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")

print(f"\nDecision Tree ROC-AUC Score:{dt_roc_auc:.4f}")
print(f"Decision Tree Accuracy: {accuracy_dt:.4f}")

print(f"\nRandom Forest ROC-AUC Score:{rf_roc_auc:.4f}")
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")


Logistic Regression ROC-AUC Score:0.8811
Logistic Regression Accuracy: 0.8349

Decision Tree ROC-AUC Score:0.7728
Decision Tree Accuracy: 0.7744

Random Forest ROC-AUC Score:0.8587
Random Forest Accuracy: 0.8283


Logistic Regression is showing best results. Let's use it to make a function that takes a text comment as input and classifies the semantics as positive, negative or neutral.

In [43]:
def review_semantics(review, model):
    
    """
    Predicts the sentiment of a given review text using the provided model pipeline.
    Input: str type review text.
    Output: The sentiment label of the review text ('positive', 'neutral', or 'negative')
    """
    
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('portuguese'))

    # Preprocessing the input review
    review = review.lower()
    review = re.sub(r'\W', ' ', review) 
    review = re.sub(r'\s+', ' ', review)
    tokens = review.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    text = ' '.join(tokens)

    # Predicting sentiment using the provided model
    text_series = pd.Series([text])
    review_pred = model.predict(text_series)

    # Returning sentiment label based on prediction
    if review_pred == 2:
        return 'positive'
    elif review_pred == 1:
        return 'neutral'
    else:
        return 'negative'

In [44]:
# Trying the function by feeding it review_comment_title of 5-score review
comment = 'Super recomendo' # translation: I highly recommend
review_semantics(comment, best_lr_model)

'positive'

In [45]:
# Trying the function by feeding it review_comment_title of 1-score review
comment = 'Não chegou meu produto '  # translation: My product didn't arrive
review_semantics(comment, best_lr_model)

'negative'