In [450]:
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [451]:
nltk.download('stopwords')
stop_words = set(stopwords.words('portuguese'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\e-joaom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [452]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [453]:
df = pd.read_csv('C:\\Users\\e-joaom\\Downloads\\Brazilian News Database.csv', delimiter=';', encoding="Windows-1252")
df = df[df['status'] != 'Neutral'].reset_index(drop=True)
df['is_otimism'] = df['status'].map({'Otimism': 1, 'Fear': 0})
df.drop(columns=['status'], inplace=True)

In [454]:
from imblearn.over_sampling import RandomOverSampler

x = df[['title']]
y = df['is_otimism']

OverS = RandomOverSampler(random_state=123)
X_Over, Y_Over = OverS.fit_resample(x, y)

In [455]:
import plotly.express as px

fig = px.histogram(y.astype(bool))
fig.show()

In [456]:
fig = px.histogram(Y_Over.astype(bool))
fig.show()

In [457]:
title = [preprocess_text(text) for text in X_Over['title']]

In [458]:
vectorizer = CountVectorizer(min_df=0.016, max_df=0.5, lowercase=True)
vectorizer.fit(title)
vectorizer.transform(title).toarray()

len(vectorizer.vocabulary_)

118

In [459]:
sentences = X_Over['title'].values
labels = Y_Over.values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2, random_state=123)

In [460]:
vectorizer = CountVectorizer(min_df=0.0, max_df=0.5, lowercase=True)
vectorizer.fit(sentences_train)

x_train = vectorizer.transform(sentences_train).toarray()
x_test = vectorizer.transform(sentences_test).toarray()

print(f'Number of words considered: {len(vectorizer.vocabulary_)}')

print(f'Number of train texts: {len(x_train)}')
print(f'Number of test texts: {len(x_test)}')

Number of words considered: 744
Number of train texts: 209
Number of test texts: 53


In [461]:
accuracies = []
errors = []

models = ['Logistic Regression', 'Random Forest', 'Bernoulli Naive Bayes', 'SVC', 'KNN']

In [462]:
from sklearn.metrics import accuracy_score, root_mean_squared_error
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression

param_grid = {'C': [0.01, 0.1, 1, 10, 100], 
              'penalty': ['l1', 'l2'],
              'solver': ['liblinear', 'saga']}

grid = GridSearchCV(LogisticRegression(random_state=123), param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid.fit(x_train, y_train)

best_LR_model = grid.best_estimator_
y_pred = best_LR_model.predict(x_test)
print(best_LR_model)

accuracy = accuracy_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Model RMSE: {rmse:.4f}')

accuracies.append(accuracy)
errors.append(rmse)

LogisticRegression(C=10, random_state=123, solver='liblinear')
Model Accuracy: 0.9434
Model RMSE: 0.2379


In [463]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid = GridSearchCV(RandomForestClassifier(random_state=123), param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid.fit(x_train, y_train)

best_RF_model = grid.best_estimator_
y_pred = best_RF_model.predict(x_test)
print(best_RF_model)

accuracy = accuracy_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Model RMSE: {rmse:.4f}')

accuracies.append(accuracy)
errors.append(rmse)

RandomForestClassifier(random_state=123)
Model Accuracy: 0.9057
Model RMSE: 0.3071


In [464]:
from sklearn.naive_bayes import BernoulliNB

param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'binarize': [0, 0.5, 1]
}

grid = GridSearchCV(BernoulliNB(), param_grid, cv=2, scoring='accuracy')
grid.fit(x_train, y_train)

best_NB_model = grid.best_estimator_
y_pred = best_NB_model.predict(x_test)
print(best_NB_model)

accuracy = accuracy_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Model RMSE: {rmse:.4f}')

accuracies.append(accuracy)
errors.append(rmse)

BernoulliNB(alpha=1, binarize=0)
Model Accuracy: 0.9245
Model RMSE: 0.2747


In [465]:
from sklearn.svm import SVC

param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid = GridSearchCV(SVC(random_state=123), param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid.fit(x_train, y_train)

best_SVC_model = grid.best_estimator_
y_pred = best_SVC_model.predict(x_test)
print(best_SVC_model)

accuracy = accuracy_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Model RMSE: {rmse:.4f}')

accuracies.append(accuracy)
errors.append(rmse)

SVC(C=1, random_state=123)
Model Accuracy: 0.9057
Model RMSE: 0.3071


In [466]:
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'metric': ['euclidean', 'manhattan', 'cosine'],
    'weights': ['uniform', 'distance']
}

grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid.fit(x_train, y_train)

best_KNN_model = grid.best_estimator_
y_pred = best_KNN_model.predict(x_test)
print(best_KNN_model)

accuracy = accuracy_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Model RMSE: {rmse:.4f}')

accuracies.append(accuracy)
errors.append(rmse)

KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance')
Model Accuracy: 0.9057
Model RMSE: 0.3071


In [467]:
accuracies = [round(accuracy, 4) for accuracy in accuracies]
errors = [round(rmse, 4) for rmse in errors]

df_metrics = pd.DataFrame({'Accuracy': accuracies, 'RMSE': errors}, index=models).sort_values(by='Accuracy', ascending=False)
df_metrics

Unnamed: 0,Accuracy,RMSE
Logistic Regression,0.9434,0.2379
Bernoulli Naive Bayes,0.9245,0.2747
Random Forest,0.9057,0.3071
SVC,0.9057,0.3071
KNN,0.9057,0.3071


In [468]:
input_text = 'Varejo tem queda mensal de 0,9% em vendas em fevereiro, diz Stone'
print(input_text)

input_text = preprocess_text(input_text)
print(input_text)

input_text = vectorizer.transform([input_text]).toarray()
print(input_text)

best_LR_model.predict(input_text)[0]

Varejo tem queda mensal de 0,9% em vendas em fevereiro, diz Stone
varejo queda mensal 0 9 vendas fevereiro diz stone
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0

In [469]:
words_considered_position = np.where(input_text == 1)[1]

print('Words considereds by the model:')
for word_position in words_considered_position:
    print(vectorizer.get_feature_names_out()[word_position])


Words considereds by the model:
diz
fevereiro
queda
varejo
vendas
