In [174]:
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [175]:
nltk.download('stopwords')
stop_words = set(stopwords.words('portuguese'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\e-joaom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [176]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [177]:
df = pd.read_csv('C:\\Users\\e-joaom\\Downloads\\Brazilian News Database.csv', delimiter=';', encoding="Windows-1252")
df = df[df['status'] != 'Neutral'].reset_index(drop=True)
df['is_otimism'] = df['status'].map({'Otimism': 1, 'Fear': 0})
df.drop(columns=['status'], inplace=True)

In [178]:
df["date"] = pd.to_datetime(df["date"], format="%d/%m/%Y")
df

Unnamed: 0,date,journal,title,text,is_otimism
0,2025-01-22,CNN Brasil,Construir nova refinaria da Petrobras é questã...,O Brasil precisa de uma nova refinaria de petr...,0
1,2025-01-22,CNN Brasil,"Proporção de famílias endividadas cai a 76,7% ...",As famílias brasileiras ficaram menos inadimpl...,1
2,2025-01-22,CNN Brasil,Ministro: passagens não devem subir após fusão...,"O ministro de Portos e Aeroportos, Silvio Cost...",1
3,2025-01-22,CNN Brasil,"Superação de renda tira 1,3 milhão de famílias...","Mais de 1,3 milhão de famílias deixaram o Bols...",1
4,2025-01-22,CNN Brasil,3 em cada 5 profissionais planeja buscar novo ...,Os profissionais brasileiros estão em busca de...,1
...,...,...,...,...,...
191,2025-02-17,InfoMoney Brasil,Turistas argentinos aproveitam câmbio valoriza...,A volta das cenas de praias brasileiras lotada...,1
192,2025-02-17,InfoMoney Brasil,Trump anuncia novas tarifas sobre automóveis p...,O presidente Donald Trump afirmou que vai anun...,0
193,2025-02-17,InfoMoney Brasil,Galípolo: BC tem ferramentas para perseguir me...,SÃO PAULO (Reuters) – O presidente do Banco Ce...,1
194,2025-02-17,InfoMoney Brasil,Galípolo: governo Trump pode ser menos prejudi...,"O presidente do Banco Central, Gabriel Galípol...",0


In [179]:
otimism_count_by_date_df = df.groupby(['date', 'is_otimism']).size().reset_index(name='count')
otimism_count_by_date_df['is_otimism'] = otimism_count_by_date_df['is_otimism'].astype(bool)
otimism_count_by_date_df['percentage'] = round(otimism_count_by_date_df['count'] / otimism_count_by_date_df.groupby('date')['count'].transform('sum'), 4)
otimism_count_by_date_df

Unnamed: 0,date,is_otimism,count,percentage
0,2025-01-22,False,8,0.6154
1,2025-01-22,True,5,0.3846
2,2025-01-25,False,9,0.8182
3,2025-01-25,True,2,0.1818
4,2025-01-26,False,9,0.6429
5,2025-01-26,True,5,0.3571
6,2025-02-04,False,8,0.6667
7,2025-02-04,True,4,0.3333
8,2025-02-05,False,8,0.7273
9,2025-02-05,True,3,0.2727


In [180]:
import plotly.express as px

fig = px.bar(otimism_count_by_date_df, x='date', y='percentage', color='is_otimism',
             title='News Distribution by Date',
             labels={'date': 'Date', 'percentage': 'News Distribution'})

fig.show()

In [181]:
from imblearn.over_sampling import RandomOverSampler

x = df[['title']]
y = df['is_otimism']

OverS = RandomOverSampler(random_state=123)
X_Over, Y_Over = OverS.fit_resample(x, y)

In [182]:
fig = px.histogram(y.astype(bool), title='News Distribution - Optimistic or Not', labels={'value': 'Optimistic'})
fig.show()

In [183]:
fig = px.histogram(Y_Over.astype(bool), title='News Distribution - Optimistic or Not', labels={'value': 'Optimistic'})
fig.show()

In [184]:
title = [preprocess_text(text) for text in X_Over['title']]

In [185]:
vectorizer = CountVectorizer(min_df=0.016, max_df=0.5, lowercase=True)
vectorizer.fit(title)
vectorizer.transform(title).toarray()

len(vectorizer.vocabulary_)

118

In [186]:
sentences = X_Over['title'].values
labels = Y_Over.values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2, random_state=123)

In [187]:
vectorizer = CountVectorizer(min_df=0.0, max_df=0.5, lowercase=True)
vectorizer.fit(sentences_train)

x_train = vectorizer.transform(sentences_train).toarray()
x_test = vectorizer.transform(sentences_test).toarray()

print(f'Number of words considered: {len(vectorizer.vocabulary_)}')

print(f'Number of train texts: {len(x_train)}')
print(f'Number of test texts: {len(x_test)}')

Number of words considered: 744
Number of train texts: 209
Number of test texts: 53


In [188]:
accuracies = []
errors = []

models = ['Logistic Regression', 'Random Forest', 'Bernoulli Naive Bayes', 'SVC', 'KNN']

In [189]:
from sklearn.metrics import accuracy_score, root_mean_squared_error
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression

param_grid = {'C': [0.01, 0.1, 1, 10, 100], 
              'penalty': ['l1', 'l2'],
              'solver': ['liblinear', 'saga']}

grid = GridSearchCV(LogisticRegression(random_state=123), param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid.fit(x_train, y_train)

best_LR_model = grid.best_estimator_
y_pred = best_LR_model.predict(x_test)
print(f'{best_LR_model}\n')

accuracy = accuracy_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Model RMSE: {rmse:.4f}')

accuracies.append(accuracy)
errors.append(rmse)

LogisticRegression(C=10, random_state=123, solver='liblinear')

Model Accuracy: 0.9434
Model RMSE: 0.2379


In [190]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid = GridSearchCV(RandomForestClassifier(random_state=123), param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid.fit(x_train, y_train)

best_RF_model = grid.best_estimator_
y_pred = best_RF_model.predict(x_test)
print(f'{best_RF_model}\n')

accuracy = accuracy_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Model RMSE: {rmse:.4f}')

accuracies.append(accuracy)
errors.append(rmse)

RandomForestClassifier(random_state=123)

Model Accuracy: 0.9057
Model RMSE: 0.3071


In [191]:
from sklearn.naive_bayes import BernoulliNB

param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'binarize': [0, 0.5, 1]
}

grid = GridSearchCV(BernoulliNB(), param_grid, cv=2, scoring='accuracy')
grid.fit(x_train, y_train)

best_NB_model = grid.best_estimator_
y_pred = best_NB_model.predict(x_test)
print(f'{best_NB_model}\n')

accuracy = accuracy_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Model RMSE: {rmse:.4f}')

accuracies.append(accuracy)
errors.append(rmse)

BernoulliNB(alpha=1, binarize=0)

Model Accuracy: 0.9245
Model RMSE: 0.2747


In [192]:
from sklearn.svm import SVC

param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid = GridSearchCV(SVC(random_state=123), param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid.fit(x_train, y_train)

best_SVC_model = grid.best_estimator_
y_pred = best_SVC_model.predict(x_test)
print(f'{best_SVC_model}\n')

accuracy = accuracy_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Model RMSE: {rmse:.4f}')

accuracies.append(accuracy)
errors.append(rmse)

SVC(C=1, random_state=123)

Model Accuracy: 0.9057
Model RMSE: 0.3071


In [193]:
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'metric': ['euclidean', 'manhattan', 'cosine'],
    'weights': ['uniform', 'distance']
}

grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid.fit(x_train, y_train)

best_KNN_model = grid.best_estimator_
y_pred = best_KNN_model.predict(x_test)
print(f'{best_KNN_model}\n')

accuracy = accuracy_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Model RMSE: {rmse:.4f}')

accuracies.append(accuracy)
errors.append(rmse)

KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance')

Model Accuracy: 0.9057
Model RMSE: 0.3071


In [194]:
accuracies = [round(accuracy, 4) for accuracy in accuracies]
errors = [round(rmse, 4) for rmse in errors]

df_metrics = pd.DataFrame({'Accuracy': accuracies, 'RMSE': errors}, index=models).sort_values(by='Accuracy', ascending=False)
df_metrics

Unnamed: 0,Accuracy,RMSE
Logistic Regression,0.9434,0.2379
Bernoulli Naive Bayes,0.9245,0.2747
Random Forest,0.9057,0.3071
SVC,0.9057,0.3071
KNN,0.9057,0.3071


In [195]:
input_text = 'Varejo tem queda mensal de 0,9% em vendas em fevereiro, diz Stone'
print(f'{input_text}\n')

input_text = preprocess_text(input_text)
print(print(f'{input_text}\n'))

input_text = vectorizer.transform([input_text]).toarray()
print(f'{input_text}\n')

if best_LR_model.predict(input_text)[0] == 1:
    print('Optimistic')
else:
    print('Pessimistic')

Varejo tem queda mensal de 0,9% em vendas em fevereiro, diz Stone

varejo queda mensal 0 9 vendas fevereiro diz stone

None
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [196]:
words_considered_position = np.where(input_text == 1)[1]

print('Words considereds by the model:\n')

for word_position in words_considered_position:
    print(vectorizer.get_feature_names_out()[word_position])


Words considereds by the model:

diz
fevereiro
queda
varejo
vendas
