In [1]:
import pandas as pd
# Spreadsheet path, relative to the notebook's working directory.
ADU_XLSX_PATH = 'OpArticles/OpArticles_ADUs.xlsx'
# Load the sheet into a DataFrame; later cells read its "tokens" and "label" columns.
data_ADU = pd.read_excel(ADU_XLSX_PATH)

In [2]:
import re
from nltk.stem import RSLPStemmer
from nltk.corpus import stopwords

# Portuguese stop-word list from NLTK.
# NOTE: this rebinds `stopwords` from the imported corpus reader to a plain list,
# so `stopwords.words(...)` is no longer callable below without re-importing.
stopwords = stopwords.words("portuguese")
# Set copy of the same words: membership tests in the loop below become O(1)
# instead of a linear scan of the list for every single word.
stopword_set = set(stopwords)
stemmer = RSLPStemmer()

# Everything that is not an ASCII letter or a character in U+00C0-U+00FF
# (the accented Latin-1 range, which also happens to contain the signs × and ÷)
# is replaced by a space. Compiled once, outside the loop.
non_letter = re.compile('[^a-zA-Z\u00C0-\u00ff]')

corpus = []
for token in data_ADU["tokens"]:
    # Despite its name, `token` is one whole string from the "tokens" column;
    # it is lower-cased, stripped of non-letters and split into words here.
    words = non_letter.sub(' ', token.lower()).split()
    # Stem every word that survives stop-word removal. "não" is kept on purpose
    # even when it is listed as a stop word, so negation is not lost.
    c_tk = " ".join(stemmer.stem(w) for w in words if w == "não" or w not in stopword_set)
    corpus.append(c_tk)

# Quick sanity check: first three cleaned documents and the corpus size.
print(corpus[:3])
print(len(corpus))

['fact não apen frut ignor', 'hav hum jorn investig preocup aprofund contextual histór isenç relat preocup soc urg denunci muit peç real jorn', 'tud cómic fif']
16743


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def bowModel(corpus, dense=True):
    """Encode ``corpus`` as a bag-of-words count matrix.

    A fresh ``CountVectorizer`` with default settings is fitted on ``corpus``
    and immediately used to transform it. The fitted vectorizer is discarded,
    so the learned vocabulary is not available to the caller afterwards.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize.
    dense : bool, default True
        When True (the original behaviour) the result is converted with
        ``.toarray()``. Pass False to receive the matrix exactly as
        ``fit_transform`` returns it, which avoids allocating a full
        documents x vocabulary array for large corpora.

    Returns
    -------
    Document-term matrix with one row per document: the ``.toarray()``
    result when ``dense`` is True, otherwise the raw ``fit_transform`` result.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(corpus)
    return counts.toarray() if dense else counts

def bigramModel(corpus, dense=True):
    """Encode ``corpus`` as a bigram count matrix.

    A fresh ``CountVectorizer`` restricted to ``ngram_range=(2, 2)`` is fitted
    on ``corpus`` and immediately used to transform it. The fitted vectorizer
    is discarded after the call.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize.
    dense : bool, default True
        When True (the original behaviour) the result is converted with
        ``.toarray()``. Pass False to receive the matrix exactly as
        ``fit_transform`` returns it, which avoids allocating a full
        documents x vocabulary array for large corpora.

    Returns
    -------
    Document-term matrix with one row per document: the ``.toarray()``
    result when ``dense`` is True, otherwise the raw ``fit_transform`` result.
    """
    vectorizer = CountVectorizer(ngram_range=(2,2))
    counts = vectorizer.fit_transform(corpus)
    return counts.toarray() if dense else counts

def tfidfModel(corpus, dense=True):
    """Encode ``corpus`` as a TF-IDF matrix using default vectorizer settings.

    A fresh ``TfidfVectorizer`` is fitted on ``corpus`` and immediately used
    to transform it. The fitted vectorizer is discarded after the call.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize.
    dense : bool, default True
        When True (the original behaviour) the result is converted with
        ``.toarray()``. Pass False to receive the matrix exactly as
        ``fit_transform`` returns it, which avoids allocating a full
        documents x vocabulary array for large corpora.

    Returns
    -------
    Document-term matrix with one row per document: the ``.toarray()``
    result when ``dense`` is True, otherwise the raw ``fit_transform`` result.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(corpus)
    return weights.toarray() if dense else weights

def tfidfBiModel(corpus, dense=True):
    """Encode ``corpus`` as a TF-IDF matrix over bigrams only.

    A fresh ``TfidfVectorizer`` restricted to ``ngram_range=(2, 2)`` is fitted
    on ``corpus`` and immediately used to transform it. The fitted vectorizer
    is discarded after the call.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize.
    dense : bool, default True
        When True (the original behaviour) the result is converted with
        ``.toarray()``. Pass False to receive the matrix exactly as
        ``fit_transform`` returns it, which avoids allocating a full
        documents x vocabulary array for large corpora.

    Returns
    -------
    Document-term matrix with one row per document: the ``.toarray()``
    result when ``dense`` is True, otherwise the raw ``fit_transform`` result.
    """
    vectorizer = TfidfVectorizer(ngram_range=(2,2))
    weights = vectorizer.fit_transform(corpus)
    return weights.toarray() if dense else weights

def tfidfUniBiModel(corpus, dense=True):
    """Encode ``corpus`` as a TF-IDF matrix over unigrams and bigrams.

    A fresh ``TfidfVectorizer`` with ``ngram_range=(1, 2)`` is fitted on
    ``corpus`` and immediately used to transform it. The fitted vectorizer is
    discarded after the call.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize.
    dense : bool, default True
        When True (the original behaviour) the result is converted with
        ``.toarray()``. Pass False to receive the matrix exactly as
        ``fit_transform`` returns it, which avoids allocating a full
        documents x vocabulary array for large corpora.

    Returns
    -------
    Document-term matrix with one row per document: the ``.toarray()``
    result when ``dense`` is True, otherwise the raw ``fit_transform`` result.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1,2))
    weights = vectorizer.fit_transform(corpus)
    return weights.toarray() if dense else weights

def tfidfTriModel(corpus, dense=True):
    """Encode ``corpus`` as a TF-IDF matrix over trigrams only.

    A fresh ``TfidfVectorizer`` restricted to ``ngram_range=(3, 3)`` is fitted
    on ``corpus`` and immediately used to transform it. The fitted vectorizer
    is discarded after the call.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize.
    dense : bool, default True
        When True (the original behaviour) the result is converted with
        ``.toarray()``. Pass False to receive the matrix exactly as
        ``fit_transform`` returns it, which avoids allocating a full
        documents x vocabulary array for large corpora.

    Returns
    -------
    Document-term matrix with one row per document: the ``.toarray()``
    result when ``dense`` is True, otherwise the raw ``fit_transform`` result.
    """
    vectorizer = TfidfVectorizer(ngram_range=(3,3))
    weights = vectorizer.fit_transform(corpus)
    return weights.toarray() if dense else weights

In [4]:
from sklearn.model_selection import train_test_split

def simpleSplit(X, y, test_size = 0.2, random_state = 0):
    """Print the shapes of ``X`` and ``y``, then make a stratified train/test split.

    Parameters
    ----------
    X : object exposing ``.shape``
        Feature matrix to split.
    y : object exposing ``.shape``
        Labels to split; also passed as ``stratify`` so the split is
        stratified on these same values.
    test_size : default 0.2
        Forwarded unchanged to ``train_test_split``.
    random_state : default 0
        Seed forwarded to ``train_test_split``. The default reproduces the
        previously hard-coded value, so existing calls yield the same split;
        pass another value to draw a different (still reproducible) split.

    Returns
    -------
    Whatever ``train_test_split`` returns for two input arrays; callers in
    this notebook unpack it as ``X_train, X_test, y_train, y_test``.
    """
    print(f"X -> {X.shape} | y -> {y.shape}")
    return train_test_split(X, y, test_size = test_size, random_state = random_state, stratify=y)

In [5]:
from sklearn.naive_bayes import ComplementNB

# Trigram TF-IDF features -> stratified train/test split. The feature matrix is
# handed straight to the split so no extra name keeps it referenced afterwards.
X_train, X_test, y_train, y_test = simpleSplit(
    tfidfTriModel(corpus),
    data_ADU.label,
)
print("SPLIT!")

# NOTE: despite the "GNB" in the variable name, this is a ComplementNB classifier.
clfGNB = ComplementNB()
print("CREATE!")

clfGNB.fit(X_train, y_train)
print("FIT!")

y_pred = clfGNB.predict(X_test)
print("PREDICT!")
print(y_pred)

# Release the references to the split arrays so their memory can be reclaimed.
X_train = X_test = y_train = y_test = None

X -> (16743, 62278) | y -> (16743,)
SPLIT!
CREATE!
FIT!
PREDICT!
['Value' 'Value(-)' 'Policy' ... 'Policy' 'Value(+)' 'Fact']


In [6]:
from sklearn.naive_bayes import ComplementNB

# Bigram count features -> stratified train/test split. The feature matrix is
# handed straight to the split so no extra name keeps it referenced afterwards.
X_train, X_test, y_train, y_test = simpleSplit(
    bigramModel(corpus),
    data_ADU.label,
)

# Train a ComplementNB classifier on the training part and predict the held-out part.
clfNB = ComplementNB()
clfNB.fit(X_train, y_train)
y_pred = clfNB.predict(X_test)
print(y_pred)

# Release the references to the split arrays so their memory can be reclaimed.
X_train = X_test = y_train = y_test = None

X -> (16743, 61558) | y -> (16743,)
['Value' 'Value(-)' 'Policy' ... 'Policy' 'Value(+)' 'Fact']
