In [1]:
import pandas as pd
import numpy as np
from datasets import load_from_disk
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the pre-split corpus from disk and flatten the text of the three
# splits into a single list, in train -> val -> test order. Later cells slice
# the feature table back apart by position, so this order matters.
dataset = load_from_disk("../CLIdata/datasets/cuneiform-spaced-indexed/")
sentences = [
    text
    for split_name in ('train', 'val', 'test')
    for text in dataset[split_name]['text']
]

In [3]:
# Tokenize each sentence on single spaces and collect the vocabulary.
# `bags[i]` is the token list of `sentences[i]`. Empty strings (produced by
# leading, trailing or repeated spaces) are filtered out while the list is
# built instead of being removed one at a time afterwards.
bags = [[token for token in sent.split(' ') if token] for sent in sentences]

# Grow the vocabulary in place; this avoids rebuilding the whole set once per
# sentence.
uniqueTokens = set()
for bag in bags:
    uniqueTokens.update(bag)

In [4]:
# Raw term counts: one dict per sentence, keyed by every vocabulary token
# (tokens absent from the sentence keep a count of 0).
counts = []
for bag in bags:
    row = dict.fromkeys(uniqueTokens, 0)
    for token in bag:
        row[token] += 1
    counts.append(row)

# Calculando TF (frequência do termo em cada sentença)

In [5]:
# Start the TF table as an independent shallow copy of every count row, so
# filling it in does not touch `counts`.
tfDicts = [dict(row) for row in counts]

In [6]:
# Term frequency: occurrences of a token divided by the number of tokens in
# the sentence. Values are always read from `counts`, so re-running this cell
# gives the same result.
for tf_row, count_row, bag in zip(tfDicts, counts, bags):
    n_tokens = len(bag)
    for key, n_occurrences in count_row.items():
        # A sentence with no tokens (empty or whitespace-only text) has no
        # term frequencies; store 0.0 instead of dividing by zero.
        tf_row[key] = n_occurrences / n_tokens if n_tokens else 0.0

In [7]:
# Sanity check: print the non-zero term frequencies of the first sentence.
dic = tfDicts[0]
nonzero_keys = [key for key in dic if dic[key] != 0]
for key in nonzero_keys:
    print(f'{key}:{dic[key]}')

𒀸:0.3333333333333333
𒆠:0.3333333333333333
𒋢:0.3333333333333333


# Calculando IDF (frequência inversa de documento)

In [8]:
# Independent shallow copy of every count row.
# NOTE(review): no later visible cell reads `idfDicts`; it looks like a
# leftover and costs one extra copy of the full count table. Confirm before
# removing.
idfDicts = [dict(row) for row in counts]

In [9]:
# Inverse document frequency: idf[token] = log10(N / df[token]), where N is
# the number of sentences and df[token] is the number of sentences that
# contain the token at least once.
#
# NOTE(review): `sentences` holds the train, val and test text together, so
# the IDF weights are estimated on the test split too; confirm this is
# intended.
num_sents = len(sentences)

# Count each sentence once per distinct token. Membership is decided on the
# token lists in `bags`, not on the raw sentence string, so a token is never
# matched merely because it occurs as a substring of a longer token.
doc_freq = dict.fromkeys(uniqueTokens, 0)
for bag in bags:
    for token in set(bag):
        doc_freq[token] += 1

# Every vocabulary token comes from some bag, so no document frequency is 0
# and the division is always defined.
idf = {token: np.log10(num_sents / n_docs) for token, n_docs in doc_freq.items()}

# Juntando tudo: TF-IDF = TF × IDF

In [10]:
# TF-IDF per sentence: multiply each term frequency by the token's IDF.
# Each row keeps the full vocabulary as keys, in the same order as the TF rows.
tfidf = [
    {key: tf_value * idf[key] for key, tf_value in tf_row.items()}
    for tf_row in tfDicts
]

In [11]:
# Sanity check: print the non-zero TF-IDF weights of the first sentence.
# Keys are walked in the order of the first TF row.
dic = tfidf[0]
for key in tfDicts[0]:
    if dic[key] == 0:
        continue
    print(f'{key}:{dic[key]}')

𒀸:0.2887161556012158
𒆠:0.3038107944351745
𒋢:0.46026104855171657


In [12]:
# Let pandas render up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100

In [13]:
# Feature table: one row per sentence, one column per vocabulary token.
# Every row dict in `tfidf` carries the full vocabulary as keys.
df = pd.DataFrame(tfidf)

In [14]:
# Attach the class labels as a 'labels' column, concatenated in the same
# train -> val -> test order as the sentences the rows were built from.
df['labels'] = [
    label
    for split_name in ('train', 'val', 'test')
    for label in dataset[split_name]['label']
]

In [15]:
# Recover the original splits from the feature table by row position. Rows
# were built in train -> val -> test order, so each split is a contiguous range.
n_train = len(dataset['train']['text'])
n_val = len(dataset['val']['text'])
n_test = len(dataset['test']['text'])
assert n_train + n_val + n_test == len(df), "split sizes do not add up to the feature table"

train = df.iloc[:n_train]
val = df.iloc[n_train:n_train + n_val]
train_val = df.iloc[:n_train + n_val]
# The upper bound is the test split's own size. Using the train+val size here
# only works when the slice happens to be clamped at the end of the table.
test = df.iloc[n_train + n_val:n_train + n_val + n_test]

# Treinamento e avaliação

In [16]:
# Fit the classifier on train+val: every column except the last is a feature,
# the last column is the label.
# NOTE(review): despite its name, `log_reg` holds a RandomForestClassifier.
# The name is kept because the prediction cell refers to it.
X_train_val = train_val.iloc[:, :-1]
y_train_val = train_val.iloc[:, -1]
log_reg = RandomForestClassifier(random_state=1)
log_reg.fit(X_train_val, y_train_val)

In [17]:
# Predict labels for the test rows from every column except the last one.
test_features = test.iloc[:, :-1]
pred = log_reg.predict(test_features)

In [18]:
# Ground-truth values for the test rows, taken from the last column of the
# table (presumably the 'labels' column assigned to df earlier; verify that
# it is still the last column).
target = test.iloc[:,-1]

In [19]:
# Test-set metrics. Accuracy takes no averaging mode; the other three scores
# use the mode stored in `average`.
# NOTE(review): the saved output under this cell shows Recall equal to
# Accuracy, and the single-metric cells further down show different
# F1/precision/recall values. The saved outputs look stale; re-run from a
# fresh kernel to confirm.
average = 'macro'
print("Accuracy:",accuracy_score(target,pred))
for caption, scorer in (
    ("F1-score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(caption, scorer(target, pred, average=average))

Accuracy: 0.566497461928934
F1-score: 0.550696463798644
Precision: 0.672381689416739
Recall: 0.566497461928934


In [20]:
# Averaging mode passed to the F1 / precision / recall calls in the cells
# below. NOTE(review): this repeats the assignment already made in the
# metrics-summary cell.
average = 'macro'

In [21]:
# Accuracy of `pred` against the last column of the test table.
accuracy_score(y_true=test.iloc[:, -1], y_pred=pred)

0.566497461928934

In [86]:
# F1 of `pred` against the last column of the test table, using `average`.
f1_score(y_true=test.iloc[:, -1], y_pred=pred, average=average)

0.44344617298626643

In [87]:
# Precision of `pred` against the last column of the test table, using `average`.
precision_score(y_true=test.iloc[:, -1], y_pred=pred, average=average)

0.5385905893909128

In [88]:
# Recall of `pred` against the last column of the test table, using `average`.
recall_score(y_true=test.iloc[:, -1], y_pred=pred, average=average)

0.4786076867295142