# EXTRAGERE CARACTERISTICI DIN TEXTE

## TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfIdf(trainingInput, validationInput, maxNumberOfFeatures=50):
    v = TfidfVectorizer(max_features=maxNumberOfFeatures)
    featuresForTraining = v.fit_transform(trainingInput)
    featuresForValidation = v.transform(validationInput)
    return featuresForTraining, featuresForValidation


## Bag of Words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

def bagOfWords(trainingInput, validationInput):
    v = CountVectorizer()
    featuresForTraining = v.fit_transform(trainingInput)
    featuresForValidation = v.transform(validationInput)
    return featuresForTraining, featuresForValidation


## ALTE CARACTERISTICI

## Bert

In [8]:
from transformers import BertTokenizer, BertModel
import torch

def bertEmbeddings(trainingInput):
    allEmbeddings = []

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", force_download=True)
    model = BertModel.from_pretrained('bert-base-uncased', force_download=True)

    for input in trainingInput:
        input_ids = tokenizer.encode(input, add_special_tokens=True, max_length=128, truncation=True, return_tensors='pt')

        with torch.no_grad():
            outputs = model(input_ids)
            contextualEmbeddings = outputs.last_hidden_state

        allEmbeddings.append(contextualEmbeddings)
    return allEmbeddings


In [10]:
initial = ["By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement.."]
embedding = bertEmbeddings(initial)

for e,sentence in zip(embedding, initial):
    print(sentence)
    print(e)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement..
tensor([[[ 0.2849,  0.1456, -0.3721,  ...,  0.0334,  0.6419, -0.2261],
         [ 0.0092,  0.3518, -0.6721,  ..., -0.2276,  0.3777,  0.6179],
         [-0.0589,  0.1439, -0.2848,  ..., -0.3173,  0.0623,  0.2125],
         ...,
         [ 0.4903, -0.6535,  0.4232,  ...,  0.5028,  0.4519, -0.6911],
         [ 0.1742, -0.7943,  0.3039,  ...,  0.4251,  0.2958, -0.8385],
         [-0.2658,  0.3544, -0.0851,  ...,  0.6068, -0.6796, -0.6546]]])


## Stemming - taie sufixele de la cuvinte


In [12]:
import nltk
nltk.download('punkt')

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def stemming(trainingInput, validationInput):
    trainingTokens = [word_tokenize(text) for text in trainingInput]
    validationTokens = [word_tokenize(text) for text in validationInput]

    s = PorterStemmer()
    trainingInputStemmedWords = [[s.stem(word) for word in words] for words in trainingTokens]
    validationInputStemmedWords = [[s.stem(word) for word in words] for words in validationTokens]

    trainingInputStemmedSentences = [' '.join(words) for words in trainingInputStemmedWords]
    validationInputStemmedSentences = [' '.join(words) for words in validationInputStemmedWords]

    return trainingInputStemmedSentences, validationInputStemmedSentences


[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
sentence1 = ["By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement.."]
sentence2 = ["I wanted to go on a ride last night but I was taking a shower."]

output1, output2 = stemming(sentence1, sentence2)
print("BEFORE")
print(sentence1)
print("AFTER")
print(output1)

print("BEFORE")
print(sentence2)
print("AFTER")
print(output2)

BEFORE
['By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement..']
AFTER
['by choos a bike over a car , i ’ m reduc my environment footprint . cycl promot eco-friendli transport , and i ’ m proud to be part of that movement ..']
BEFORE
['I wanted to go on a ride last night but I was taking a shower.']
AFTER
['i want to go on a ride last night but i wa take a shower .']


## Stop words removal - elimina cuvintele de legatura (cuvinte scurte)

In [18]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def stopWordsRemoval(trainingInput, validationInput):
    trainingTokens = [word_tokenize(text) for text in trainingInput]
    validationTokens = [word_tokenize(text) for text in validationInput]
    stopWords = set(stopwords.words('english'))

    trainingInputStemmedWords = [[word for word in words if word not in stopWords] for words in trainingTokens]
    validationInputStemmedWords = [[word for word in words if word not in stopWords] for words in validationTokens]

    trainingInputStemmedSentences = [' '.join(words) for words in trainingInputStemmedWords]
    validationInputStemmedSentences = [' '.join(words) for words in validationInputStemmedWords]

    return trainingInputStemmedSentences, validationInputStemmedSentences

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
sentence1 = ["By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement.."]
sentence2 = ["I wanted to go on a ride last night but I was taking a shower."]

output1, output2 = stopWordsRemoval(sentence1, sentence2)
print("BEFORE")
print(sentence1)
print("AFTER")
print(output1)

print("BEFORE")
print(sentence2)
print("AFTER")
print(output2)

BEFORE
['By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement..']
AFTER
['By choosing bike car , I ’ reducing environmental footprint . Cycling promotes eco-friendly transportation , I ’ proud part movement ..']
BEFORE
['I wanted to go on a ride last night but I was taking a shower.']
AFTER
['I wanted go ride last night I taking shower .']


#

# Etichetarea textelor cu emotii folosind kMeans (tool)

In [20]:
import pandas as pd
import numpy as np

def readData(filePath:str):
    df = pd.read_csv(filePath)
    return df

def getTrainingAndValidationSets(dataFrame, trainingSize=0.8):
    dataSize = dataFrame.shape[0]
    
    trainingIndexSet = np.random.choice(range(dataSize), size=int(trainingSize * dataSize), replace=False)
    validationIndexSet = [i for i in range(dataSize) if i not in trainingIndexSet]

    trainingInputSet = [dataFrame["Text"].iloc[index] for index in trainingIndexSet]
    trainingOutputSet = [dataFrame["Sentiment"].iloc[index] for index in trainingIndexSet]

    validationInputSet = [dataFrame["Text"].iloc[index] for index in validationIndexSet]
    validationOutputSet = [dataFrame["Sentiment"].iloc[index] for index in validationIndexSet]

    return trainingInputSet, trainingOutputSet, validationInputSet, validationOutputSet


def clasificationPerformance(ground_truth, computed_values, positive_label):
    """
    Returneaza TN (True Negative), FP(False Positive), FN(False Negative), TP(True Positive)
    """
    TN = 0
    FP = 0
    FN = 0
    TP = 0
    
    for i in range(0, len(ground_truth)):
        if ground_truth[i] == positive_label:
            if computed_values[i] == positive_label:
                TP += 1
            else:
                FP += 1
        else:
            if computed_values[i] != positive_label:
                TN += 1
            else:
                FN += 1
    return TN, FP, FN, TP


def getAccuracy(TN, FP, FN, TP):
    """ 
    accuracy represents the overall performance of classification model:
    (TP+TN)/(TN+FP+FN+TP)
    """
    if (TN+FP+FN+TP) == 0:
        return 0
    return (TP+TN)/(TN+FP+FN+TP)

def getPrecision(FP, TP):
    """
    precision indicates how accurate the positive predictions are 
    TP/(TP+FP)
    """
    if (TP+FP) == 0:
        return 0
    return TP/(TP+FP)

def getRecall(TP, FN):
    """ 
    recall indicates the coverage of actual positive sample
    TP/(TP+FN)
    """
    if (TP+FN) == 0:
        return 0
    return TP/(TP+FN)



In [22]:
from sklearn.cluster import KMeans

def getClassifier(trainingFeatures, numberClusters):
    clasificator = KMeans(n_clusters=numberClusters, random_state=0)
    clasificator.fit(trainingFeatures)
    return clasificator

def testClassifier(classifier, validationInput, validationOutput, labels, positiveLabel):
    computedIndexes = classifier.predict(validationInput)
    computedOutput = [labels[index] for index in computedIndexes]
    TN, FP, FN, TP = clasificationPerformance(validationOutput, computedOutput, positiveLabel)
    accuracy = getAccuracy(TN, FP, FN, TP)
    precision = getPrecision(FP, TP)
    recall = getRecall(TP, FN)
    print("Accuracy: {}\nPrecision: {}\nRecall: {}".format(accuracy, precision, recall))

In [None]:
datas = readData("reviews_mixed.csv")
trainingInputSet, trainingOutputSet, validationInputSet, validationOutputSet = getTrainingAndValidationSets(datas)
trainingFeatures, validationFeatures = bagOfWords(trainingInputSet, validationInputSet)
labels = [label for label in set(trainingOutputSet)]
clasificator = getClassifier(trainingFeatures, len(labels))
testClassifier(clasificator, validationFeatures, validationOutput, labels, 'positive')