### Load data

In [1]:
import sys
import os
# Prepend the parent of the current working directory to sys.path,
# presumably so the project-local `src` package imported below resolves
# when the notebook is launched from its own folder.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from src.preprocessing import Preprocessing
from src.data_cleaning import DataCleaning


# Load the corpus once; later cells index it by sentiment label,
# e.g. data_set["positive"].
data_set = DataCleaning.load_data()
# Sentiment labels iterated over by test_TfIdf further down.
sentiments = ['positive', 'negative', 'neutral']

[nltk_data] Downloading package words to /home/codespace/nltk_data...
[nltk_data]   Package words is already up-to-date!


---


----

# Bag of words (BoW)

----

In [2]:
import pandas as pd

def _generate_word_frequencies(tokens):
    word_frequencies = {}
    for token_list in tokens:
        for token in token_list:
            word_frequencies[token] = word_frequencies.get(token, 0) + 1
    return word_frequencies

def bag_of_words(tokens_positive, tokens_negative, tokens_neutral) -> pd.DataFrame:
    """Build a per-sentiment bag-of-words count table.

    Args:
        tokens_positive: Iterable of token lists for the positive documents.
        tokens_negative: Iterable of token lists for the negative documents.
        tokens_neutral: Iterable of token lists for the neutral documents.

    Returns:
        pandas.DataFrame with the rows 'Positive', 'Negative' and 'Neutral'
        and one column per distinct token. Each cell is the integer number
        of times the token occurs in that sentiment (0 when it never does).
    """
    labelled_tokens = (
        ('Positive', tokens_positive),
        ('Negative', tokens_negative),
        ('Neutral', tokens_neutral),
    )
    # One single-row frame per sentiment: tokens as columns, counts as values.
    rows = [
        pd.DataFrame.from_dict(
            _generate_word_frequencies(tokens), orient='index', columns=[label]
        ).T
        for label, tokens in labelled_tokens
    ]
    # Tokens missing from a sentiment come out of concat as NaN, which
    # upcasts the counts to float; fill them with 0 and restore integers.
    return pd.concat(rows, axis=0).fillna(0).astype(int)

In [79]:
def test_BoW(vectorization_technique, data_set):
    """Print the bag-of-words table obtained with one preprocessing technique.

    Args:
        vectorization_technique: Callable applied to each sentiment's entry
            of ``data_set``; its result is passed on to ``bag_of_words``.
        data_set: Mapping with the keys "positive", "negative" and "neutral".
    """
    per_sentiment_tokens = [
        vectorization_technique(data_set[label])
        for label in ("positive", "negative", "neutral")
    ]
    print(bag_of_words(*per_sentiment_tokens))
    # To persist the table instead of printing it, bind the result of
    # bag_of_words to a name and call .to_csv("bow_vectors.csv") on it.

#### Bag of words with just tokenization

In [77]:
# Print the BoW table using Preprocessing.tokenization as the preprocessing step.
test_BoW(Preprocessing.tokenization, data_set)

#### Bag of words with stemming

In [None]:
# Print the BoW table using Preprocessing.stemming as the preprocessing step.
test_BoW(Preprocessing.stemming, data_set)

#### BoW with Lemmatization

In [None]:
# Print the BoW table using Preprocessing.lemmatization as the preprocessing step.
test_BoW(Preprocessing.lemmatization, data_set)


# Term Frequency-Inverse Document Frequency (TF-IDF)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tf_idf(data_set, tokenizer=None):
    """Compute a TF-IDF matrix for a collection of text documents.

    Args:
        data_set: Iterable of text documents (strings), one per output row.
        tokenizer: Optional callable that turns a document string into a
            list of tokens. When omitted, TfidfVectorizer's built-in
            tokenization is used.

    Returns:
        pandas.DataFrame with one row per document and one column per
        vocabulary term, holding the TF-IDF weights.
    """
    if tokenizer is None:
        vectorizer = TfidfVectorizer()
    else:
        # The tokenizer argument used to be accepted but never forwarded.
        # token_pattern=None tells scikit-learn the default regex is
        # intentionally unused, which avoids its "token_pattern will not
        # be used" warning.
        vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)

    tfidf_matrix = vectorizer.fit_transform(data_set)
    # toarray() yields a plain ndarray; todense() returns the discouraged
    # numpy.matrix type.
    return pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )

In [5]:


# for sentiment in sentiments:
#     print(tf_idf(data_set[sentiment]))
import pandas as pd
    
def test_TfIdf(tokenization_technique):
    """Print one TF-IDF table per sentiment for a preprocessing technique.

    Reads the notebook-level ``sentiments`` and ``data_set`` variables.

    Args:
        tokenization_technique: Callable that turns a sentiment's documents
            into token lists. If falsy, the documents are handed to
            ``tf_idf`` unchanged (previously this case raised
            UnboundLocalError because no token list was ever assigned).
    """
    for sentiment in sentiments:
        # NOTE(review): assumes data_set[sentiment] is an iterable of raw
        # text strings when no technique is given -- confirm against
        # DataCleaning.load_data.
        documents = data_set[sentiment]
        if tokenization_technique:
            # tf_idf expects strings, so join each token list back into a
            # single space-separated document.
            documents = [
                ' '.join(tokens) for tokens in tokenization_technique(documents)
            ]
        print(tf_idf(documents))
        


### TF-IDF with just Tokenization

In [6]:

# Print per-sentiment TF-IDF tables using Preprocessing.tokenization.
test_TfIdf(Preprocessing.tokenization)

      00am   07  0xx   10  100  1000  100m  10am  10th  10usd  ...  young  \
0      0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   0.0    0.0  ...    0.0   
1      0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   0.0    0.0  ...    0.0   
2      0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   0.0    0.0  ...    0.0   
3      0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   0.0    0.0  ...    0.0   
4      0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   0.0    0.0  ...    0.0   
...    ...  ...  ...  ...  ...   ...   ...   ...   ...    ...  ...    ...   
1181   0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   0.0    0.0  ...    0.0   
1182   0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   0.0    0.0  ...    0.0   
1183   0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   0.0    0.0  ...    0.0   
1184   0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   0.0    0.0  ...    0.0   
1185   0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   0.0    0.0  ...    0.0   

      younger  your  yours  yourself  youth  youtube  yoyour  yoyoyou  yura

### TF-IDF with stemming

In [None]:
# Print per-sentiment TF-IDF tables using Preprocessing.stemming.
test_TfIdf(Preprocessing.stemming)

### TF-IDF with lemmatization

In [None]:
# Print per-sentiment TF-IDF tables using Preprocessing.lemmatization.
test_TfIdf(Preprocessing.lemmatization)


# Binary vectorization: 1 if the word exists, 0 otherwise (Binary/Count Vectorizer)

In [81]:

def _binary_vectorization(tokens):
    word_presence = {}
    for token_list in tokens:
        for token in token_list:
            if token not in word_presence:
                word_presence[token] = 1
    return word_presence

def create_binary_vectors(positive_tokens, negative_tokens, neutral_tokens):
    """Build a per-sentiment binary token-presence table.

    Args:
        positive_tokens: Iterable of token lists for the positive documents.
        negative_tokens: Iterable of token lists for the negative documents.
        neutral_tokens: Iterable of token lists for the neutral documents.

    Returns:
        pandas.DataFrame with the rows 'Positive', 'Negative' and 'Neutral'
        and one column per distinct token. Each cell is the integer 1 when
        the token occurs in that sentiment and 0 otherwise.
    """
    labelled_tokens = (
        ('Positive', positive_tokens),
        ('Negative', negative_tokens),
        ('Neutral', neutral_tokens),
    )
    # One single-row frame per sentiment: tokens as columns, 1 as the value.
    rows = [
        pd.DataFrame.from_dict(
            _binary_vectorization(tokens), orient='index', columns=[label]
        ).T
        for label, tokens in labelled_tokens
    ]
    # Tokens missing from a sentiment come out of concat as NaN, which
    # upcasts the flags to float; fill them with 0 and restore integers so
    # the table really holds 0/1.
    return pd.concat(rows, axis=0).fillna(0).astype(int)
    

In [82]:

def test_binary_vectorization(data_set, tokenization_technique):
    """Print the binary presence table obtained with one preprocessing technique.

    Args:
        data_set: Mapping with the keys "positive", "negative" and "neutral".
        tokenization_technique: Callable applied to each sentiment's entry
            of ``data_set``; its result is passed on to
            ``create_binary_vectors``.
    """
    positive, negative, neutral = (
        tokenization_technique(data_set[label])
        for label in ("positive", "negative", "neutral")
    )
    print(create_binary_vectors(positive, negative, neutral))
    # To persist the table instead of printing it, bind the result of
    # create_binary_vectors to a name and call .to_csv("bow_vectors.csv") on it.


In [None]:
# Print the binary presence table using Preprocessing.tokenization.
test_binary_vectorization(data_set=data_set, tokenization_technique=Preprocessing.tokenization)


In [None]:
# Print the binary presence table using Preprocessing.stemming.
test_binary_vectorization(data_set=data_set, tokenization_technique=Preprocessing.stemming)


In [None]:
# Print the binary presence table using Preprocessing.lemmatization.
test_binary_vectorization(data_set=data_set, tokenization_technique=Preprocessing.lemmatization)


# ddfdf
