### Load data

In [1]:
import sys
import os
# Put the parent of the current working directory at the front of sys.path.
# This has to run before the `src` imports below (presumably so that the
# `src` package resolves when the notebook is run from a subdirectory).
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from src.preprocessing import Preprocessing
from src.data_cleaning import DataCleaning


# Load the dataset shared by the cells below; later cells index it by the
# "positive", "negative" and "neutral" keys.
data_frame = DataCleaning.load_data()

ModuleNotFoundError: No module named 'spacy'

---


----

# Bag of words (BoW)

----

In [None]:
def bag_of_words(data_frame: list) -> dict:
    """Count how often each token occurs across all tokenized documents.

    Args:
        data_frame: Iterable of documents, where each document is itself an
            iterable of hashable tokens (e.g. a list of token lists).

    Returns:
        A plain ``dict`` mapping each distinct token to its total number of
        occurrences, with keys ordered by first appearance.
    """
    # Function-scope imports keep this notebook cell self-contained.
    from collections import Counter
    from itertools import chain

    # Flatten the documents into a single token stream and let Counter tally
    # it. chain.from_iterable iterates each document exactly like the plain
    # nested loop would. Convert back to a plain dict so callers receive the
    # annotated return type.
    return dict(Counter(chain.from_iterable(data_frame)))

In [None]:
import pandas as pd

def generate_bow_vectors(tokens_positive, tokens_negative, tokens_neutral):
    """Build one bag-of-words count row per sentiment class.

    Each token collection is counted with ``bag_of_words`` and turned into a
    single-row frame labelled 'Positive', 'Negative' or 'Neutral'. The rows
    are stacked, and tokens missing from a class are filled with 0.

    Returns:
        A DataFrame with one row per sentiment label and one column per
        distinct token.
    """
    labelled_tokens = (
        ('Positive', tokens_positive),
        ('Negative', tokens_negative),
        ('Neutral', tokens_neutral),
    )

    rows = []
    for label, tokens in labelled_tokens:
        counts = bag_of_words(tokens)
        # Tokens become the index of a one-column frame; transposing turns
        # that column into a single row named after the sentiment label.
        column_frame = pd.DataFrame.from_dict(counts, orient='index', columns=[label])
        rows.append(column_frame.T)

    bow_vectors = pd.concat(rows, axis=0)
    # A token absent from one class shows up as NaN after the concat.
    bow_vectors.fillna(0, inplace=True)

    return bow_vectors


#### Bag of words with just tokenization

In [21]:

# Tokenize each sentiment column in turn (positive, negative, neutral).
tokens_positive, tokens_negative, tokens_neutral = (
    Preprocessing.tokenization(data_frame[sentiment])
    for sentiment in ("positive", "negative", "neutral")
)

# Build the bag-of-words table from the raw tokens and persist it to disk.
bow_vectors = generate_bow_vectors(
    tokens_positive,
    tokens_negative,
    tokens_neutral,
)
bow_vectors.to_csv("bow_vectors.csv")

#### Bag of words with stemming

In [None]:
# Stem each sentiment column in turn (positive, negative, neutral).
stemming_positive, stemming_negative, stemming_neutral = (
    Preprocessing.stemming(data_frame=data_frame[sentiment])
    for sentiment in ("positive", "negative", "neutral")
)

# Bag-of-words table over the stemmed tokens; the bare name on the last line
# makes the notebook display the result.
bow_vectors_stemming = generate_bow_vectors(
    stemming_positive,
    stemming_negative,
    stemming_neutral,
)
bow_vectors_stemming

#### Bag of words with lemmatization

In [None]:
# Lemmatize each sentiment column in turn (positive, negative, neutral).
lemmatization_positive, lemmatization_negative, lemmatization_neutral = (
    Preprocessing.lemmatization(data_frame=data_frame[sentiment])
    for sentiment in ("positive", "negative", "neutral")
)

# Bag-of-words table over the lemmatized tokens; the bare name on the last
# line makes the notebook display the result.
bow_vectors_lemmatization = generate_bow_vectors(
    lemmatization_positive,
    lemmatization_negative,
    lemmatization_neutral,
)
bow_vectors_lemmatization

### Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
def tf_idf(data_set):
    """Placeholder for the TF-IDF computation; not implemented yet.

    The argument is currently ignored and the function returns ``None``.
    """
    return None