### Load data

In [31]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from lib.data_preparation import DataPreparation
from lib.preprocessing_data import Preprocessing


# Build the working frame by chaining the project helpers
# load_data -> remove_stopwords -> clean_data.
# NOTE(review): these helpers live in lib/data_preparation.py and are not visible
# here; their exact behaviour should be confirmed against that module.
data_set = DataPreparation.remove_stopwords(DataPreparation.load_data())
data_set = DataPreparation.clean_data(data_set)

# Keep only a small random subset of rows for the vectorizer demos below.
# The sampler is seeded so that Restart Kernel -> Run All reproduces the same
# rows (and therefore the same vocabularies and matrices) on every run.
SAMPLE_FRACTION = 0.005
RANDOM_SEED = 42
data_set = (
    data_set
    .sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)
    .reset_index(drop=True)  # renumber the sampled rows 0..n-1
)
data_set

Unnamed: 0,positive,negative,neutral
0,thanks recent follow happy connect happy great...,damn touching unhappy,key minister comments freely
1,think better happy,hi tried call number got response unhappy plea...,akhilesh aunt
2,saying read book today,unhappy oh man,says bid enter
3,yes kind funny creepy time happy,talking driver im goinghe said hed love go new...,room india indian
4,joo okay happy,finals next week unhappy wish luck x,made trek km fetch water
5,top retweeters week,miss biking unhappy,air india enforce fines unruly passengers


---
# Bag Of Words

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd

# Bag-of-words: CountVectorizer with default settings produces raw term counts.
vectorizer = CountVectorizer()

# Transpose the tokenised frame so that each row becomes one document.
# NOTE(review): Preprocessing.tokenization is defined in lib/preprocessing_data.py;
# presumably every cell holds a list of token strings -- confirm against that module.
processed_data = Preprocessing.tokenization(data_set).T

# Collapse each row into one whitespace-separated string: tokens inside a cell are
# joined first, then the cells of the row. Missing cells are replaced with '' so
# they contribute no tokens.
text_data = processed_data.fillna('').apply(
    lambda row: ' '.join(' '.join(words) for words in row),
    axis=1,
)

# Learn the vocabulary and vectorise in a single pass instead of fit() followed
# by transform(), which would tokenise the same corpus twice.
transformed_output = vectorizer.fit_transform(text_data)
feature_names = vectorizer.get_feature_names_out()

# toarray() returns a plain ndarray; todense() would return the discouraged
# numpy.matrix type.
dense_output = transformed_output.toarray()
bow_vector = pd.DataFrame(
    dense_output,
    columns=feature_names,
    index=text_data.index,
)


bow_vector


Unnamed: 0,air,akhilesh,alternate,another,aunt,better,bid,biking,book,call,...,trek,tried,trump,unhappy,unruly,water,week,wish,yes,york
positive,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
negative,0,0,1,1,0,0,0,1,0,1,...,0,1,1,5,0,0,1,1,0,1
neutral,1,1,0,0,1,0,1,0,0,0,...,1,0,0,0,1,1,0,0,0,0


In [None]:

from collections import Counter
from itertools import chain

# NOTE(review): `tokens`, `positive_bow`, `negative_bow` and `neutral_bow` are not
# defined in any cell visible in this notebook, and this cell carries no execution
# count (In [None]). Unless they are defined elsewhere, this cell raises NameError
# on Restart Kernel -> Run All -- define them upstream or remove the cell.

# Count how many times each token occurs across all token lists in `tokens`.
# dict(...) keeps the result a plain dict, keyed in first-seen order.
word_frequencies = dict(Counter(chain.from_iterable(tokens)))


# Stack the three frames row-wise and replace the resulting missing values with 0.
# The fill is chained (no inplace=True) so re-running the cell never mutates an
# intermediate object.
bow_vectors = pd.concat([positive_bow, negative_bow, neutral_bow], axis=0).fillna(0)



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Render the bag-of-words count matrix as a heatmap: one heatmap row per row of
# `bow_vector`, one column per vocabulary word.
fig, ax = plt.subplots(figsize=(15, 8), facecolor='lightgray')
sns.heatmap(bow_vector, cmap="YlGnBu", annot=False, cbar=True, ax=ax)
ax.set_title("Bag-of-Words Model Heatmap")
ax.set_xlabel("Features (Words)")
ax.set_ylabel("Documents")
plt.show()

---
# 0 or 1, depending on whether the word exists (Binary/Count Vectorizer)

In [37]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import pandas as pd

# binary=True makes the vectorizer record presence (1) / absence (0) of each term
# instead of its count.
vectorizer = CountVectorizer(binary=True)

# Transpose the tokenised frame so that each row becomes one document.
# NOTE(review): Preprocessing.tokenization is defined in lib/preprocessing_data.py;
# presumably every cell holds a list of token strings -- confirm against that module.
processed_data = Preprocessing.tokenization(data_set).T

# Collapse each row into one whitespace-separated string: tokens inside a cell are
# joined first, then the cells of the row. Missing cells are replaced with '' so
# they contribute no tokens.
text_data = processed_data.fillna('').apply(
    lambda row: ' '.join(' '.join(words) for words in row),
    axis=1,
)

# Learn the vocabulary and vectorise in a single pass instead of fit() followed
# by transform(), which would tokenise the same corpus twice.
transformed_output = vectorizer.fit_transform(text_data)
feature_names = vectorizer.get_feature_names_out()

# toarray() returns a plain ndarray; todense() would return the discouraged
# numpy.matrix type.
dense_output = transformed_output.toarray()
binary_vector = pd.DataFrame(
    dense_output,
    columns=feature_names,
    index=text_data.index,
)

binary_vector


Unnamed: 0,air,akhilesh,alternate,another,aunt,better,bid,biking,book,call,...,trek,tried,trump,unhappy,unruly,water,week,wish,yes,york
positive,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
negative,0,0,1,1,0,0,0,1,0,1,...,0,1,1,1,0,0,1,1,0,1
neutral,1,1,0,0,1,0,1,0,0,0,...,1,0,0,0,1,1,0,0,0,0


---
# Term Frequency-Inverse Document Frequency (TF-IDF)

In [38]:
# TF-IDF weighting with the vectorizer's default settings.
# Relies on `TfidfVectorizer` and `pd` imported by the preceding cells.
vectorizer = TfidfVectorizer()

# Transpose the tokenised frame so that each row becomes one document.
# NOTE(review): Preprocessing.tokenization is defined in lib/preprocessing_data.py;
# presumably every cell holds a list of token strings -- confirm against that module.
processed_data = Preprocessing.tokenization(data_set).T

# Collapse each row into one whitespace-separated string: tokens inside a cell are
# joined first, then the cells of the row. Missing cells are replaced with '' so
# they contribute no tokens.
text_data = processed_data.fillna('').apply(
    lambda row: ' '.join(' '.join(words) for words in row),
    axis=1,
)

# Learn vocabulary + IDF weights and vectorise in a single pass instead of fit()
# followed by transform(), which would tokenise the same corpus twice.
transformed_output = vectorizer.fit_transform(text_data)
feature_names = vectorizer.get_feature_names_out()

# toarray() returns a plain ndarray; todense() would return the discouraged
# numpy.matrix type.
dense_output = transformed_output.toarray()
tf_idf_vector = pd.DataFrame(
    dense_output,
    columns=feature_names,
    index=text_data.index,
)

tf_idf_vector

Unnamed: 0,air,akhilesh,alternate,another,aunt,better,bid,biking,book,call,...,trek,tried,trump,unhappy,unruly,water,week,wish,yes,york
positive,0.0,0.0,0.0,0.0,0.0,0.147191,0.0,0.0,0.147191,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.111943,0.0,0.147191,0.0
negative,0.0,0.0,0.123885,0.123885,0.0,0.0,0.0,0.123885,0.0,0.123885,...,0.0,0.123885,0.123885,0.619427,0.0,0.0,0.094218,0.123885,0.0,0.123885
neutral,0.2,0.2,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0
