### Load data

In [2]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from lib.data_preparation import DataPreparation
from lib.preprocessing_data import Preprocessing


data_set = DataPreparation().load_data()
data_set = Preprocessing().lemmatization(data_set)

data_set = data_set.sample(frac=0.005).reset_index(drop=True)
data_set

Unnamed: 0,id,sentiment,tweet,label,processed_tweet
0,1387,negative,punishment time for jbs team unhappy,-1.0,"[punishment, time, for, jbs, team, unhappy, ]"
1,1348,negative,i wish i could sleep early unhappy,-1.0,"[I, wish, I, could, sleep, early, unhappy]"
2,3298,neutral,bengal law that singles out may keep out inves...,0.0,"[bengal, law, that, single, out, may, keep, ou..."
3,1655,negative,theres always shit like this theres always peo...,-1.0,"[there, s, always, shit, like, this, there, s,..."
4,2144,negative,andy unhappy,-1.0,"[andy, unhappy, ]"
5,2049,negative,now i dont want to go camp unhappy,-1.0,"[now, I, do, not, want, to, go, camp, unhappy]"
6,1788,negative,jyoust hyoug me,-1.0,"[jyoust, hyoug, I]"
7,3526,neutral,other civic bodies in state to go to polls o...,0.0,"[ , other, civic, body, in, state, to, go, to..."
8,2733,neutral,a few reasons to celebrate the uncelebrated,0.0,"[a, few, reason, to, celebrate, the, uncelebra..."
9,2233,negative,in memoriam of alea starbridge unhappy your b...,-1.0,"[in, memoriam, of, alea, starbridge, unhappy, ..."


---
# Bag Of Words

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

import pandas as pd

processed_data = Preprocessing.tokenization(data_set)

text_data = processed_data['processed_tweets'].apply(lambda row: ' '.join(row))

bow_vector = vectorizer.fit_transform(text_data)


transformed_output = vectorizer.transform(text_data)
feature_names = vectorizer.get_feature_names_out()
dense_output = transformed_output.todense()
visulize_bow_vector = pd.DataFrame(
    dense_output, 
    columns=feature_names,
    index=text_data.index 
)

visulize_bow_vector


Unnamed: 0,alea,all,always,and,andy,are,at,be,beautiful,bengal,...,up,voice,want,when,who,will,wish,would,you,your
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,2,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,0,1


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# plt.figure(figsize=(15, 8))
plt.figure(figsize=(15, 8), facecolor='lightgray')
sns.heatmap(visulize_bow_vector, cmap="YlGnBu", annot=False, cbar=True)
plt.title("Bag-of-Words Model Heatmap")
plt.xlabel("Features (Words)")
plt.ylabel("Documents")
plt.show()

---
# 0 or 1, if the word exist (Binary/Count Vectorizer)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = CountVectorizer(binary=True)

import pandas as pd

processed_data = Preprocessing.tokenization(data_set)

text_data = processed_data['processed_tweets'].apply(lambda row: ' '.join(row))

binary_vector = vectorizer.fit_transform(text_data)

transformed_output = vectorizer.transform(text_data)
feature_names = vectorizer.get_feature_names_out()
dense_output = transformed_output.todense()
visulize_binary_vector = pd.DataFrame(
    dense_output, 
    columns=feature_names,
    index=text_data.index 
)

visulize_binary_vector


---
# Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
vectorizer = TfidfVectorizer()

processed_data = Preprocessing.tokenization(data_set)
text_data = processed_data['processed_tweets'].apply(lambda row: ' '.join(row))

tf_idf_vector = vectorizer.fit_transform(text_data)

transformed_output = vectorizer.transform(text_data)
feature_names = vectorizer.get_feature_names_out()
dense_output = transformed_output.todense()
visulize_tf_idf_vector = pd.DataFrame(
    dense_output, 
    columns=feature_names,
    index=text_data.index 
)
visulize_tf_idf_vector


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# plt.figure(figsize=(15, 8))
plt.figure(figsize=(15, 8), facecolor='lightgray')
sns.heatmap(visulize_tf_idf_vector, cmap="YlGnBu", annot=False, cbar=True)
plt.title("tf-idf Model Heatmap")
plt.xlabel("Features (Words)")
plt.ylabel("Documents")
plt.show()