# BoW Matrix 구현
1. 모든 플랫폼에서 주로 사용되는 단어(불용어)를 추출: TF-IDF
2. 각 플랫폼에서 고유하게 사용되는 단어(jargon)를 추출: c-TF-IDF
3. 불용어와 jargon을 제외한 나머지 단어들로 BoW 행렬을 구성
4. matrix값 조정: BoW 행렬의 각 열을 (해당 단어의 TF-IDF 합 × log(1 + A / f²))로 나눔 (A: 전체 문서 수, f: 해당 단어가 등장한 문서 수)

In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
# Load datasets
# Number of documents drawn from each platform so the three corpora have equal size.
SAMPLE_SIZE = 50000


def _load_text_sample(path, column, n=SAMPLE_SIZE, **read_csv_kwargs):
    """Read a CSV, blank out missing text in *column*, and draw *n* random rows.

    Args:
        path: CSV file to read.
        column: Name of the text column to clean and sample.
        n: Number of rows to sample (without replacement, pandas default).
        **read_csv_kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        A ``(frame, sample)`` tuple: the full DataFrame with *column* cleaned,
        and the sampled Series taken from that column.

    Raises:
        ValueError: Propagated from ``Series.sample`` when the file has fewer
            than *n* rows.
    """
    frame = pd.read_csv(path, **read_csv_kwargs)
    # Assign the result back rather than calling fillna(inplace=True) on the
    # column selection: the chained in-place form operates on a temporary and
    # does not update the frame once pandas Copy-on-Write is active, which
    # would let NaN values reach the vectorizer.
    frame[column] = frame[column].fillna('')
    # NOTE(review): no random_state is passed, so every run draws a different
    # sample. Pass one here if reproducible results are required.
    return frame, frame[column].sample(n)


# The loading order (Twitter, Reddit, YouTube) is kept so the global NumPy
# random state is consumed in the same sequence as before.
twitter_jargon_df, tweets_jargon = _load_text_sample(
    'text_preprocess.csv', 'content', lineterminator="\n"
)
reddit_jargon_df, reddit_jargon = _load_text_sample(
    'reddit_total_preprocessed_cleaned.csv', 'preprocessed_text'
)
youtube_jargon_df, youtubes_jargon = _load_text_sample(
    'testVideoMetaDataResult_Pre.csv', 'comment_Text'
)

tweets_jargon_list = tweets_jargon.tolist()
reddit_jargon_list = reddit_jargon.tolist()
youtube_jargon_list = youtubes_jargon.tolist()

# One combined corpus, ordered Twitter -> Reddit -> YouTube; `labels` holds the
# platform name of each document at the matching position.
documents = tweets_jargon_list + reddit_jargon_list + youtube_jargon_list
labels = (
    ["Twitter"] * len(tweets_jargon_list)
    + ["Reddit"] * len(reddit_jargon_list)
    + ["YouTube"] * len(youtube_jargon_list)
)

In [6]:
# Platform jargon: for each platform, the 1000 terms with the largest summed
# re-weighted TF-IDF score over that platform's documents.
# The token pattern keeps only purely alphabetic tokens that are entirely
# upper-case or entirely lower-case; case is preserved (lowercase=False).
vectorizer = TfidfVectorizer(stop_words="english", lowercase=False, token_pattern=r'\b([A-Z]+|[a-z]+)\b')
# NOTE(review): .toarray() materialises a dense (n_documents x n_terms) float
# matrix, which is very large for this corpus. It is kept dense here so the
# types of tf_matrix / W_matrix stay unchanged for any code that uses them.
tf_matrix = vectorizer.fit_transform(documents).toarray()

A = len(documents)  # total number of documents across all platforms
f = np.count_nonzero(tf_matrix, axis=0)  # per term: number of documents with a non-zero score
# Down-weight terms that occur in many documents. The vocabulary is learned
# from these same documents, so each term should occur at least once (f >= 1).
W_matrix = tf_matrix * np.log(1 + A / (f**2))

# Rows follow the order in which `documents` was assembled:
# Twitter first, then Reddit, then YouTube.
n_twitter = len(tweets_jargon_list)
n_reddit = len(reddit_jargon_list)
twitter_data = W_matrix[:n_twitter]
reddit_data = W_matrix[n_twitter:n_twitter + n_reddit]
youtube_data = W_matrix[n_twitter + n_reddit:]

# The slices are already dense ndarrays, so a plain column sum yields the 1-D
# per-term totals directly.
sum_twitter = twitter_data.sum(axis=0)
sum_reddit = reddit_data.sum(axis=0)
sum_youtube = youtube_data.sum(axis=0)

# Fetch the vocabulary once instead of once per selected index.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
_get_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
_jargon_feature_names = np.asarray(_get_feature_names(), dtype=object)


def _top_terms(scores, names, k=1000):
    """Return the *k* terms with the highest *scores* as a list, best first."""
    return names[scores.argsort()[-k:][::-1]].tolist()


top_1000_twitter = _top_terms(sum_twitter, _jargon_feature_names)
top_1000_reddit = _top_terms(sum_reddit, _jargon_feature_names)
top_1000_youtube = _top_terms(sum_youtube, _jargon_feature_names)

# print("Top 1000 Twitter Jargons:", top_1000_twitter)
# print("Top 1000 Reddit Jargons:", top_1000_reddit)
# print("Top 1000 YouTube Jargons:", top_1000_youtube)


In [7]:
# Stopwords: the terms with the largest total TF-IDF score over the whole
# corpus (all platforms together). No built-in stop word list is applied here.
vectorizer = TfidfVectorizer(lowercase=False, token_pattern=r'\b([A-Z]+|[a-z]+)\b')
# NOTE(review): .toarray() builds a dense (n_documents x n_terms) matrix,
# which is very memory-hungry for this corpus.
tfidf_matrix = vectorizer.fit_transform(documents).toarray()
all_tfidf_values = np.sum(tfidf_matrix, axis=0)
# Number of top-scoring terms treated as stopwords.
# NOTE(review): the previous comment said "top 1000" while the value is 10000;
# the comment now follows the code. Confirm which number was intended.
top_n = 10000
stopwords_indices = all_tfidf_values.argsort()[-top_n:][::-1]
# Fetch the vocabulary once instead of once per selected index.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
_get_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
_stopword_feature_names = np.asarray(_get_feature_names(), dtype=object)
stopwords = _stopword_feature_names[stopwords_indices].tolist()

# Platform jargons (the per-platform top-1000 lists from the previous cell).
jargons = top_1000_twitter + top_1000_reddit + top_1000_youtube

# Build the BoW matrix from everything that is neither a stopword nor jargon.
# The excluded terms are handed to the vectorizer as its stop word list.
exclude_words = stopwords + jargons
bow_vectorizer = TfidfVectorizer(stop_words=exclude_words, lowercase=False, token_pattern=r'\b([A-Z]+|[a-z]+)\b')
bow_matrix = bow_vectorizer.fit_transform(documents).toarray()

tf_bow = np.sum(bow_matrix, axis=0)  # per term: summed TF-IDF score over all documents
f_bow = np.count_nonzero(bow_matrix, axis=0)  # per term: number of documents with a non-zero score

# Scale every column by the inverse of (term total x rarity weight).
# `A` is the total document count defined in the previous cell.
adjusted_bow_matrix = bow_matrix / (tf_bow * np.log(1 + A / (f_bow**2)))

In [8]:
# Report the (rows, columns) dimensions of the adjusted BoW matrix.
print("Adjusted BoW Matrix Shape:", np.shape(adjusted_bow_matrix))

Adjusted BoW Matrix Shape: (150000, 72106)
