In [None]:
import ast
import os
import sys
import json
import copy
from glob import glob
from collections import Counter
sys.path.insert(0, '../')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
# Korean-capable font used so that Hangul plot titles/labels render correctly.
# The path is machine-specific; when the file is missing we keep matplotlib's
# default font instead of crashing the whole import cell.
path = r'C:\Users\iloveslowfood\Desktop\iloveslowfood\etc\fonts\AppleSDGothicNeo\AppleSDGothicNeoSB.ttf'
matplotlib.rcParams['axes.unicode_minus'] = False
if os.path.isfile(path):
    # Register the file with the font manager so the family name can be
    # resolved even when the font is not installed system-wide
    # (`addfont` is only available on newer matplotlib releases).
    if hasattr(fm.fontManager, 'addfont'):
        fm.fontManager.addfont(path)
    font_name = fm.FontProperties(fname=path, size=50).get_name()
    plt.rc('font', family=font_name)
else:
    # Fall back to whatever family matplotlib is already configured with.
    font_name = matplotlib.rcParams['font.family']
    print(f'Font file not found, keeping the default font: {path}')
plt.style.use('ggplot')
print(font_name)

# custom
from modules.load_data import load
from modules.tf_idf import get_tfidf
from modules.utils import str2list, squeeze

In [None]:
# Directory holding the preprocessed CSV files read by the cells below.
PATH = "../raw/preprocessed/"
# Number of rows drawn for the random sample that is embedded and clustered.
SIZE = 100_000

In [None]:
# Read the metadata table and convert its `keyword_list` column with
# `str2list` (presumably turns the stringified list stored in the CSV back
# into a Python list -- confirm against modules.utils).
metadata_core = pd.read_csv(os.path.join(PATH, 'metadata_vocab7000.csv'))
metadata_core['keyword_list'] = metadata_core['keyword_list'].apply(str2list)
metadata_core.shape

In [None]:
# Read the tag vocabulary file and keep its `tag` column as a plain list.
vocab7000 = pd.read_csv(os.path.join(PATH, 'tag_vocab7000.csv'))
vocab = vocab7000.loc[:, 'tag'].tolist()
len(vocab)

In [None]:
# Draw a reproducible random sample of row positions from `metadata_core`.
# The population size is taken from the frame itself rather than a hard-coded
# row count, so the sample always covers exactly the rows that exist.
# Passing an int to `np.random.choice` samples as if from `np.arange(n)`, which
# gives the same draw as the explicit index list without building it.
# Raises ValueError (from numpy) if SIZE exceeds the number of rows.
np.random.seed(42)
sample_indices = np.random.choice(a=len(metadata_core), size=SIZE, replace=False).tolist()
# user_id of every sampled row, in the same order as `sample_indices`.
user_id = metadata_core['user_id'].iloc[sample_indices].tolist()

In [None]:
# Build the TF-IDF representation for the sampled rows using the project
# helper `get_tfidf`, restricted to the loaded tag vocabulary.
metadata_tfidf = get_tfidf(
    metadata_core,
    indices=sample_indices,
    vocab=vocab,
)
metadata_tfidf.shape

In [None]:
# Fit a 3-component PCA on the TF-IDF matrix (fixed seed for repeatability).
# `fit` returns the estimator itself, so construction and fitting are chained.
pca = PCA(random_state=3, n_components=3).fit(metadata_tfidf)
pca

In [None]:
# Project the TF-IDF matrix onto the three fitted components and attach the
# user_id of each sampled row (same row order as `sample_indices`).
projected = pca.transform(metadata_tfidf)
metadata_pca = pd.DataFrame(projected, columns=['comp1', 'comp2', 'comp3']).assign(user_id=user_id)

In [None]:
# 3D scatter plot of the three PCA components, saved under ./plots/.
fig = plt.figure(figsize=(10, 10))
# `fig.gca(projection=...)` was deprecated in Matplotlib 3.4 and removed in
# 3.6; `add_subplot` creates the same single 3D axes on every version.
ax = fig.add_subplot(111, projection='3d')

x = metadata_pca['comp1']
y = metadata_pca['comp2']
z = metadata_pca['comp3']

ax.scatter(x, y, z)
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_zlabel('Component 3')
ax.set_title(f'글별 태그의 TF-IDF을 활용한 임베딩 결과 | 글 개수: {SIZE}')

# Make sure the output directory exists; savefig does not create it.
plot_path = f'./plots/({SIZE})글별 태그의 TF-IDF을 활용한 임베딩 결과.png'
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path, dpi=300)
plt.show()

In [None]:
# Pairwise 2D scatter plots of the PCA components, one panel per pair.
fig, ax = plt.subplots(1, 3, figsize=(18, 6))
component_pairs = (('comp1', 'comp3'), ('comp1', 'comp2'), ('comp2', 'comp3'))
for panel, (horizontal, vertical) in zip(ax, component_pairs):
    sns.scatterplot(data=metadata_pca, x=horizontal, y=vertical, alpha=.5, ax=panel)
plt.show()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Cluster the sampled rows into 6 groups in PCA space.
# `n_init` is pinned to 10 (the historical default) because scikit-learn 1.4
# changed the default to 'auto'; leaving it implicit makes the clustering
# depend on the installed version and emits a FutureWarning on 1.2/1.3.
kmeans = KMeans(n_clusters=6, random_state=42, n_init=10).fit(metadata_pca[['comp1', 'comp2', 'comp3']])

In [None]:
# Attach each row's k-means cluster label as the `group` column.
metadata_pca = metadata_pca.assign(group=kmeans.labels_)

In [None]:
# Same pairwise component scatter plots, coloured by the `group` column.
fig, ax = plt.subplots(1, 3, figsize=(18, 6))
grouped_pairs = (('comp1', 'comp3'), ('comp1', 'comp2'), ('comp2', 'comp3'))
for panel, (horizontal, vertical) in zip(ax, grouped_pairs):
    sns.scatterplot(data=metadata_pca, x=horizontal, y=vertical, hue='group', alpha=.5, ax=panel)
plt.show()

In [None]:
# user_id values of the rows assigned to each group label 0..5
# (one list per label, in label order).
user_g0, user_g1, user_g2, user_g3, user_g4, user_g5 = [
    metadata_pca.loc[metadata_pca['group'] == label, 'user_id'].tolist()
    for label in range(6)
]

In [None]:
# Keyword frequency per group: take every row of `metadata_core` whose
# user_id belongs to the group, pass the keyword lists through `squeeze`
# (presumably flattens the list of lists -- confirm against modules.utils)
# and count the resulting items.
# NOTE(review): the selection is by user_id over the whole table, so it also
# includes rows that were not in the sample, and a user_id present in several
# groups contributes to each of them -- confirm this is intended.
kwd_g0, kwd_g1, kwd_g2, kwd_g3, kwd_g4, kwd_g5 = [
    Counter(
        squeeze(
            metadata_core.loc[metadata_core['user_id'].isin(group_users), 'keyword_list'].tolist()
        )
    )
    for group_users in (user_g0, user_g1, user_g2, user_g3, user_g4, user_g5)
]

In [None]:
# Turn each group's keyword Counter into a two-column frame (`tag`, `freq`)
# sorted by descending frequency.
tag_freq_g0, tag_freq_g1, tag_freq_g2, tag_freq_g3, tag_freq_g4, tag_freq_g5 = [
    (
        pd.Series(counts)
        .to_frame('freq')
        .reset_index()
        .rename(columns={'index': 'tag'})
        .sort_values(by='freq', ascending=False)
    )
    for counts in (kwd_g0, kwd_g1, kwd_g2, kwd_g3, kwd_g4, kwd_g5)
]

In [None]:
# Horizontal bar chart of the 15 most frequent tags in group 0.
bar_fig, bar_ax = plt.subplots(figsize=(8, 4))
top_tags = tag_freq_g0.head(15)
sns.barplot(data=top_tags, x='freq', y='tag', palette='spring', ax=bar_ax)
plt.show()

In [None]:
# Horizontal bar chart of the 15 most frequent tags in group 1.
bar_fig, bar_ax = plt.subplots(figsize=(8, 4))
top_tags = tag_freq_g1.head(15)
sns.barplot(data=top_tags, x='freq', y='tag', palette='summer', ax=bar_ax)
plt.show()

In [None]:
# Horizontal bar chart of the 15 most frequent tags in group 3.
# NOTE(review): the plots in this section cover groups 0, 1, 3, 4 and 5;
# `tag_freq_g2` is never plotted -- confirm whether that omission is intended.
bar_fig, bar_ax = plt.subplots(figsize=(8, 4))
top_tags = tag_freq_g3.head(15)
sns.barplot(data=top_tags, x='freq', y='tag', palette='winter', ax=bar_ax)
plt.show()

In [None]:
# Horizontal bar chart of the 15 most frequent tags in group 4.
bar_fig, bar_ax = plt.subplots(figsize=(8, 4))
top_tags = tag_freq_g4.head(15)
sns.barplot(data=top_tags, x='freq', y='tag', palette='Reds_r', ax=bar_ax)
plt.show()

In [None]:
# Horizontal bar chart of the 15 most frequent tags in group 5.
bar_fig, bar_ax = plt.subplots(figsize=(8, 4))
top_tags = tag_freq_g5.head(15)
sns.barplot(data=top_tags, x='freq', y='tag', palette='Blues_r', ax=bar_ax)
plt.show()