-
Notifications
You must be signed in to change notification settings - Fork 0
/
eda.py
170 lines (132 loc) · 5.01 KB
/
eda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import SparsePCA, PCA
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib as mpl
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from collections import Counter
import re
# Genre names. Each one is also the name of a sub-directory under ./lyrics/,
# and a genre's position in this list is used as its integer class label.
genre_list = [
    "christian",
    "country-music",
    "hip-hop-rap",
    "pop",
    "rhythm-blues",
    "rock",
]
def preprocessing():
    """Load every lyrics file and convert it to a list of cleaned word tokens.

    For each genre in the module-level ``genre_list`` all files under
    ``./lyrics/<genre>/`` are read. The text is lower-cased, every run of
    characters outside ``[a-z0-9]`` is replaced by a single space, the result
    is word-tokenized and English stop words are dropped.

    Returns:
        A tuple ``(x, y)``: ``x[k]`` is the token list of the k-th song and
        ``y[k]`` is the index of that song's genre in ``genre_list``.
    """
    # Load the raw data.
    texts = []
    labels = []
    for label, genre in enumerate(genre_list):
        genre_dir = os.path.join(".", "lyrics", genre)
        # os.listdir returns entries in arbitrary order; sort so that repeated
        # runs see the songs in the same order.
        song_list = sorted(os.listdir(genre_dir))
        print(len(song_list))
        for song in song_list:
            # The context manager closes the handle after reading.
            with open(os.path.join(genre_dir, song), "r", encoding="UTF8") as f:
                # Collapse multi-line lyrics into one line.
                texts.append(f.read().replace("\n", " "))
            labels.append(label)

    stop_words = set(stopwords.words("english"))
    x = []
    for text in texts:
        # Sentence-tokenize, then join with spaces. Do NOT use str(list) here:
        # the list repr escapes non-printable characters (e.g. "\xa0" is
        # rendered as the four characters \xa0), and after punctuation
        # stripping that would leave bogus tokens such as "xa0".
        joined = " ".join(sent_tokenize(text))
        # Lower-case and replace punctuation / non-ASCII runs by a space.
        normalized = re.sub(r"[^a-z0-9]+", " ", joined.lower())
        # Word-tokenize and remove stop words.
        x.append([word for word in word_tokenize(normalized) if word not in stop_words])
    return x, labels
# Build the corpus once at module level: x holds one token list per song,
# y the matching genre indices.
x, y = preprocessing()
def w2v_visualising(x):
    """Train a Word2Vec model on ``x`` and show a 2-D t-SNE map of 100 words.

    Original author's observation: many religion-related words show up in
    the plot.

    Args:
        x: Sequence of documents, each a list of word tokens.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # `workers` is a thread count and must be positive. With the original
    # value of -1 gensim starts no training threads, so the vectors stay at
    # their random initialisation.
    w2v = Word2Vec(sentences=x, min_count=2, workers=os.cpu_count() or 1)
    mpl.rcParams['axes.unicode_minus'] = False

    keyed_vectors = w2v.wv
    # gensim >= 4 exposes the vocabulary as `key_to_index`; gensim 3 as `vocab`.
    if hasattr(keyed_vectors, "key_to_index"):
        vocab = list(keyed_vectors.key_to_index)
    else:
        vocab = list(keyed_vectors.vocab)
    print(vocab[:20])
    print(len(x))

    # Only the first 100 words are visualised.
    shown = vocab[:100]
    # Index through `.wv`: indexing the model object itself is deprecated in
    # gensim 3 and removed in gensim 4.
    X_tsne = TSNE(n_components=2).fit_transform(keyed_vectors[shown])
    df = pd.DataFrame(X_tsne, index=shown, columns=['x', 'y'])

    fig = plt.figure()
    fig.set_size_inches(40, 20)
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(df['x'], df['y'])
    # Label every point with its word.
    for word, pos in df.iterrows():
        ax.annotate(word, (pos['x'], pos['y']), fontsize=10)
    plt.show()
    return w2v
# Train the embedding, show the t-SNE plot, and keep the returned model in the
# module-level name `w2v`.
w2v = w2v_visualising(x)
def _count_long_words(docs):
    """Return a Counter of all words longer than two characters in ``docs``."""
    return Counter(word for doc in docs for word in doc if len(word) > 2)


def frequent_words(x, boundaries=(78, 164, 252, 350, 436)):
    """Print word frequencies per genre and for the whole corpus.

    Only words longer than two characters are counted.

    Args:
        x: Sequence of documents (token lists), ordered genre by genre in the
            same order as the module-level ``genre_list``.
        boundaries: Indices in ``x`` at which the next genre starts. The
            default reproduces the split points previously hard-coded for
            this data set; pass different values for a different corpus.
    """
    # Split by genre: consecutive slices [0, b0), [b0, b1), ..., [b_last, end).
    starts = (0, *boundaries)
    ends = (*boundaries, len(x))
    for genre_name, start, end in zip(genre_list, starts, ends):
        print(genre_name, "의 빈도수 : ", _count_long_words(x[start:end]))

    # Whole corpus, regardless of genre.
    print("총 단어 빈도수 : ", _count_long_words(x))
# Print the per-genre and overall word-frequency tables for the loaded corpus.
frequent_words(x)
def kmeans_pca(x):
    """Cluster the documents with k-means and plot them on two SparsePCA axes.

    The documents are turned into bag-of-words counts, clustered into three
    groups, cross-tabulated against the genre labels, and finally projected
    to two components for a scatter plot with one marker per cluster.

    Reads the module-level ``y`` as the per-document genre labels.

    Args:
        x: Sequence of documents, each either a list of tokens or an
            already-joined string. The argument is not modified.
    """
    # Build new strings rather than overwriting the caller's list in place.
    docs = [doc if isinstance(doc, str) else " ".join(doc) for doc in x]
    x_vect = CountVectorizer().fit_transform(docs)

    # init="random": per the original author's note, the default k-means++
    # initialisation is said to work poorly for document clustering.
    kmeans = KMeans(n_clusters=3, init="random", random_state=0)
    # Fit exactly once, on the count matrix (not on the raw strings).
    labels = kmeans.fit_predict(x_vect)
    print(len(labels))

    DF = pd.DataFrame({"target": y, "cluster": labels})
    print(labels)
    # Original author's observation: the clusters are skewed toward
    # particular targets.
    print(DF.groupby(['target', 'cluster']).size())

    # Original author's rationale: a "sparse" method was chosen because the
    # text is represented as a sparse matrix.
    # NOTE(review): SparsePCA produces sparse *components* and still needs a
    # dense input (hence toarray()); TruncatedSVD is the usual choice for
    # sparse input - confirm which was intended.
    pca = SparsePCA(n_components=2, random_state=0, n_jobs=-1, verbose=1)
    pca_transformed = pca.fit_transform(x_vect.toarray())
    DF["pca_x"] = pca_transformed[:, 0]
    DF["pca_y"] = pca_transformed[:, 1]

    # One scatter series (and marker shape) per cluster.
    for cluster_id, marker in enumerate(('o', 'v', '^')):
        members = DF[DF['cluster'] == cluster_id]
        plt.scatter(x=members['pca_x'], y=members['pca_y'], marker=marker)
    plt.xlabel("PCA1")
    plt.ylabel("PCA2")
    plt.show()
# Run the clustering / projection step on the loaded corpus and show the plot.
kmeans_pca(x)