# 벡터화 객체 구축

In [1]:
import pandas as pd

sentences = [
    "It was the best of times",
    'it was the worst of times',
    'it was the age of wisdom',
    'it was the age of foolishness',
]

# Whitespace tokenization: one list of tokens per sentence. Case is kept,
# so "It" and "it" are distinct tokens.
tokenized_sentences = [sentence.split() for sentence in sentences]

# The vocabulary is the set of distinct tokens over all sentences.
vocabulary = {token for tokens in tokenized_sentences for token in tokens}

# Show every vocabulary word next to its position in the set's iteration order.
pd.DataFrame([[word, index] for index, word in enumerate(vocabulary)])

Unnamed: 0,0,1
0,It,0
1,foolishness,1
2,the,2
3,was,3
4,times,4
5,worst,5
6,it,6
7,wisdom,7
8,age,8
9,of,9


In [2]:
# Display the vocabulary set. The rendering below is alphabetical, which is
# not the enumeration order shown in the table above.
vocabulary

{'It',
 'age',
 'best',
 'foolishness',
 'it',
 'of',
 'the',
 'times',
 'was',
 'wisdom',
 'worst'}

# 문서 벡터화

In [3]:
def onehot_encode(tokenized_sentence):
    """Return a binary presence vector for ``tokenized_sentence``.

    The result has one entry per word of the module-level ``vocabulary``,
    in that collection's iteration order: 1 if the word occurs in
    ``tokenized_sentence``, 0 otherwise. Tokens that are not part of the
    vocabulary are ignored, and repeated tokens still yield a single 1.
    """
    return [int(word in tokenized_sentence) for word in vocabulary]

# One presence vector per tokenized sentence, in the same order as `sentences`.
onehot = list(map(onehot_encode, tokenized_sentences))

# Print each vector next to the sentence it encodes.
for sentence, oh in zip(sentences, onehot):
    print(f"{oh}: {sentence}")

[1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1]: It was the best of times
[0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0]: it was the worst of times
[0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0]: it was the age of wisdom
[0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0]: it was the age of foolishness


# 어휘에 없는 단어를 사용하는 문서

In [4]:
# "is" does not occur in the vocabulary, so it has no position in the vector;
# only the vocabulary words present in the sentence are set to 1.
onehot_encode("the age of wisdom is the best of times".split())

[0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1]

# 사전에 포함되지 않은 단어만으로 문장을 벡터화

In [5]:
# None of these tokens are vocabulary words, so every entry is 0. Note that
# str.split() keeps punctuation attached to tokens ("movies.", "too.").
onehot_encode("John likes to watch movies. Mary likes movies too.".split())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# 문서-용어 행렬

In [6]:
# Document-term matrix: one row per sentence, one column per vocabulary word.
df = pd.DataFrame(onehot, columns=list(vocabulary))  # convert the set to a list
print(df)

   It  foolishness  the  was  times  worst  it  wisdom  age  of  best
0   1            0    1    1      1      0   0       0    0   1     1
1   0            0    1    1      1      1   1       0    0   1     0
2   0            0    1    1      0      0   1       1    1   1     0
3   0            1    1    1      0      0   1       0    1   1     0


# 유사성 계산

In [7]:
# Element-wise AND of the first two one-hot vectors: a 1 marks a vocabulary
# word that occurs in both sentences.
sim = [first & second for first, second in zip(onehot[0], onehot[1])]
sim

[0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0]

# 유사성 행렬

In [9]:
import numpy as np

# Entry (i, j) counts the vocabulary words shared by sentences i and j; the
# diagonal holds the number of distinct vocabulary words in each sentence.
np.asarray(onehot) @ np.transpose(onehot)

array([[6, 4, 3, 3],
       [4, 6, 4, 4],
       [3, 4, 6, 5],
       [3, 4, 5, 6]])

# 사이킷런을 사용한 원-핫 인코딩

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

lb = MultiLabelBinarizer()
# fit() returns the binarizer itself, so its result must not be assigned back
# to `vocabulary`: that would replace the word set that onehot_encode()
# iterates over with the fitted estimator.
lb.fit([vocabulary])
# One row per sentence; columns follow lb.classes_ (sorted), so their order
# can differ from the hand-built `onehot` vectors.
lb.transform(tokenized_sentences)

# 단어 가방 모델

# 사이킷런 CountVectorizer

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based vectorizer with default settings; it is fitted in a later cell.
cv = CountVectorizer()

In [18]:
# Extend the original corpus with two sentences about a different topic.
more_sentences = [
    *sentences,
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]

In [19]:
# Learn the vocabulary from the extended corpus.
cv.fit(more_sentences)

In [21]:
# Learned feature names. As the output shows, they are lowercased, stripped
# of punctuation and listed alphabetically.
print(cv.get_feature_names_out())

['age' 'also' 'best' 'foolishness' 'football' 'games' 'it' 'john' 'likes'
 'mary' 'movies' 'of' 'the' 'times' 'to' 'too' 'was' 'watch' 'wisdom'
 'worst']


In [22]:
# Encode every sentence as a row of term counts (document-term matrix).
dt = cv.transform(more_sentences)

In [26]:
# Dense view of the count matrix with one labelled column per feature.
# Note: this rebinds `df`, which previously held the one-hot matrix.
df = pd.DataFrame(data=dt.toarray(), columns=cv.get_feature_names_out())
df

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,2,1,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0


In [29]:
# Term counts of the first sentence (row 0), indexed by feature name.
df.iloc[0]

Unnamed: 0,0
age,0
also,0
best,1
foolishness,0
football,0
games,0
it,1
john,0
likes,0
mary,0


## 유사성 계산
  - 문서 벡터 간의 각도를 유사성의 척도로 활용하는 방법

In [33]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between the count vectors of the first two sentences.
cosine_similarity(dt[0], dt[1])

array([[0.83333333]])

In [34]:
# Pairwise cosine similarity between all rows of the count matrix; with a
# single argument the matrix is compared against itself.
pd.DataFrame(cosine_similarity(dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.833333,0.666667,0.666667,0.0,0.0
1,0.833333,1.0,0.666667,0.666667,0.0,0.0
2,0.666667,0.666667,1.0,0.833333,0.0,0.0
3,0.666667,0.666667,0.833333,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.524142
5,0.0,0.0,0.0,0.0,0.524142,1.0


# TF-IDF 모델
  - 자주 등장하는 단어의 가중치를 줄이는 동시에 흔하지 않은 단어의 가중치를 높인다.

In [35]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw term counts in `dt` with TF-IDF.
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)
# Dense view with one labelled column per feature. In the output, words that
# occur in fewer sentences (e.g. "best") get larger weights than words shared
# by many sentences (e.g. "it").
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.56978,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.56978
2,0.467228,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.56978,0.0
3,0.467228,0.0,0.0,0.56978,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305609,0.501208,0.250604,0.611219,0.0,0.0,0.0,0.250604,0.305609,0.0,0.250604,0.0,0.0
5,0.0,0.419233,0.0,0.0,0.419233,0.419233,0.0,0.0,0.343777,0.343777,0.0,0.0,0.0,0.0,0.343777,0.0,0.0,0.343777,0.0,0.0


In [37]:
# Pairwise cosine similarity between all TF-IDF rows; with a single argument
# the matrix is compared against itself.
pd.DataFrame(cosine_similarity(tfidf_dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.675351,0.457049,0.457049,0.0,0.0
1,0.675351,1.0,0.457049,0.457049,0.0,0.0
2,0.457049,0.457049,1.0,0.675351,0.0,0.0
3,0.457049,0.457049,0.675351,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.43076
5,0.0,0.0,0.0,0.0,0.43076,1.0


# 데이터 다운로드

In [39]:
import pandas as pd
import requests

# Download URL of the gzipped ABC news headline dataset
url = "https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/data/abcnews/abcnews-date-text.csv.gz"

# Local path the archive is saved to
ABCNEWS_FILE = "abcnews-date-text.csv.gz"

# Download the archive. The (connect, read) timeout keeps the cell from
# hanging forever on a stalled connection, and network-level errors are
# reported in the same way as a non-200 response instead of raising.
try:
    response = requests.get(url, timeout=(10, 60))
except requests.RequestException as e:
    print(f"Failed to download the file: {e}")
else:
    if response.status_code == 200:
        with open(ABCNEWS_FILE, "wb") as file:
            file.write(response.content)
        print(f"File downloaded successfully as {ABCNEWS_FILE}")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")

# Load the CSV (pandas decompresses the .gz transparently) and parse the
# publish_date column as dates. Errors are reported rather than raised so the
# cell always finishes; `headlines` stays undefined if loading fails.
try:
    headlines = pd.read_csv(ABCNEWS_FILE, parse_dates=["publish_date"])
    print(headlines.head())
except Exception as e:
    print(f"Error reading the CSV file: {e}")

File downloaded successfully as abcnews-date-text.csv.gz
  publish_date                                      headline_text
0   2003-02-19  aba decides against community broadcasting lic...
1   2003-02-19     act fire witnesses must be aware of defamation
2   2003-02-19     a g calls for infrastructure protection summit
3   2003-02-19           air nz staff in aust strike for pay rise
4   2003-02-19      air nz strike to affect australian travellers


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Tokenize and TF-IDF-weight the headlines in one step. This rebinds `tfidf`
# and `dt`, which earlier cells used for the toy corpus.
tfidf = TfidfVectorizer()
dt = tfidf.fit_transform(headlines['headline_text'])
dt

<1103663x95878 sparse matrix of type '<class 'numpy.float64'>'
	with 7001357 stored elements in Compressed Sparse Row format>

In [42]:
%time
cosine_similarity(dt[0:10000], dt[0:10000])

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.63 µs


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.16913596,
        0.16792138],
       [0.        , 0.        , 0.        , ..., 0.16913596, 1.        ,
        0.33258708],
       [0.        , 0.        , 0.        , ..., 0.16792138, 0.33258708,
        1.        ]])

## 특성 차원 축소
  - 머신러닝 알고리즘은 계산 집약적이며, 계산 복잡도가 특성 수에 대해 다항식으로 증가하는 경우가 많다. 따라서 실제로 필요한 특성만 남기도록 특성 수를 줄이는 데 중점을 둔다.

## 불용어 제거
  - 의미가 없는 단어를 제거

In [45]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
# TfidfVectorizer's stop_words is passed as a list here (STOP_WORDS is
# presumably a set — confirm against the spaCy version in use).
stopwords_list = list(stopwords)
# Number of stop words that will be excluded from the vocabulary.
print(len(stopwords))
# Same vectorization as before, but without stop words.
tfidf = TfidfVectorizer(stop_words=stopwords_list)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

326




<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

# 최소빈도
  - 코사인 유사도의 정의를 살펴보면, 두 벡터가 해당 인덱스에서 값이 0이 아닌 경우에만 유사도에 반영됨을 쉽게 알 수 있다. 즉 한 번만 등장하는 단어를 모두 무시할 수 있다. 이때 사용하기 위해 min_df라는 매개변수가 있다.

In [46]:
# min_df=2: drop terms that appear in fewer than two headlines.
tfidf = TfidfVectorizer(stop_words=stopwords_list, min_df=2)
dt = tfidf.fit_transform(headlines['headline_text'])
dt
# 5607113 stored elements



<1103663x58527 sparse matrix of type '<class 'numpy.float64'>'
	with 5607113 stored elements in Compressed Sparse Row format>

In [47]:
# A float min_df is a proportion of documents (per the scikit-learn docs):
# keep only terms that appear in at least 0.01% of the headlines.
tfidf = TfidfVectorizer(stop_words=stopwords_list, min_df=0.0001)
dt = tfidf.fit_transform(headlines['headline_text'])
dt
# 4816381 stored elements



<1103663x6772 sparse matrix of type '<class 'numpy.float64'>'
	with 4816381 stored elements in Compressed Sparse Row format>

In [49]:
# max_df=0.1: drop terms that appear in more than 10% of the headlines.
# The resulting shape and element count equal those of the stop-word-only
# run, so this threshold removes nothing here.
tfidf = TfidfVectorizer(stop_words=stopwords_list, max_df=0.1)
dt = tfidf.fit_transform(headlines['headline_text'])
dt
# 5644186 stored elements



<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

## 언어분석 수행
  - spacy는 모든 헤드라인을 원형으로 복원
  - 원형으로 복원한 후 분석을 수행하면 어휘 사전을 더 작게 만들 수 있다.

In [None]:
import spacy

# The 'en' shortcut name was removed in spaCy v3; pipelines are loaded by
# their full package name (install it with
# `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')
# Universal POS tags for nouns, proper nouns, adjectives, adverbs and verbs
# (the list is broader than the variable name suggests).
nouns_adjectives_verbs = ['NOUN','PROPN','ADJ','ADV','VERB']
