In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
!python --version

Python 3.7.12


In [6]:
import sklearn

sklearn.__version__

'0.23.2'

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 텍스트 데이터 준비
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "Never jump over the lazy dog.",
    "The dog is very lazy.",
    "A quick brown fox is quick."
]

In [7]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))
count_matrix = count_vectorizer.fit_transform(documents)

print("--- CountVectorizer 결과 ---")
print("단어 사전 (Vocabulary):", count_vectorizer.get_feature_names())
print("변환된 행렬 (Sparse Matrix):\n", count_matrix.toarray())
print("변환된 행렬 shape:", count_matrix.shape)

--- CountVectorizer 결과 ---
단어 사전 (Vocabulary): ['brown', 'dog', 'fox', 'jump', 'jumps', 'lazy', 'quick']
변환된 행렬 (Sparse Matrix):
 [[1 1 1 0 1 1 1]
 [0 1 0 1 0 1 0]
 [0 1 0 0 0 1 0]
 [1 0 1 0 0 0 2]]
변환된 행렬 shape: (4, 7)


In [9]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

print("\n--- TfidfVectorizer 결과 ---")
print("단어 사전 (Vocabulary):", tfidf_vectorizer.get_feature_names())
print("변환된 행렬 (Sparse Matrix):\n", tfidf_matrix.toarray())
print("변환된 행렬 shape:", tfidf_matrix.shape)


--- TfidfVectorizer 결과 ---
단어 사전 (Vocabulary): ['brown', 'dog', 'fox', 'jump', 'jumps', 'lazy', 'quick']
변환된 행렬 (Sparse Matrix):
 [[0.41101031 0.33274827 0.41101031 0.         0.52131446 0.33274827
  0.41101031]
 [0.         0.47380449 0.         0.74230628 0.         0.47380449
  0.        ]
 [0.         0.70710678 0.         0.         0.         0.70710678
  0.        ]
 [0.40824829 0.         0.40824829 0.         0.         0.
  0.81649658]]
변환된 행렬 shape: (4, 7)


In [11]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.spatial.distance import jaccard
import numpy as np

# 코사인 유사도 (기본)
cosine_sim = cosine_similarity(tfidf_matrix)
# 개별 비교 시 아래처럼 작성 가능 ([[0.31531524]])
print(cosine_similarity(tfidf_matrix[0], tfidf_matrix[1]))

# 유클리디언 거리 → 유사도로 변환 (1 / (1 + 거리))
euclidean_dist = euclidean_distances(tfidf_matrix)
euclidean_sim = 1 / (1 + euclidean_dist)

# 맨하탄 거리 → 유사도로 변환
manhattan_dist = manhattan_distances(tfidf_matrix)
manhattan_sim = 1 / (1 + manhattan_dist)

# 자카드 유사도 (TF-IDF는 연속값이므로 주로 binary로 변환 후 사용)
binary_matrix = (tfidf_matrix > 0).astype(int)
n_docs = len(documents)
jaccard_sim = np.zeros((n_docs, n_docs))

for i in range(n_docs):
    for j in range(n_docs):
        jaccard_sim[i, j] = 1 - jaccard(binary_matrix[i], binary_matrix[j])  # 1 - 거리 = 유사도

# 결과 출력
print("=== Cosine Similarity ===\n", np.round(cosine_sim, 3))
'''
[[1.    0.315 0.471 0.671]
 [0.315 1.    0.67  0.   ]
 [0.471 0.67  1.    0.   ]
 [0.671 0.    0.    1.   ]]
'''
print("\n=== Euclidean Similarity ===\n", np.round(euclidean_sim, 3))
'''
[[1.    0.461 0.493 0.552]
 [0.461 1.    0.552 0.414]
 [0.493 0.552 1.    0.414]
 [0.552 0.414 0.414 1.   ]]
'''
print("\n=== Manhattan Similarity ===\n", np.round(manhattan_sim, 3))
'''
[[1.    0.265 0.285 0.385]
 [0.265 1.    0.453 0.231]
 [0.285 0.453 1.    0.247]
 [0.385 0.231 0.247 1.   ]]
'''
print("\n=== Jaccard Similarity ===\n", np.round(jaccard_sim, 3))

[[0.31531524]]
=== Cosine Similarity ===
 [[1.    0.315 0.471 0.671]
 [0.315 1.    0.67  0.   ]
 [0.471 0.67  1.    0.   ]
 [0.671 0.    0.    1.   ]]

=== Euclidean Similarity ===
 [[1.    0.461 0.493 0.552]
 [0.461 1.    0.552 0.414]
 [0.493 0.552 1.    0.414]
 [0.552 0.414 0.414 1.   ]]

=== Manhattan Similarity ===
 [[1.    0.265 0.285 0.385]
 [0.265 1.    0.453 0.231]
 [0.285 0.453 1.    0.247]
 [0.385 0.231 0.247 1.   ]]

=== Jaccard Similarity ===
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. 데이터 준비 (예시: 간단한 감성 데이터)
# 실제 데이터는 더 많은 양과 다양한 감성 표현을 포함해야 함
texts = [
    "이 영화 정말 최고다! 강력 추천합니다.", # 긍정
    "시간 낭비였어요. 너무 지루하고 재미없네요.", # 부정
    "그냥 볼만했어요. 특별히 좋지도 나쁘지도 않아요.", # 중립
    "인생 영화 등극! 다시 보고 싶어요.", # 긍정
    "최악의 경험. 돈 아까워요.", # 부정
    "나쁘지 않은데, 기대했던 것보다는 별로였어요.", # 부정 (중립에 가까움)
    "딱 평범하게 볼만한 것 같아요." # 중립
]
labels = [1, 0, 2, 1, 0, 0, 2] # 1: 긍정, 0: 부정, 2: 중립

df_sentiment = pd.DataFrame({'text': texts, 'sentiment': labels})

# 2. 텍스트 전처리 (간단화) 및 벡터화
# 실제로는 토큰화, 불용어 제거 등 더 복잡한 전처리 필요
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X_vec = vectorizer.fit_transform(df_sentiment['text'])
y_sent = df_sentiment['sentiment']

X_train_sent, X_test_sent, y_train_sent, y_test_sent = train_test_split(X_vec, y_sent, test_size=0.3, random_state=42, stratify=y_sent)

# 3. 모델 학습 (로지스틱 회귀)
sentiment_model = LogisticRegression(random_state=42, max_iter=1000)
sentiment_model.fit(X_train_sent, y_train_sent)

# 4. 예측 및 평가
y_pred_sent = sentiment_model.predict(X_test_sent)
print("--- 감성 분석 모델 평가 ---")
print(f"정확도: {accuracy_score(y_test_sent, y_pred_sent):.3f}")
print("분류 리포트:\n", classification_report(y_test_sent, y_pred_sent))

--- 감성 분석 모델 평가 ---
정확도: 0.333
분류 리포트:
               precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1

    accuracy                           0.33         3
   macro avg       0.11      0.33      0.17         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
import re
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# 1. 텍스트 데이터 준비
documents_lda = [
    "machine learning is a field of artificial intelligence",
    "deep learning is a subset of machine learning",
    "natural language processing uses machine learning techniques",
    "computer vision is another field of artificial intelligence",
    "data science combines statistics and machine learning",
    "neural networks are used in deep learning"
]

# 2. 벡터화 (CountVectorizer)
# - TF-IDF가 아니라 단어 빈도를 써야 LDA에서 확률 해석이 가능
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents_lda)

# 3. LDA 모델 학습
# n_components: 토픽 개수
lda_model = LatentDirichletAllocation(
    n_components=2,
    random_state=42,
    learning_method='batch',
    max_iter=20
)
lda_model.fit(X)

# 4. 단어 사전 (피처 이름)
terms = vectorizer.get_feature_names()

# 5. 토픽별 주요 단어 출력
def print_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}: ", end="")
        top_features = topic.argsort()[:-n_top_words - 1:-1]
        print(" + ".join([f"{feature_names[i]} ({topic[i]:.2f})" for i in top_features]))

print("--- LDA 토픽 확인 ---")
print_topics(lda_model, terms, n_top_words=10)
'''
Topic 0: artificial (2.48) + field (2.48) + intelligence (2.48) + learning (1.95) + deep (1.50) + computer (1.50) + vision (1.50) + neural (1.49) + networks (1.49) + used (1.49)
Topic 1: learning (5.05) + machine (4.26) + natural (1.50) + techniques (1.50) + processing (1.50) + uses (1.50) + language (1.50) + deep (1.50) + combines (1.50) + statistics (1.50)
'''

# 6. 문서별 토픽 분포 확인
doc_topic_distr = lda_model.transform(X)

print("\n--- 문서별 토픽 분포 ---")
for i, topic_probs in enumerate(doc_topic_distr):
    print(f"Document {i}: {[(j, round(p, 3)) for j, p in enumerate(topic_probs)]}")
'''
Document 0: [(0, 0.698), (1, 0.302)]
Document 1: [(0, 0.1), (1, 0.9)]
Document 2: [(0, 0.066), (1, 0.934)]
Document 3: [(0, 0.915), (1, 0.085)]
Document 4: [(0, 0.076), (1, 0.924)]
Document 5: [(0, 0.883), (1, 0.117)]
'''

# 7. 토픽 일관성(Coherence) 유사 지표로 대체
# scikit-learn에는 gensim의 CoherenceModel이 없음.
# 대신 각 문서의 가장 높은 토픽 확률 평균을 간단한 "일관성" 근사값으로 사용 가능.
coherence_like = np.mean(np.max(doc_topic_distr, axis=1))
print(f"\nApprox. Coherence (mean dominant topic prob): {coherence_like:.3f}") # 0.876

--- LDA 토픽 확인 ---
Topic 0: artificial (2.48) + field (2.48) + intelligence (2.48) + learning (1.95) + deep (1.50) + computer (1.50) + vision (1.50) + neural (1.49) + networks (1.49) + used (1.49)
Topic 1: learning (5.05) + machine (4.26) + natural (1.50) + techniques (1.50) + processing (1.50) + uses (1.50) + language (1.50) + deep (1.50) + combines (1.50) + statistics (1.50)

--- 문서별 토픽 분포 ---
Document 0: [(0, 0.698), (1, 0.302)]
Document 1: [(0, 0.1), (1, 0.9)]
Document 2: [(0, 0.066), (1, 0.934)]
Document 3: [(0, 0.915), (1, 0.085)]
Document 4: [(0, 0.076), (1, 0.924)]
Document 5: [(0, 0.883), (1, 0.117)]

Approx. Coherence (mean dominant topic prob): 0.876


In [14]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1. 텍스트 데이터 준비 (예시)
texts = [
    "I love this movie, it's amazing!",
    "This film was terrible, a complete waste of time.",
    "The acting was good but the plot was boring.",
    "Highly recommend this, a must-watch.",
    "Never watch this again, so bad.",
    "It was okay, not great not terrible."
]
labels = np.array([1, 0, 0, 1, 0, 0]) # 1: 긍정, 0: 부정

# 2. 텍스트 전처리 및 벡터화 (토큰화 및 시퀀스 패딩)
# num_words: 사용할 단어의 최대 개수
tokenizer = Tokenizer(num_words=1000, oov_token="<unk>")
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# 텍스트를 시퀀스로 변환
sequences = tokenizer.texts_to_sequences(texts)

# 시퀀스 패딩 (길이 맞추기)
# maxlen: 시퀀스의 최대 길이
# padding: 'pre' (앞에 0 채우기) 또는 'post' (뒤에 0 채우기)
padded_sequences = pad_sequences(sequences, maxlen=10, padding='post')

# 3. 훈련/테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.3, random_state=42, stratify=labels)

# 4. LSTM 모델 구축
# vocab_size: 단어 사전의 크기
# embedding_dim: 임베딩 벡터의 차원
# input_length: 입력 시퀀스의 길이 (maxlen과 동일)
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=100, input_length=10),
    SpatialDropout1D(0.2), # 과적합 방지
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid') # 이진 분류이므로 sigmoid
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
'''
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 10, 100)           3400      
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 10, 100)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
=================================================================
Total params: 83,901
Trainable params: 83,901
Non-trainable params: 0
'''

# 5. 모델 학습
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

# 6. 예측 및 평가
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)

print("\n--- LSTM 텍스트 분류 모델 평가 ---")
print(classification_report(y_test, y_pred))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 100)           3400      
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 10, 100)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 83,901
Trainable params: 83,901
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.

--- LSTM 텍스트 분류 모델 평가 ---
        

  _warn_prf(average, modifier, msg_start, len(result))
