In [1]:
!pip install sklearn
!pip install gensim

Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2343 sha256=619174b376d7be0d2d578cd1015bbd0c0503ce509f111325679ab6f4924a0e8a
  Stored in directory: /Users/harrychang/Library/Caches/pip/wheels/14/25/f7/1cc0956978ae479e75140219088deb7a36f60459df242b1a72
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post1
Collecting gensim
  Downloading gensim-4.3.1-cp38-cp38-macosx_10_9_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting smart-open>=1.8.1
  Downloading smart_open-6.3.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/5

In [2]:
from sklearn.datasets import fetch_20newsgroups
from gensim.models import Word2Vec

newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

def preprocess_newsgroups(newsgroups_data):
    sentences = [document.split('\n') for document in newsgroups_data.data]
    sentences = [[sentence.split() for sentence in document] for document in sentences]
    return [sentence for document in sentences for sentence in document]

sentences = preprocess_newsgroups(newsgroups)
model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4, sg=1)
model.save("newsgroups_word2vec.model")

In [21]:
def find_similar_words(model, word, topn=5):
    return model.wv.most_similar(word, topn=topn)

word2vec_model = Word2Vec.load("newsgroups_word2vec.model")
word = "computer"
similar_words = find_similar_words(word2vec_model, word)
print(f"Similar words to '{word}':")
for word, similarity in similar_words:
    print(f"{word} - {similarity:.2f}")

Similar words to 'computer':
workstation - 0.77
lab - 0.76
bulletin - 0.75
implementing - 0.73
packet - 0.73


In [3]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [9]:
def document_vector(document, model):
    doc_vec = np.zeros(model.vector_size)
    word_count = 0
    for word in document:
        if word in model.wv:
            doc_vec += model.wv[word]
            word_count += 1
    if word_count == 0:
        return doc_vec
    return doc_vec / word_count

In [10]:
newsgroups_word2vec_model = Word2Vec.load("newsgroups_word2vec.model")

articles = newsgroups.data
labels = newsgroups.target

In [11]:
X = np.array([document_vector(article.split(), newsgroups_word2vec_model) for article in articles])
y = np.array(labels)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.4607427055702918


In [15]:
positive_words = ['good', 'excellent', 'amazing', 'positive', 'happy']
negative_words = ['bad', 'terrible', 'awful', 'negative', 'unhappy']



In [16]:
def assign_sentiment(document):
    sentiment_score = 0
    for word in positive_words:
        sentiment_score += document.count(word)
    for word in negative_words:
        sentiment_score -= document.count(word)
    return 1 if sentiment_score > 0 else 0

In [17]:
sentiment_labels = [assign_sentiment(article) for article in articles]


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, sentiment_labels, test_size=0.2)

sentiment_classifier = LogisticRegression()
sentiment_classifier.fit(X_train, y_train)
y_pred = sentiment_classifier.predict(X_test)

In [19]:
accuracy = accuracy_score(y_test, y_pred)
print("Sentiment accuracy:", accuracy)

Sentiment accuracy: 0.8636604774535809
