In [17]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import gensim
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Preprocess the documents
def preprocess_documents(docs):
    processed_docs = []
    for doc in docs:
        processed_docs.append(simple_preprocess(doc))
    return processed_docs

# Create a vocabulary
def create_vocabulary(docs, min_count=5):
    dictionary = gensim.corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=min_count)
    return dictionary

# Train a Word2Vec model
def train_word_embedding_model(docs, size=100, window=5, min_count=5, workers=4):
    model = Word2Vec(docs, window=window, min_count=min_count, workers=workers)
    return model

# Apply the word embeddings to documents
def get_document_embeddings(docs, model):
    document_embeddings = []
    for doc in docs:
        word_embeddings = []
        for word in doc:
            if word in model.wv.key_to_index:
                word_embeddings.append(model.wv[word])
        if len(word_embeddings) > 0:
            document_embeddings.append(sum(word_embeddings) / len(word_embeddings))
        else:
            document_embeddings.append(np.zeros(model.vector_size))
    return document_embeddings


# Load the dataset
with open("merged_clean.txt", "r") as f:
    documents = f.read().split("\n\n\n\n")


preprocessed_docs = preprocess_documents(documents)
dictionary = create_vocabulary(preprocessed_docs)
model = train_word_embedding_model(preprocessed_docs)
document_embeddings = get_document_embeddings(preprocessed_docs, model)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/imrihaggin1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#Clustering

In [18]:
from sklearn.cluster import KMeans

# Example usage of clustering
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(document_embeddings)
clusters = kmeans.labels_



In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vectorize the documents using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Extract keywords using LDA
lda = LatentDirichletAllocation(n_components=10)
lda.fit(X)
topics = []
for topic in lda.components_:
    top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-10 - 1:-1]]
    topics.append(top_words)
    print(top_words)

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Reduce the dimensionality of the document embeddings using t-SNE
tsne = TSNE(n_components=2, init='random')
X_tsne = tsne.fit_transform(X)

# Visualize the resulting clusters
# Create a scatter plot with labels
# Define the color map and the labels for the topics
cmap = plt.cm.get_cmap('viridis', len(topics))
labels = [f"Topic {i+1}" for i in range(len(topics))]

# Plot the scatter plot
fig, ax = plt.subplots()
sc = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=topics, cmap=cmap)
plt.xlabel('t-SNE component 1')
plt.ylabel('t-SNE component 2')

# Add a color bar and a legend
cbar = plt.colorbar(sc, ticks=range(len(topics)))
cbar.ax.set_yticklabels(labels)
plt.legend(*sc.legend_elements(), loc="upper right", title="Topics")

# Show the plot
plt.show()



#Classification


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Example usage of classification
labels = [0, 0, 1] # Example labels for the documents
X_train, X_test, y_train, y_test = train_test_split(document_embeddings, labels, test_size=0.3, random_state=0)
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)


ValueError: Found input variables with inconsistent numbers of samples: [1215, 3]