In [None]:
import pandas as pd
import json
import langid
from sklearn.feature_extraction.text import TfidfVectorizer
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
from nltk import pos_tag, word_tokenize
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from collections import Counter
import ast

# Fetch the NLTK resources used for tokenization and part-of-speech tagging
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)


### Helper Functions for cleaning

In [None]:
def remove_non_english_tokens(text):
    """Keep only the whitespace-separated tokens that langid labels as English.

    Every token is classified on its own with ``langid.classify``; a token is
    kept when the first element of the classification result is ``'en'``.
    The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        language_code = langid.classify(token)[0]
        if language_code == 'en':
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
def remove_proper_nouns(text):
    """Strip proper nouns from ``text`` using NLTK part-of-speech tags.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Tokens tagged ``NNP`` (singular proper noun) or ``NNPS`` (plural proper
    noun) are discarded; the remaining tokens are joined with single spaces,
    so the original whitespace is not restored.
    """
    tagged_tokens = pos_tag(word_tokenize(text))

    kept_tokens = []
    for token, tag in tagged_tokens:
        if tag in ['NNP', 'NNPS']:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [None]:
def remove_emails_and_hyperlinks(text):
    """Delete e-mail addresses and http(s) URLs from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with every e-mail-like token and every ``http://`` or
        ``https://`` URL (matched case-insensitively) replaced by the empty
        string. Whitespace around a removed token is left untouched.
    """
    # Any run of non-space characters with an "@" in the middle counts as an e-mail.
    text_no_emails = re.sub(r'\S+@\S+', '', text)

    # Require the "://" separator so ordinary words that merely start with or
    # contain "http" (e.g. "httpd", "https") are no longer deleted.
    text_no_links = re.sub(r'https?://\S+', '', text_no_emails, flags=re.IGNORECASE)

    return text_no_links


In [None]:
def remove_numbers(text):
    """Return ``text`` with every run of digit characters removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

### Extract Tags from Full Text

In [None]:
# Load the raw dataset and keep only the rows whose `lang` column is 'en'
df = pd.read_csv("capstone_data.csv")
df = df.loc[df['lang'] == 'en']

In [None]:
# Run the cleaning helpers in sequence: language filter, e-mail/link removal,
# proper-noun removal, then digit removal.
cleaned_content = df["content"]
for cleaning_step in (
    remove_non_english_tokens,
    remove_emails_and_hyperlinks,
    remove_proper_nouns,
    remove_numbers,
):
    cleaned_content = cleaned_content.apply(cleaning_step)
df["clean_content"] = cleaned_content

In [None]:
# Checkpoint the cleaned frame to disk
df.to_csv(path_or_buf="df_clean.csv")

In [None]:
# KeyBERT keyword extractor built on the named embedding checkpoint.
# NOTE: the global name `model` is rebound to a BertModel in the clustering
# section below; extract_tags reads this global at call time.
keybert_backbone = "distilbert-base-nli-mean-tokens"
model = KeyBERT(model=keybert_backbone)

# Function to extract keywords (tags) for each text
def extract_tags(text, top_n=5, keyprob=False, min_score=0.1):
    """Return the KeyBERT keywords of ``text`` whose score reaches ``min_score``.

    Args:
        text: Document to extract keywords from.
        top_n: Maximum number of keyword candidates requested from the model.
        keyprob: Accepted for backward compatibility but currently ignored;
            only the keyword strings are returned, never their scores.
        min_score: Inclusive lower bound on a candidate's score. Defaults to
            0.1, the cut-off that used to be hard-coded here.

    Returns:
        List of keyword strings, in the order the model returned them.
    """
    # `model` is the notebook-level KeyBERT instance, looked up at call time.
    keywords = model.extract_keywords(text, top_n=top_n)
    return [keyword for keyword, score in keywords if score >= min_score]

# Tag every cleaned document, requesting up to 20 keyword candidates per text
df["tags_keybert"] = df["clean_content"].apply(extract_tags, top_n=20, keyprob=True)

In [None]:
# Persist the frame together with the extracted KeyBERT tags
df.to_csv(path_or_buf="keybert_thresh_onlyen_0.1_nltk.csv")

### Document Clusters from Full Text

In [None]:
# Saved KeyBERT tags produced by the tagging section
df_key = pd.read_csv("keybert_thresh_onlyen_0.1_nltk.csv")

# Raw dataset, restricted again to rows whose `lang` column is 'en'
df = pd.read_csv("capstone_data.csv")
df = df.loc[df['lang'] == 'en']

In [None]:
# Join the raw rows with their KeyBERT tags on `id`, keep the `_x`-suffixed
# copies of the overlapping columns, and restore their plain names.
df = (
    df.merge(df_key, left_on="id", right_on="id")
    .loc[:, ["id","content_x","tags_x","tags_keybert","lang_x"]]
    .rename(columns={"content_x": "content", "tags_x": "tags","lang_x":"lang"})
)

In [None]:
# Re-run the cleaning helpers in sequence: language filter, e-mail/link
# removal, proper-noun removal, then digit removal.
cleaned_content = df["content"]
for cleaning_step in (
    remove_non_english_tokens,
    remove_emails_and_hyperlinks,
    remove_proper_nouns,
    remove_numbers,
):
    cleaned_content = cleaned_content.apply(cleaning_step)
df["clean_content"] = cleaned_content

In [None]:
# Load the pretrained BERT tokenizer and encoder from the same checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Function to extract BERT embeddings for a document
def extract_bert_embeddings(text):
    """Embed ``text`` as the mean of the encoder's last-layer token vectors.

    The text is tokenized (with truncation) by the notebook-level
    ``tokenizer`` and passed through the notebook-level ``model`` with
    gradient tracking disabled. The last hidden state is averaged over
    dimension 1, squeezed, and returned as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():  # inference only, no autograd graph needed
        model_output = model(**encoded)
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

# Embed every cleaned document with BERT
df['embeddings'] = df['clean_content'].map(extract_bert_embeddings)

# Collect the per-document vectors into a plain list for the clustering steps
embeddings = df['embeddings'].tolist()

# Apply agglomerative clustering with a chosen number of clusters



In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
# Build the complete-linkage hierarchy over cosine distances and plot it
linkage_matrix = linkage(embeddings, method='complete', metric='cosine')
dendrogram(linkage_matrix)
plt.show()

# Target number of clusters used to estimate a cut height
n_clusters = 800

# Third column of the linkage matrix: the distance recorded for each merge
distances = linkage_matrix[:, 2]

# Distance of the (n_clusters - 1)-th merge counted from the end
distance_threshold = distances[1 - n_clusters]
print("Estimated Distance Threshold:", distance_threshold)

In [None]:
# Cut the hierarchy at a fixed cosine-distance threshold rather than a fixed
# cluster count. Thresholds recorded for several target cluster counts:
#   500 -- 0.24620008010549643
#   650 -- 0.21801996529283096
#   750 -- 0.20361340364626146
#   800 -- 0.1958287028886031
cluster_distance_threshold = 0.20361340364626146  # the 750-cluster entry above

clustering_kwargs = dict(
    n_clusters=None,
    linkage='complete',
    distance_threshold=cluster_distance_threshold,
)
try:
    # scikit-learn >= 1.2 calls the distance parameter `metric`; the old
    # `affinity` keyword was deprecated in 1.2 and removed in 1.4.
    agg_clustering = AgglomerativeClustering(metric='cosine', **clustering_kwargs)
except TypeError:
    # Older scikit-learn releases only accept the `affinity` keyword.
    agg_clustering = AgglomerativeClustering(affinity='cosine', **clustering_kwargs)

# Assign one cluster label per document
df['cluster_label'] = agg_clustering.fit_predict(embeddings)

In [None]:
# Columns exported for inspecting the clusters
df_clus = df.loc[:, ['id','cluster_label','tags','tags_keybert','content']]

In [None]:
# Save the cluster assignments alongside tags and content
df_clus.to_csv(path_or_buf="full_text_clustering_id_750.csv")

In [None]:
# Display extracted tags for each cluster

unique_clusters = df["cluster_label"].unique()
for cluster_label in unique_clusters:
    in_cluster = df["cluster_label"] == cluster_label
    cluster_documents = df.loc[in_cluster, "tags_keybert"].tolist()
    print(f"Cluster {cluster_label}:")

    # Each entry is parsed with ast.literal_eval and its items are pooled
    # into one flat list for the whole cluster.
    cluster_tags = [
        keyword
        for document in cluster_documents
        for keyword in ast.literal_eval(document)
    ]
    print(cluster_tags)

    if cluster_tags:
        # Five most frequent tags in the cluster, as (tag, count) pairs
        top_5 = Counter(cluster_tags).most_common(5)
        for i, tag in enumerate(top_5):
            print(f"{i + 1}. {tag}")
        print("\n")

    print("\n")

In [None]:
# Display human given tags for each cluster
unique_clusters = df["cluster_label"].unique()
for cluster_label in unique_clusters:
    in_cluster = df["cluster_label"] == cluster_label
    cluster_documents = df.loc[in_cluster, "tags"].tolist()

    # Each entry is parsed with ast.literal_eval and its items are pooled
    # into one flat list for the whole cluster.
    cluster_tags = [
        human_tag
        for document in cluster_documents
        for human_tag in ast.literal_eval(document)
    ]
    print(cluster_tags)

    if cluster_tags:
        # Five most frequent tags in the cluster, as (tag, count) pairs
        top_5 = Counter(cluster_tags).most_common(5)

        print(f"Cluster {cluster_label}:")
        for i, tag in enumerate(top_5):
            print(f"{i + 1}. {tag}")
        print("\n")
        

    

In [None]:
# Display document text for each cluster
unique_clusters = df["cluster_label"].unique()
for cluster_label in unique_clusters:
    in_cluster = df["cluster_label"] == cluster_label
    cluster_documents = df.loc[in_cluster, "content"].tolist()

    # Header line, then the numbered documents of this cluster
    print(f"Cluster {cluster_label}:")
    for i, document in enumerate(cluster_documents):
        print(f"{i + 1}. {document}")
    print("\n")