In [3]:
!pip install scikit-learn-extra

Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn-extra
Successfully installed scikit-learn-extra-0.3.0


In [18]:
import os
import re
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import pairwise_distances

# Mount Google Drive
drive.mount('/content/drive')

# Function to clean text data
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra white spaces
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert text to lowercase
    return text

# Function to read and clean text files
def read_clean_text_files(folder_path):
    texts = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path) and file_name.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                cleaned_text = clean_text(text)
                texts.append(cleaned_text)
    return texts

# Path to the folder containing text files
folder_path = '/content/drive/My Drive/BDA 798 LLMs/TestCase 3 Papers text/Input_Raw_3'

# Read and clean text files
texts = read_clean_text_files(folder_path)

print(f'texts count={len(texts)}')
print(f'texts={texts}')

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(texts)

# Clustering
n_clusters = 5  # Number of clusters
kmedoids = KMedoids(n_clusters=n_clusters, random_state=0).fit(tfidf_matrix)
cluster_labels = kmedoids.labels_

print(f'cluster_labels={cluster_labels}')

# Evaluation
silhouette_score_val = silhouette_score(tfidf_matrix, cluster_labels)
davies_bouldin_score_val = davies_bouldin_score(tfidf_matrix.toarray(), cluster_labels)
calinski_harabasz_score_val = calinski_harabasz_score(tfidf_matrix.toarray(), cluster_labels)
pairwise_distance_matrix = pairwise_distances(tfidf_matrix, metric='euclidean')
dunn_index = np.min(pairwise_distance_matrix[np.nonzero(pairwise_distance_matrix)]) / \
             np.max([np.max(pairwise_distance_matrix[i, np.nonzero(pairwise_distance_matrix[i, :])]) for i in range(len(pairwise_distance_matrix))])

# Print evaluation scores
print("Silhouette Score:", silhouette_score_val)
print("Davies Bouldin Index:", davies_bouldin_score_val)
print("Calinski-Harabasz Index:", calinski_harabasz_score_val)
print("Dunn Index:", dunn_index)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
texts count=10
texts=['a b s t r a c t feelings of belonging are integral in peoples choice of what career to pursue women and men are dispropor tionately represented across careers starting with academic training the present research focuses on two fields that are similar in their history and subject matter but feature inverse gender gapspsychology more women than men and philosophy more men than womento investigate how theorized explanations for academic gender gaps contribute to feelings of belonging specifically we simultaneously model the relative contribution of theoretically relevant individual differences empathizing systematizing and intellectual combativeness as well as life goals prioritization of family money and status to feelings of belonging and majoring in psychology or philosophy we find that men report higher intellectual combativeness than 