In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split


In [3]:
def load_dataset(path):
    df = pd.read_csv(path, sep='\t', header=None, usecols=[1, 2], names=["label", "text"])
    df = df[df['label'].isin(['true', 'false'])]  # Only keep true and false labels
    df['label'] = df['label'].map({'true': 0, 'false': 1})  # Convert labels to numeric values
    return df

In [4]:
train_df = load_dataset('train.tsv')
val_df = load_dataset('valid.tsv')
test_df = load_dataset('test.tsv')


In [5]:
combined_df = pd.concat([train_df, val_df])
train_df, val_df = train_test_split(combined_df, test_size=0.2, random_state=42)


In [6]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train = vectorizer.fit_transform(train_df['text'])
X_val = vectorizer.transform(val_df['text'])
X_test = vectorizer.transform(test_df['text'])


In [7]:
n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train)




In [8]:
val_clusters = kmeans.predict(X_val)
val_silhouette_score = silhouette_score(X_val, val_clusters)
print("Validation silhouette score:", val_silhouette_score)


Validation silhouette score: 0.0025846213554643407


In [9]:
val_df['cluster'] = val_clusters
cluster_sizes = val_df.groupby('cluster')['label'].count()
cluster_true_ratios = val_df.groupby('cluster')['label'].mean()

print("Cluster sizes:", cluster_sizes)
print("Cluster true ratios:", cluster_true_ratios)


Cluster sizes: cluster
0    776
1     45
Name: label, dtype: int64
Cluster true ratios: cluster
0    0.564433
1    0.644444
Name: label, dtype: float64
