In [None]:
!pip install sentence_transformers
!pip install gensim
!pip install sklearn

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import Counter

import gensim.downloader as api
from sentence_transformers import SentenceTransformer

import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# English stopwords from NLTK. The corpus data is distributed separately from
# the nltk package, so on a fresh environment the lookup raises LookupError;
# download the corpus once and retry instead of failing.
try:
    stoplist = set(stopwords.words('english'))
except LookupError:
    import nltk

    nltk.download('stopwords')
    stoplist = set(stopwords.words('english'))

# Word-vector alternatives, kept for reference (loaded through gensim's `api`):
# vec_model = api.load("glove-wiki-gigaword-50")
# api.load("glove-twitter-50")
# vec_model['word']

# Sentence-embedding model, loaded by its pretrained model name.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [3]:
# Load the NarrativeQA question/answer table. `df` is kept around because its
# answer columns are read again further down.
df = pd.read_csv("../../data/narrativeqa_qas.csv")

# Only the question text is analysed in the following cells.
questions = df["question"]

# Preview the first few questions.
questions.head()

0                            Who is Mark Hunter?
1      Where does this radio station take place?
2    Why do more students tune into Mark's show?
3                           Who commits suicide?
4        What does Paige jam into her microwave?
Name: question, dtype: object

In [None]:
# Interrogative words that are always kept when questions are filtered down
# to their whitelisted tokens.
q_words = {
    'who', 'what', 'how', 'where', 'when',
    'why', 'which', 'whom', 'whose', "who's",
}
# Hook for extra terms to keep; intentionally empty for now.
other_useful_terms = set()

In [None]:
# Tokens allowed to survive filtering: question words, stopwords and any extra terms.
whitelist = q_words | stoplist | other_useful_terms

# Punctuation that can cling to the edge of a token after whitespace splitting.
# Apostrophes are deliberately NOT stripped so entries such as "who's" still match.
_EDGE_PUNCTUATION = "?!.,;:\"()[]"

# Reduce every question to its whitelisted tokens, one entry per question so
# positions in `contents` stay aligned with rows of `questions`.
contents = []
for row in questions:
    # A non-string cell (e.g. NaN from a blank CSV field) yields an empty
    # entry rather than crashing on .lower().
    raw_tokens = row.lower().split() if isinstance(row, str) else []
    # Strip edge punctuation first; otherwise "he?" or "what," would never
    # match the whitelist and would be dropped.
    cleaned_tokens = (token.strip(_EDGE_PUNCTUATION) for token in raw_tokens)
    contents.append(" ".join(token for token in cleaned_tokens if token in whitelist))
    #contents.append(" ".join([word for word in row.lower().split(" ")]))

In [None]:
# Tally identical filtered questions and print the 30 most frequent ones.
skeleton_counts = Counter(contents)
print(skeleton_counts.most_common(30))

In [None]:
# Convert to sentence embeddings. Output order follows input order, so
# position i here corresponds to position i of `sentence_data`.

#sentence_data = questions
sentence_data = contents
n_samples = 2000

# Encode the whole sample in ONE call so the model can batch the sentences
# internally, instead of running a separate forward pass per sentence.
# - list(...) on the input accepts either a Python list or a pandas Series.
# - list(...) on the output keeps `sentence_embeddings` a list of per-sentence
#   vectors, the same shape of data the per-row loop used to build.
# NOTE(review): batched encoding may differ from per-sentence encoding at
# floating-point-noise level — confirm that is acceptable for the clustering below.
sentence_embeddings = list(sbert_model.encode(list(sentence_data[:n_samples])))

In [None]:
# Sweep the number of clusters for k-means over the sentence embeddings and
# record two quality measures for every k tried.
silhouette_coefficients = []  # silhouette coefficient per k
wcss = []                     # within-cluster sum of squares (inertia) per k

# Settings shared by every KMeans fit in this notebook.
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

# Exclusive upper bound for k.
ceil = 15

# Start at k=2: the silhouette coefficient cannot work with fewer than 2 clusters.
for k in range(2, ceil):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(sentence_embeddings)
    silhouette_coefficients.append(
        silhouette_score(sentence_embeddings, kmeans.labels_)
    )
    wcss.append(kmeans.inertia_)

In [None]:
# Silhouette coefficient against the number of clusters tried in the sweep.
plt.style.use("fivethirtyeight")
k_values = range(2, ceil)
fig, ax = plt.subplots()
ax.plot(k_values, silhouette_coefficients)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# Within-cluster sum of squares against the number of clusters (elbow plot).
k_values = range(2, ceil)
fig, ax = plt.subplots()
ax.plot(k_values, wcss)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("WCSS")
plt.show()

In [None]:
# Fit the final k-means model on the sentence embeddings, reusing the same
# KMeans settings as the sweep.
# NOTE(review): the earlier note on this cell said the second (WCSS) graph
# pointed to 8 clusters, yet the value actually used is 5 — confirm which
# one is intended and update the number or this note accordingly.
n_clusters = 5
final_model = KMeans(n_clusters=n_clusters, **kmeans_kwargs)
final_model.fit(sentence_embeddings)

In [None]:
# Results table: each sampled sentence with its cluster label and the
# answer1/answer2 columns taken from the same leading rows of `df`.
sampled_sentences = sentence_data[:n_samples]
rdf = pd.DataFrame({'sentence': sampled_sentences}).assign(
    label=final_model.labels_,
    answer1=df.answer1[:n_samples],
    answer2=df.answer2[:n_samples],
)
rdf

In [None]:
# Show only the rows assigned to cluster 0.
in_cluster_0 = rdf['label'].eq(0)
rdf[in_cluster_0]

#print(contents[1000])