# LDA on STS Benchmark dataset

## Requirements

In [1]:
import numpy as np
import scipy.sparse as ss
import pandas as pd
import pickle
import gensim

from corextopic import corextopic as ct
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt

from sentence_similarity.data import Pipeline, PipelineConfig, STSBenchmark

# Folder the dataset is read from; it must already exist when this cell runs.
data_dir = Path("data")
assert data_dir.exists(), "data_dir does not exist."
# Folder the trained model is written to (currently the same folder as data_dir);
# created on demand together with any missing parent folders.
output_dir = Path("data")
output_dir.mkdir(exist_ok=True, parents=True)

## Load the dataset

In [2]:
# Preprocessing settings used for every sentence in this notebook.
# NOTE(review): the meaning of each option is defined by PipelineConfig, not here.
# Going by the names, stop words, numbers, symbols and punctuation are removed and
# no part-of-speech tags are filtered — confirm against sentence_similarity.data.
config = PipelineConfig(
    filtered_pos_tags=[],
    remove_stop_words=True,
    remove_numbers=True,
    remove_symbols=True,
    remove_punctuation=True,
)
pipeline = Pipeline(config)
# Write the settings to data_dir.
config.save(data_dir)

In [3]:
# Load the "train" partition of the STS Benchmark from data_dir.
sts_benchmark = STSBenchmark(data_dir, partition="train")

In [4]:
# Run both sentence columns through the same preprocessing pipeline, s1 first.
# split_tokens=True is passed on both calls; the preview in the next cell shows
# each sentence turned into a list of tokens.
s1_preprocessed, s2_preprocessed = (
    pipeline(sentences, split_tokens=True)
    for sentences in (sts_benchmark.s1, sts_benchmark.s2)
)

Preprocessing: 100%|██████████| 5552/5552 [00:03<00:00, 1553.06it/s]
Preprocessing: 100%|██████████| 5552/5552 [00:02<00:00, 1905.70it/s]


In [5]:
# Sanity check: the original s1 sentences next to their preprocessed form.
pd.concat((sts_benchmark.s1, s1_preprocessed), axis="columns")

Unnamed: 0,s1,0
0,A plane is taking off.,"[plane, take]"
1,A man is playing a large flute.,"[man, play, large, flute]"
2,A man is spreading shreded cheese on a pizza.,"[man, spread, shred, cheese, pizza]"
3,Three men are playing chess.,"[man, play, chess]"
4,A man is playing the cello.,"[man, play, cello]"
...,...,...
5547,"Palestinian hunger striker, Israel reach deal","[palestinian, hunger, striker, Israel, reach, ..."
5548,Assad says Syria will comply with UN arms reso...,"[Assad, say, Syria, comply, UN, arm, resolution]"
5549,South Korean President Sorry For Ferry Response,"[south, korean, President, sorry, Ferry, Respo..."
5550,Food price hikes raise concerns in Iran,"[food, price, hike, raise, concern, Iran]"


In [6]:
# Pool the preprocessed sentences of both columns into one collection of documents.
texts = pd.concat((s1_preprocessed, s2_preprocessed))
# Build the gensim dictionary over all pooled documents.
id2word = gensim.corpora.Dictionary(documents=texts)

In [7]:
# Seed for the topic model. Without a seed the model is initialised from a fresh
# random state on every run, so re-running the notebook yields different topics.
LDA_SEED = 42

# Bag-of-words representation of every preprocessed sentence.
bow_corpus = [id2word.doc2bow(text) for text in texts]

# Train a 50-topic LDA model on the pooled sentences.
# minimum_probability=0 is passed so that low-probability topics are not filtered
# out of the model's output.
# NOTE(review): LdaMulticore trains with several worker processes; small
# run-to-run differences may remain even with a fixed seed — confirm if exact
# reproducibility is required.
lda = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    num_topics=50,
    id2word=id2word,
    minimum_probability=0,
    random_state=LDA_SEED,
)

In [8]:
# Save the trained model to output_dir / "lda" (the path is passed as a plain string).
lda.save(str(output_dir / "lda"))

In [9]:
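# Disabled example: apply the trained LDA model to the s1 sentences and collect
# the per-topic probabilities into an array.
# # preprocess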
# s = pipeline(sts_benchmark.s1, split_tokens=True)
# # vectorize
# s = [lda.id2word.doc2bow(tokens) for tokens in s]
# # apply topic model
# output = lda[s]
# # convert to array
# probs = np.array([[topic[1] for topic in out] for out in output])
# probs.shape