In [1]:
import pandas as pd
import gensim
import csv

from pathlib import Path
from utils import preprocess

# Input location: the notebook reads its datasets from here, so fail fast
# when the directory is missing.
data_dir = Path("data")
assert data_dir.exists(), "data_dir does not exist."

# Output location for generated artifacts; created on demand
# (including any missing parent directories).
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

In [2]:
%%capture
df_sts = pd.read_csv(data_dir / "stsbenchmark" / 'sts-train.csv', error_bad_lines=False, header = None, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
df_sts = df_sts.rename(columns={0: "genre", 1: "filename", 2: "year", 3: "trash", 4: "score", 5: "s1", 6: "s2"})

In [3]:
# Store the metadata columns as pandas categoricals.
# A single `astype` call with a column -> dtype mapping converts each column
# exactly once and returns a new frame, so re-running this cell is idempotent.
df_sts = df_sts.astype(
    {
        "genre": "category",
        "filename": "category",
        "year": "category",
    }
)

In [4]:
# Show the loaded frame as the cell's output to sanity-check the column
# names and a preview of the first and last rows.
df_sts

Unnamed: 0,genre,filename,year,trash,score,s1,s2
0,main-captions,MSRvid,2012test,1,5.00,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.80,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.80,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.60,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.
...,...,...,...,...,...,...,...
5547,main-news,headlines,2015,1489,1.20,"Palestinian hunger striker, Israel reach deal",Palestinian activist detained in Israeli raid
5548,main-news,headlines,2015,1493,4.80,Assad says Syria will comply with UN arms reso...,Syria's Assad vows to comply with U.N. resolution
5549,main-news,headlines,2015,1496,4.60,South Korean President Sorry For Ferry Response,S. Korean president 'sorry' for ferry disaster
5550,main-news,headlines,2015,1498,0.00,Food price hikes raise concerns in Iran,American Chris Horner wins Tour of Spain


In [6]:
# Pool both sentence columns into one Series, then keep each distinct
# sentence once by taking the index of its value counts.
all_sentences = pd.concat([df_sts.s1, df_sts.s2])
unique_sentences = all_sentences.value_counts().index

# Run the project's preprocessing helper over the distinct sentences.
# NOTE(review): `texts` is iterated again when the corpus is built later, so
# this assumes `preprocess` returns a re-iterable collection of token lists
# (not a one-shot generator) — confirm against utils.preprocess.
texts = preprocess(unique_sentences)

# Build the gensim id <-> token dictionary from the preprocessed texts.
id2word = gensim.corpora.Dictionary(texts)

In [7]:
# Convert every preprocessed text into its bag-of-words representation
# (list of (token_id, count) pairs) using the dictionary built above.
corpus = [id2word.doc2bow(text) for text in texts]

# Train a 10-topic LDA model.
# `random_state` pins the model's random initialisation so that a
# Restart & Run All does not produce a completely different model each time.
# NOTE(review): with the multicore trainer, worker scheduling may still cause
# small run-to-run differences — confirm if exact reproducibility is required.
# `minimum_probability=0` is passed so low-probability topics are not
# filtered out of the topic distributions the model reports.
lda = gensim.models.LdaMulticore(
    corpus=corpus,
    num_topics=10,
    id2word=id2word,
    random_state=42,
    minimum_probability=0,
)

In [9]:
# Persist the trained LDA model under the output directory.
# The path is converted to a plain string before being handed to `save`.
lda_model_path = output_dir / "lda"
lda.save(str(lda_model_path))