In [1]:
import pickle
import re
import sys
from pathlib import Path

import gensim
import nltk
import numpy as np
import pandas as pd
import scipy.sparse as ss
from corextopic import corextopic as ct
from gensim.utils import simple_preprocess
from tqdm.notebook import tqdm
from wordcloud import WordCloud

nltk.download('stopwords')
import sys
from pprint import pprint
from typing import List

import gensim.corpora as corpora
from nltk.corpus import stopwords
from tqdm.notebook import tqdm

src_dir = Path("data")
assert src_dir.exists()
output_dir = Path("data")
output_dir.mkdir(exist_ok=True, parents=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/iai/user/iser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_feather(src_dir / "subset.feather")
df

Unnamed: 0,s1,s2,score
0,The man is playing the wooden flute.,A group of girls stands wearing jean skirts an...,0.000000
1,The man is playing the wooden flute.,A group of girls stands wearing jean skirts an...,0.000000
2,The man is playing the wooden flute.,A group of girls stands wearing jean skirts an...,0.000000
3,The man is playing the wooden flute.,A group of girls stands wearing jean skirts an...,0.000000
4,A group of girls stands wearing jean skirts an...,The man is playing the wooden flute.,0.000000
...,...,...,...
965995,A woman is slicing a big pepper.,A woman is slicing big pepper.,0.998008
965996,A woman is slicing big pepper.,A woman is slicing a big pepper.,0.998008
965997,A woman is slicing a big pepper.,A woman is slicing big pepper.,0.998008
965998,A woman is slicing a big pepper.,A woman is slicing big pepper.,0.998008


In [3]:
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

def preprocess(sentences: List[str]):
    sentences = map(lambda x: re.sub('[,\.!?]', '', x), sentences)
    sentences = map(lambda x: x.lower(), sentences)
    sentences = list(sent_to_words(sentences))
    sentences = remove_stopwords(sentences)
    return sentences

In [4]:
sent_1 = preprocess(df.s1.tolist())
sent_2 = preprocess(df.s2.tolist())

In [5]:
texts = sent_1 + sent_2
# Create Dictionary
id2word = corpora.Dictionary(texts)
# Create Corpus
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
sent_1_corpus = [id2word.doc2bow(text) for text in sent_1]
sent_2_corpus = [id2word.doc2bow(text) for text in sent_2]
with open(output_dir / "id2word.pkl", "wb") as f:
    id2word.save(f)
# View
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1)]


In [6]:
# number of topics
num_topics = 16
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       minimum_probability=0)

In [7]:
lda_model.save(output_dir / "lda_model.bin")

AttributeError: 'PosixPath' object has no attribute 'endswith'

In [None]:
sent_1_lda = lda_model[sent_1_corpus]
sent_2_lda = lda_model[sent_2_corpus]

In [None]:
df['s1_lda'] = pd.Series([[prob[1] for prob in probs] for probs in tqdm(sent_1_lda)])
df['s2_lda'] = pd.Series([[prob[1] for prob in probs] for probs in tqdm(sent_2_lda)])
df.head()

In [None]:
df.to_feather(output_dir / 'df_lda.feather')