## Topic modeling using neural attention for aspect extraction

Plan for the project:
1. Text exploration.
2. Text preprocessing using different tools, including byte pair encoding (BPE).
3. Training model.
4. Evaluation of results.    
    4.1. Evaluation by coherence score.    
    4.2. Evaluation by text classification.
5. Description of future steps for the project.

## 1. Loading data 

In [306]:
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm
from importlib import reload 
import text_preprocess
from text_preprocess import  preprocess , remove_rare_words, process_words_for_LDA, text_to_id

from spacy.lemmatizer import Lemmatizer, Lookups
from spacy.lang.ru import RussianLemmatizer
from pprint import pprint
# Gensim
import gensim, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.models import Word2Vec

import pymorphy2


import matplotlib.pyplot as plt    
    
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('russian')
stop.remove('не')

import model
from model import get_aspect_matrix

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Graphs
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# Pytorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from model import Net, MaxMarginLoss


# папка текущего файла

PATH = os.path.abspath(os.getcwd())

# Папка исходных файлов
DATA_PATH = os.path.join(PATH,'data')

DATA_PATH


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'C:\\Users\\ivan\\YandexDisk\\DS\\NLP course Huawei\\Project NLP\\data'

In [307]:
reload(text_preprocess)
reload(model)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Extending stop words: 458it [00:00, 458142.44it/s]

Number of stop words:  609





<module 'model' from 'C:\\Users\\ivan\\YandexDisk\\DS\\NLP course Huawei\\Project NLP\\model.py'>

The dataset consists of two separate dataframes with positive and negative tweets. Each dataframe contains more than 100,000 rows.


Loading **positive** tweets:

In [None]:
# Read the positive tweets; the file has no header row and uses ';' as the separator.
pos_df = pd.read_csv(os.path.join(DATA_PATH,'positive.csv'), header = None, sep=';')

# Column titles assigned to the header-less CSV (the same list is reused for
# the negative file).
# NOTE(review): 'tmane' looks like a typo of 'tname' — confirm before renaming,
# because the label is part of the dataframe schema.
cols = ['id',
       'tdate',
       'tmane',
       'ttext',
       'ttype',
       'trep',
       'trtw',
       'tfav',
       'tstcount',
       'tfol',
       'tfrien',
       'listcount'
       ]

pos_df.columns = cols
pos_df.head(5)

Loading **negative** tweets:    

In [None]:
neg_df = pd.read_csv(os.path.join(DATA_PATH,'negative.csv'), header = None, sep=';')

neg_df.columns = cols
neg_df.head(5)

For further work we will concatenate dataframes with only text and its sentiment, and mix rows:

In [None]:
pos_df['sentiment'] = 1
neg_df['sentiment'] = 0
train_df = pd.concat([pos_df, neg_df]).sample(frac=1, random_state=12).reset_index(drop=True)
train_df = train_df[['ttext', 'sentiment']]
train_df

As we can see, the corpus is very raw and needs to be preprocessed. The preprocessing steps are described below.


# 2. Data exploration


Let's have a look at the length of the tweets and the number of words per tweet.

In [None]:
# Character length of every raw tweet.
train_df['twit_len'] = train_df['ttext'].apply(len)

# Word count per tweet: split on single spaces and ignore the empty strings
# produced by consecutive spaces.
train_df['words_num'] = [
    sum(1 for token in text.split(' ') if token != '')
    for text in train_df['ttext']
]

In [None]:
train_df['twit_len'].describe()

In [None]:
train_df['twit_len'].plot(kind='hist', bins=50, figsize=(10,5));

In [None]:
train_df['words_num'].describe()

In [None]:
train_df['words_num'].plot(kind='hist', bins=50, figsize=(10,5));

## 3. Text preprocess

In this step the preprocessing operations are:
1. Convert characters to lower case.
2. Remove words with characters length less than 3.
3. Remove non-alphabetic symbols.
4. Byte pair encoding of unknown words.
5. Convert different emoticons to several types.

In [None]:
#reload(text_preprocess)

# Raw tweet texts to be cleaned.
corpus = train_df['ttext']

# Cleaned text, followed by the same length statistics as for the raw tweets.
train_df['clean_text'] = preprocess(corpus)
train_df['clean_twit_len'] = train_df['clean_text'].apply(len)

# Word count per cleaned tweet; empty strings from repeated spaces are ignored.
train_df['clean_words_num'] = [
    sum(1 for token in text.split(' ') if token != '')
    for text in train_df['clean_text']
]

In [None]:
train_df['clean_twit_len'].plot(kind='hist', bins=50, figsize=(10,5));

As we can see, the distribution became closer to normal than it was before preprocessing.

In [None]:
train_df['clean_words_num'].plot(kind='hist', bins=50, figsize=(10,5));

Keeping only rows with fewer than 20 words (rows with 20 or more words are removed):

In [None]:
train_df= train_df[train_df['clean_words_num']<20]

In [None]:
train_df['clean_words_num'].plot(kind='hist', bins=50, figsize=(10,5));

In [None]:
train_df.head(15)

We will remove words that appear very rarely in the corpus, since they are nearly unique and cannot give us enough information about topics.

In [None]:
corpus = train_df['clean_text'].values

# number is min count values to cut words
corpus_clean = remove_rare_words(corpus, 25, 30)# num of appearence of rare words and n most frequent tokens

#### Checking results

Creating a dataframe from the dictionary to check how many words it contains:

In [None]:
dic = corpora.Dictionary(corpus_clean)

#dictionary word:word id
dict_words = dic.token2id

#dictionary word id: number of word
dic_id_nums = dic.cfs

In [None]:
# Pair every token with its own collection frequency by looking the count up
# through the token id. Zipping token2id.keys() with cfs.values() relies on both
# dicts iterating in the same order, which gensim does not guarantee, so counts
# could end up attached to the wrong tokens.
dict_df = pd.DataFrame.from_dict({
    'token': list(dict_words),
    'count_word': [dic_id_nums[token_id] for token_id in dict_words.values()],
})
len(dict_df)

In [None]:
dict_df.sort_values(by='count_word', ascending=False)

In [None]:
dict_df[dict_df['token']=='михаэль']

## 3.2 Text preprocess for LDA model

Creating bigrams:

In [None]:
corpus_preprocessed = process_words_for_LDA(corpus_clean)

In [None]:
len(corpus_preprocessed)

Clean dataset if there are empty docs

In [None]:
corpus_preprocessed = [doc for doc in corpus_preprocessed if len(doc)!=0]

Check length of corpus after cleaning:


In [None]:
len(corpus_preprocessed)

In [None]:
# Rebuild the dictionary on the corpus prepared for LDA.
dic = corpora.Dictionary(corpus_preprocessed)

# dictionary word: word id
dict_words = dic.token2id

# dictionary word id: number of occurrences of the word in the corpus
dic_id_nums = dic.cfs

# Look each count up by token id instead of zipping keys()/values() of two
# separate dicts: their iteration orders are not guaranteed to match, which
# could attach counts to the wrong tokens.
dict_df = pd.DataFrame.from_dict({
    'token': list(dict_words),
    'count_word': [dic_id_nums[token_id] for token_id in dict_words.values()],
})
len(dict_df)

In [None]:
dict_df.sort_values(by='count_word', ascending=False)

Text with word indices from the vocabulary instead of the words themselves:

In [None]:
# `word2id` is otherwise assigned only further down (in the LDA section), so a
# fresh top-to-bottom run would raise NameError here. Build the mapping from the
# dictionary of the preprocessed corpus; it is the same corpus the LDA
# dictionary is built from.
word2id = corpora.Dictionary(corpus_preprocessed).token2id
indexed_text = text_to_id(corpus_preprocessed, word2id)
indexed_text[:10]

## 4. Model implementation

### 4.1. LDA model

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(corpus_preprocessed)

# Create Corpus: Term Document Frequency
corpus_freq = [id2word.doc2bow(text) for text in corpus_preprocessed]

# Create word-id dictionary
word2id = id2word.token2id

Let's find the optimal number of topics by calculating the coherence score:

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one LDA model per candidate topic count and score it with c_v coherence

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound of the topic-count range (exclusive)
    start : First number of topics to try
    step : Increment between consecutive topic counts

    Returns:
    -------
    model_list : List of LDA topic models, one per tried number of topics
    coherence_values : Coherence values, in the same order as model_list
    """
    # Training settings shared by every candidate model.
    lda_settings = dict(
        random_state=12,
        update_every=1,
        chunksize=10,
        passes=3,
        alpha='symmetric',
        iterations=50,
        per_word_topics=True,
    )

    model_list, coherence_values = [], []
    topic_counts = range(start, limit, step)
    for n_topics in tqdm(topic_counts, desc = 'Model working'):
        lda = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                              id2word=dictionary,
                                              num_topics=n_topics,
                                              **lda_settings)
        model_list.append(lda)

        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [None]:
%%time
# Range of topic counts to evaluate: start, start + step, ... (limit excluded)
start = 2
limit = 20
step = 2

# `texts` is the tokenized corpus that `id2word` and `corpus_freq` were built
# from; the previously used name `corpus_for_LDA` is not defined anywhere.
model_list, coherence_values = compute_coherence_values(dictionary=id2word,
                                                        corpus=corpus_freq,
                                                        texts=corpus_preprocessed,
                                                        start=start,
                                                        limit=limit,
                                                        step=step)
# Show graph
x = range(start, limit, step)
# Label the line itself: plt.legend(("coherence_values")) receives a plain
# string (the parentheses do not make a tuple) and labels the line "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Pick the model with the highest coherence score (first one on ties) and
# save it under the name 'best_lda_model'.
max_cv = max(coherence_values)
best_idx = coherence_values.index(max_cv)
best_lda_model = model_list[best_idx]
best_lda_model.save('best_lda_model')

In [None]:
%%time 
# Build LDA model on the bag-of-words corpus using the multicore trainer.
# NOTE(review): num_topics=12 here, while the results text below and N_ASPECTS
# both use 14 — confirm which value is intended.
lda_model = gensim.models.LdaMulticore(corpus=corpus_freq,
                                           id2word=id2word,
                                           num_topics=12, 
                                           random_state=14,
                                           #update_every=1,
                                           chunksize=10,
                                           passes=3,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True,
                                           dtype=np.float64,
                                           workers = 3)

# Show the topics found by the model.
pprint(lda_model.print_topics())

In [None]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus_freq, dictionary=id2word)
vis

Result of topic modeling with LDA:
1. Choosing the number of topics by the highest coherence score gave an optimum of 14 topics; the maximum score is 0.44.
2. For some groups of words we can identify a topic, but for most of them the topic is not clear.


### 4.2. Attention based model

In [272]:
VOCAB_SIZE = len(id2word)
EMB_SIZE = 200
N_ASPECTS = 14
MAX_LEN = 20
NEG_SAMPLES = 20
BATCH_SIZE = 50
seed = 4

In [273]:
VOCAB_SIZE

5822

In [274]:
# w2v_model = Word2Vec(sentences=[one_txt], size=200, window=10, min_count=10, workers=3, negative=5)
# Train word2vec on the preprocessed corpus. The vector size is tied to
# EMB_SIZE so the vectors always fit the (VOCAB_SIZE, EMB_SIZE) embedding matrix
# built from them. `size` is the gensim 3.x keyword for the vector dimension.
w2v_model = Word2Vec(sentences=corpus_preprocessed, size=EMB_SIZE, window=5, min_count=1, workers=3, batch_words=10)

# exist_ok=True replaces the check-then-create pair (os.path.exists followed by
# os.makedirs), which can fail if the directory appears in between.
w2v_dir = os.path.join(PATH, 'pre_trained_model')
os.makedirs(w2v_dir, exist_ok=True)

w2v_model.save(os.path.join(w2v_dir, 'model_param_my'))

Let's check how good the word2vec model is:

In [275]:
# Query through the keyed vectors (`.wv`), as the rest of the notebook does;
# calling most_similar on the model object itself is deprecated in gensim.
w2v_model.wv.most_similar('кушать', topn = 20)

  w2v_model.most_similar('кушать', topn = 20)


[('поесть', 0.9762868285179138),
 ('жрать', 0.9742609262466431),
 ('выпить', 0.9601010084152222),
 ('кроватка', 0.946843147277832),
 ('тренировка', 0.9462007284164429),
 ('насморк', 0.942147970199585),
 ('постель', 0.9413187503814697),
 ('вечером', 0.9405970573425293),
 ('завтрак', 0.9370702505111694),
 ('устать', 0.933862566947937),
 ('вода', 0.9317432045936584),
 ('пошлый', 0.9311273097991943),
 ('пипец', 0.9307984113693237),
 ('сесть', 0.9287680983543396),
 ('ранний', 0.928617000579834),
 ('живот', 0.9283583164215088),
 ('целый', 0.9282732009887695),
 ('посидеть', 0.9269917011260986),
 ('одеяло', 0.9269794225692749),
 ('доехать', 0.9268539547920227)]

In [276]:
# Embedding matrix indexed by dictionary id: row i receives the word2vec vector
# of the token whose id in `word2id` is i; rows that get no vector stay zero.
np_embeddings = np.zeros(shape=[VOCAB_SIZE, EMB_SIZE], dtype=np.float64)

for word in w2v_model.wv.vocab:
    idx = word2id.get(word)
    if idx is None:
        # A word missing from the dictionary must be skipped: indexing with
        # None adds a new axis, so `np_embeddings[None] = vector` would
        # broadcast this one vector over every row of the matrix.
        continue
    vector = w2v_model.wv.get_vector(word)
    np_embeddings[idx] = vector

In [277]:
len(w2v_model.wv.vocab)

5822

In [278]:
# L2-normalize every row. All-zero rows are divided by 1 instead of 0, so they
# stay zero rather than turning into NaN.
row_norms = np.linalg.norm(np_embeddings, axis=-1, keepdims=True)
row_norms[row_norms == 0] = 1.0
np_embeddings_norm = np_embeddings / row_norms

In [279]:
np_embeddings_norm.shape

(5822, 200)

Plot word embeddings projected on 3D space

In [None]:
def tsne_plot(model):
    """
    Project the word vectors of a word2vec model to 3D with t-SNE and plot them

    Parameters:
    ----------
    model : Trained word2vec model; vectors are read through model.wv and the
            vocabulary through model.wv.vocab (gensim 3.x API)

    Returns:
    -------
    None, the figure is displayed with plt.show()
    """
    vectors = np.asarray([model.wv[word] for word in model.wv.vocab])

    # Scale every vector to unit length before the projection.
    vectors_norm = vectors / np.linalg.norm(vectors, axis=-1, keepdims=True)

    tsne_model = TSNE(perplexity=40, n_components=3, init='pca', n_iter=2500, random_state=23)
    coords = tsne_model.fit_transform(vectors_norm)

    # Create the 3D axes explicitly; Figure.gca(projection='3d') is deprecated.
    fig = plt.figure(figsize=(16, 16))
    ax = fig.add_subplot(111, projection='3d')

    # Call scatter on the 3D axes with all points at once. plt.scatter(x, y, z)
    # treats its third positional argument as the marker size, so the third
    # t-SNE component was never used as a coordinate. The former cmap argument
    # had no effect without `c` and is dropped; all points share one colour.
    ax.scatter(coords[:, 0], coords[:, 1], coords[:, 2])
    plt.show()

In [None]:
tsne_plot(w2v_model)

### 4.3 Aspect matrix

In [280]:
aspect_matrix = get_aspect_matrix(np_embeddings, N_ASPECTS)

### 5. Model


### 5.1 Pretrained embeddings

In [310]:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
#torch.cuda.manual_seed(seed)
#torch.backends.cudnn.deterministic = True

<torch._C.Generator at 0x22fd4d2c430>

In [311]:
TORCH_DEVICE ='cpu'# 'cuda' # 'cpu'

In [312]:
torch.device(TORCH_DEVICE)
#torch.set_default_tensor_type(torch.cuda.FloatTensor) 

device(type='cpu')

In [313]:
pretrained_embeddings = torch.FloatTensor(np_embeddings_norm)
pretrained_embeddings.shape

torch.Size([5822, 200])

### 5.2 Text to torch

In [314]:
def texts_to_seq(texts_to_ids, maxlen=MAX_LEN):
    """
    Turn lists of word ids into a fixed-width LongTensor

    Each non-empty text is left-padded with 0 up to `maxlen`; longer texts keep
    only their last `maxlen` ids. Empty texts are dropped, so the result can
    have fewer rows than `texts_to_ids` has items.

    Parameters:
    ----------
    texts_to_ids : Iterable of lists of word ids
    maxlen : Width of every output row

    Returns:
    -------
    torch.long tensor created on TORCH_DEVICE, one row per non-empty text
    """
    pad_token = 0
    rows = []
    for ids in texts_to_ids:
        if len(ids) == 0:
            continue
        # A negative pad count (text longer than maxlen) yields an empty prefix.
        padding = [pad_token] * (maxlen - len(ids))
        rows.append(padding + ids[-maxlen:])
    return torch.tensor(rows, dtype=torch.long, device=TORCH_DEVICE)

In [315]:
padded_texts = texts_to_seq(indexed_text, maxlen=MAX_LEN)
padded_texts.shape

torch.Size([216133, 20])

In [316]:
# DataLoader and PosNegDataset are otherwise imported only in a later cell of
# the notebook, so import them here to keep this cell runnable on a fresh
# top-to-bottom execution.
from torch.utils.data import DataLoader
from model import PosNegDataset

# Shuffled batches over the padded texts; NEG_SAMPLES is handed to the dataset
# as `neg_size`.
pos_neg_loader = DataLoader(
    dataset=PosNegDataset(padded_texts, neg_size=NEG_SAMPLES),
    batch_size=BATCH_SIZE,
    shuffle=True,
#     num_workers=4,
)

In [317]:
pos_neg_loader

<torch.utils.data.dataloader.DataLoader at 0x22f98f22910>

### 5.3 Training model

In [319]:
# Instantiate the network from the vocabulary/embedding sizes, the pretrained
# embeddings and the aspect matrix.
# NOTE(review): binding the network to the name `model` shadows the `model`
# module imported at the top of the notebook, so `reload(model)` will no longer
# reload the module after this cell has run — consider another variable name.
model = Net(vocab_size=VOCAB_SIZE, emb_dim=EMB_SIZE, maxlen=MAX_LEN, n_aspects=N_ASPECTS, 
            pretrained_embeddings=pretrained_embeddings, aspect_matrix=aspect_matrix)

print(model)
# Total number of elements over all parameter tensors.
print("Parameters:", sum([param.nelement() for param in model.parameters()]))

# presumably Net is a torch nn.Module, so this switches it to training mode — confirm in model.py
model.train()


AssertionError: 
The NVIDIA driver on your system is too old (found version 8000).
Please update your GPU driver by downloading and installing a new
version from the URL: http://www.nvidia.com/Download/index.aspx
Alternatively, go to: https://pytorch.org to install
a PyTorch version that has been compiled with your version
of the CUDA driver.

### 6. Testing model

In [None]:
from torch.utils.data import DataLoader
from model import PosNegDataset

### 7. Results evaluation