In [1]:
# Mount Google Drive so the input files under '/content/gdrive/My Drive' are readable.
# This step is interactive: it asks for an authorization code (see the output below).
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
! pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 18.3MB/s 
Collecting funcy (from pyLDAvis)
  Downloading https://files.pythonhosted.org/packages/b3/23/d1f90f4e2af5f9d4921ab3797e33cf0503e3f130dd390a812f3bf59ce9ea/funcy-1.12-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.12 pyLDAvis-2.1.2


In [0]:
# Silence every warning category for the rest of the session.
from warnings import filterwarnings
filterwarnings(action='ignore')

In [4]:
import os
import pandas as pd
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pyLDAvis
import pyLDAvis.gensim as gensimvis
import plotly.plotly as py
import plotly.graph_objs as go 
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
from plotly.offline import plot

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Russian stop-word set used by `lda.__init__` to filter tokens.
# The corpus loader is read through `nltk.corpus` instead of through the name `stopwords`,
# because this line rebinds `stopwords` to a set: re-running the cell would otherwise fail
# with "'set' object has no attribute 'words'".
stopwords = set(nltk.corpus.stopwords.words('russian'))

In [0]:
class lda(object):
  """LDA topic model over lemmatized texts, plus per-topic sentiment counts.

  Intended call order: ``create_vocab`` -> ``vectorize_texts`` -> ``train_ldamodel`` ->
  ``get_cluster_distribution_and_sentiment`` -> ``grab_general_statistic`` ->
  ``rename_clusters`` -> ``return_statistics_for_ploting``.

  Attributes:
    texts: list of token lists; replaced by bag-of-words vectors after ``vectorize_texts``.
    vocab: gensim ``Dictionary`` built by ``create_vocab`` (``None`` before that).
    model: trained ``LdaModel`` (``None`` before ``train_ldamodel``).
    clusters_distribution: ``{topic_id: [[document indices], [sentiment labels]]}``.
    general_statistic: ``{topic_id: {'positives': int, 'negatives': int}}``.
    path_to_sentiment: CSV file with a ``label`` column.
    clusters_titles: human-readable topic names set by ``rename_clusters``.
  """

  def __init__(self, path_to_texts, path_to_sentiment, add_bigramms=False):
    """Load and tokenize the texts.

    Args:
      path_to_texts: pickle readable by ``pd.read_pickle`` with a ``lemmatized`` column
        of whitespace-separated tokens.
      path_to_sentiment: CSV file with a ``label`` column (read later, not here).
      add_bigramms: when true, append the ``_``-joined phrases detected by gensim
        ``Phrases`` to each document.

    Raises:
      AssertionError: if either path does not exist.
    """
    assert os.path.exists(path_to_texts) and os.path.exists(path_to_sentiment), \
        'path_to_texts and path_to_sentiment must both exist'

    # NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
    frame = pd.read_pickle(path_to_texts)
    # `stopwords` is the module-level stop-word set defined in an earlier cell.
    self.texts = [[word for word in line.split() if word not in stopwords]
                  for line in frame['lemmatized']]
    self.vocab = None
    self.model = None
    self.clusters_distribution = {}
    self.general_statistic = {}
    self.path_to_sentiment = path_to_sentiment
    self.clusters_titles = None

    if add_bigramms:
      finder = Phrases(self.texts)
      for tokens in self.texts:
        # The phrase list is fully built before the document is extended, so the
        # document is never modified while it is being read.
        tokens.extend([token for token in finder[tokens] if '_' in token])

  def create_vocab(self, no_below=5, no_above=0.5):
    """Build the dictionary from ``self.texts`` and prune it with ``filter_extremes``."""
    self.vocab = Dictionary(self.texts)
    self.vocab.filter_extremes(no_below=no_below, no_above=no_above)

  def vectorize_texts(self):
    """Replace ``self.texts`` with bag-of-words vectors built from ``self.vocab``.

    The token lists are overwritten, so this is meant to be called once, after
    ``create_vocab``.
    """
    self.texts = [self.vocab.doc2bow(sent) for sent in self.texts]

  def train_ldamodel(self, num_topics, chunksize, epochs, steps, eval_every=None, id2word=None):
    """Train the LDA model on the vectorized texts and store it in ``self.model``.

    Args:
      num_topics: number of topics.
      chunksize: documents per training chunk.
      epochs: passed to ``LdaModel`` as ``passes``.
      steps: passed to ``LdaModel`` as ``iterations``.
      eval_every: passed to ``LdaModel`` as ``eval_every``.
      id2word: id -> token mapping; defaults to the one from ``self.vocab``.
    """
    if id2word is None:
      # Indexing the dictionary once makes it build `id2token` before it is read.
      _ = self.vocab[0]
      id2word = self.vocab.id2token

    self.model = LdaModel(self.texts, num_topics=num_topics, id2word=id2word,
                          chunksize=chunksize, passes=epochs, alpha='auto', eta='auto',
                          iterations=steps, eval_every=eval_every)

  def write_clusterring_t0_html_file(self, title):
    """Save the pyLDAvis view of the model to ``/content/<title>.html``."""
    # NOTE(review): pyLDAvis may renumber/reorder topics in the rendered page — confirm
    # the numbering against the gensim topic ids before naming clusters from this file.
    vis = gensimvis.prepare(self.model, self.texts, self.vocab)
    with open(os.path.join('/content', title + '.html'), 'w') as file:
      pyLDAvis.save_html(vis, file)

  def get_cluster_distribution_and_sentiment(self):
    """Assign every positive/negative document to its most probable topic.

    Rebuilds ``self.clusters_distribution`` from scratch, so calling it again does not
    double-count. Row ``i`` of the sentiment CSV is matched with ``self.texts[i]``
    (assumes both files list the same documents in the same order — TODO confirm).
    Rows whose label is neither ``'positive'`` nor ``'negative'`` are skipped.
    """
    self.clusters_distribution = {}
    labels = pd.read_csv(self.path_to_sentiment)['label']
    for index, label in enumerate(labels):
      if label not in ('negative', 'positive'):
        continue
      topics = self.model.get_document_topics(self.texts[index])
      if not topics:
        # No topic was reported for this document; nothing to assign it to.
        continue
      # `max` returns the first of equally probable topics, like a stable descending sort.
      cluster = max(topics, key=lambda pair: pair[1])[0]
      indices, sentiments = self.clusters_distribution.setdefault(cluster, [[], []])
      indices.append(index)
      sentiments.append(label)

  def grab_general_statistic(self):
    """Count positive and negative labels per topic into ``self.general_statistic``."""
    self.general_statistic = {
        cluster: {'positives': found[1].count('positive'),
                  'negatives': found[1].count('negative')}
        for cluster, found in self.clusters_distribution.items()
    }

  def return_statistics_for_ploting(self):
    """Return ``(positives, negatives, clusters_titles)`` for plotting.

    Counts are ordered by ascending topic id so their order is deterministic and can be
    matched with titles supplied in topic-id order. Topics that received no labelled
    document have no entry, so the count lists may be shorter than the title list.
    """
    ordered = [self.general_statistic[cluster] for cluster in sorted(self.general_statistic)]
    positives = [elem['positives'] for elem in ordered]
    negatives = [elem['negatives'] for elem in ordered]

    return positives, negatives, self.clusters_titles

  def rename_clusters(self, names):
    """Store a copy of ``names`` as the topic titles."""
    self.clusters_titles = list(names)

In [0]:
class plot_statistics(object):
  """Accumulates a positives/negatives bar pair per bank and one toggle button for each.

  Every button shows only the two traces of its own bank; the visibility mask is sized
  from ``max_figures`` given at construction time.
  """

  def __init__(self, max_figures):
    """Start with no traces and no buttons; ``max_figures`` is the expected bank count."""
    self.max_figures_available = max_figures
    self.current_number_of_figures = 0
    self.buttons = []
    self.data = []

  def add_one_more_bank(self, x, y, title, clusters_ids):
    """Add the two bar traces and the toggle button for one bank.

    Args:
      x: positive counts (bar heights of the 'positives' trace).
      y: negative counts (bar heights of the 'negatives' trace).
      title: button label and figure title for this bank.
      clusters_ids: category labels placed on the x axis of both traces.
    """
    positive_bar = go.Bar(x=list(clusters_ids), y=x, name='positives')
    negative_bar = go.Bar(x=list(clusters_ids), y=y, name='negatives')
    self.data.extend([positive_bar, negative_bar])

    # Two traces per bank: hide the pairs before and after this bank's own pair.
    already_added = self.current_number_of_figures
    still_expected = self.max_figures_available - already_added - 1
    visibility = [False]*2*already_added + [True, True] + [False]*2*still_expected

    self.buttons.append({
        'label': title,
        'method': 'update',
        'args': [{'visible': visibility},
                 {'title': title, 'annotations': []}],
    })
    self.current_number_of_figures = already_added + 1

  def create_visualization(self):
    """Render all collected traces as a grouped bar chart with the toggle buttons."""
    toggle_menu = dict(type='buttons', active=-1, buttons=self.buttons)
    layout = go.Layout(barmode='group', updatemenus=[toggle_menu])
    plot(dict(data=self.data, layout=layout))

In [0]:
# Number of banks the shared plotter expects; it sizes every button's visibility mask.
number_of_files = 3
vis = plot_statistics(max_figures=number_of_files)

In [0]:
def make_one_iteration(path_to_texts, path_to_sentiment, output_filename,
                       num_topics=5, chunksize=2000, epochs=1, steps=100,
                       titles=None, plotter=None):
  """Run the whole pipeline for one bank and register its bars with the plotter.

  Trains an LDA model (bigrams enabled), writes the pyLDAvis page, counts positive and
  negative documents per topic, names the topics and adds the result to the plotter.

  Args:
    path_to_texts: pickle with the lemmatized texts.
    path_to_sentiment: CSV with the sentiment labels.
    output_filename: name of the HTML file (without extension) and title of the bank.
    num_topics, chunksize, epochs, steps: LDA training settings.
    titles: topic titles, either a ':'-separated string or an iterable of strings.
      When omitted they are asked for interactively, which blocks an unattended run.
    plotter: object with ``add_one_more_bank``; defaults to the module-level ``vis``.
  """
  one = lda(path_to_texts, path_to_sentiment, True)
  one.create_vocab()
  one.vectorize_texts()
  one.train_ldamodel(num_topics=num_topics, chunksize=chunksize, epochs=epochs, steps=steps)
  print('model for {n} trained successfully'.format(n=output_filename))
  one.write_clusterring_t0_html_file(output_filename)
  print('file for {n} is ready'.format(n=output_filename))
  one.get_cluster_distribution_and_sentiment()
  one.grab_general_statistic()

  if titles is None:
    titles = input('Cluster titles separated by ":": ')
  if isinstance(titles, str):
    titles = [part.strip() for part in titles.split(':')]
  one.rename_clusters(titles)

  x, y, clusters_ids = one.return_statistics_for_ploting()
  if len(clusters_ids) != len(x):
    # Bars are paired with titles by position, so a length mismatch mislabels them.
    print('warning: {t} titles given for {c} clusters'.format(t=len(clusters_ids), c=len(x)))

  target = vis if plotter is None else plotter
  target.add_one_more_bank(x, y, output_filename, clusters_ids)
  print('statistic is updated')

In [30]:
# Inputs for one bank: the pickled texts and the CSV with their sentiment labels.
# Rows of the two files are presumably aligned by position — TODO confirm.
path = '/content/gdrive/My Drive/sentences_replies.pkl'
path_ = '/content/gdrive/My Drive/banki ru csv/otpbank_sentimented.csv'
make_one_iteration(path_to_texts=path, path_to_sentiment=path_, output_filename='otpbank')

model for otpbank trained successfully
file for otpbank is ready
кредиты:платежы по кредитам:ресепшн:отделения:погашение кредитов
statistic is updated


In [0]:
# Render the grouped bar chart for every bank added to `vis` so far.
vis.create_visualization()