<a href="https://colab.research.google.com/github/kristynpantoja/math689project/blob/master/AVITM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch



In [0]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import Parameter
import torch.nn.functional as F
import math

In [0]:
class ProdLDA(nn.Module):

    def __init__(self, net_arch):
        super(ProdLDA, self).__init__()
        ac = net_arch
        self.net_arch = net_arch
        # encoder
        self.en1_fc     = nn.Linear(ac.num_input, ac.en1_units)             # 1995 -> 100
        self.en2_fc     = nn.Linear(ac.en1_units, ac.en2_units)             # 100  -> 100
        self.en2_drop   = nn.Dropout(0.2)
        self.mean_fc    = nn.Linear(ac.en2_units, ac.num_topic)             # 100  -> 50
        self.mean_bn    = nn.BatchNorm1d(ac.num_topic)                      # bn for mean
        self.logvar_fc  = nn.Linear(ac.en2_units, ac.num_topic)             # 100  -> 50
        self.logvar_bn  = nn.BatchNorm1d(ac.num_topic)                      # bn for logvar
        # z
        self.p_drop     = nn.Dropout(0.2)
        # decoder
        self.decoder    = nn.Linear(ac.num_topic, ac.num_input)             # 50   -> 1995
        self.decoder_bn = nn.BatchNorm1d(ac.num_input)                      # bn for decoder
        # prior mean and variance as constant buffers
        prior_mean   = torch.Tensor(1, ac.num_topic).fill_(0)
        prior_var    = torch.Tensor(1, ac.num_topic).fill_(ac.variance)
        prior_logvar = prior_var.log()
        self.register_buffer('prior_mean',    prior_mean)
        self.register_buffer('prior_var',     prior_var)
        self.register_buffer('prior_logvar',  prior_logvar)
        # initialize decoder weight
        if ac.init_mult != 0:
            #std = 1. / math.sqrt( ac.init_mult * (ac.num_topic + ac.num_input))
            self.decoder.weight.data.uniform_(0, ac.init_mult)
        # remove BN's scale parameters
#         self.logvar_bn .register_parameter('weight', None)
#         self.mean_bn   .register_parameter('weight', None)
#         self.decoder_bn.register_parameter('weight', None)
#         self.decoder_bn.register_parameter('weight', None)

    def forward(self, input, compute_loss=False, avg_loss=True):
        # compute posterior
        assert input.shape[1] == doc_term_matrix_tensor.shape[1], "input isn't batch size x vocab size"
        en1 = F.softplus(self.en1_fc(input))                            # en1_fc   output
        en2 = F.softplus(self.en2_fc(en1))                              # encoder2 output
        en2 = self.en2_drop(en2)
        posterior_mean   = self.mean_bn  (self.mean_fc  (en2))          # posterior mean
        posterior_logvar = self.logvar_bn(self.logvar_fc(en2))          # posterior log variance
        posterior_var    = posterior_logvar.exp()
        # take sample
        eps = Variable(input.data.new().resize_as_(posterior_mean.data).normal_()) # noise
        z = posterior_mean + posterior_var.sqrt() * eps                 # reparameterization
        assert z.shape[1] == self.net_arch.num_topic, "hidden variable z (from TR) isn't batch size x num_topic"
        p = F.softmax(z)                                                # mixture probability
        p = self.p_drop(p)
        assert p.shape[1] == self.net_arch.num_topic, "p (theta) isn't same size as z"
        # do reconstruction
        recon = F.softmax(self.decoder_bn(self.decoder(p)))             # reconstructed distribution over vocabulary
        assert input.shape[1] == doc_term_matrix_tensor.shape[1], "output isn't batch size x vocab size"
        
        if compute_loss:
            return recon, self.loss(input, recon, posterior_mean, posterior_logvar, posterior_var, avg_loss)
        else:
            return recon

    def loss(self, input, recon, posterior_mean, posterior_logvar, posterior_var, avg=True):
        # NL
        NL  = -(input * (recon+1e-10).log()).sum(1) # vector with batch-size number of elements
        # KLD, see Section 3.3 of Akash Srivastava and Charles Sutton, 2017, 
        # https://arxiv.org/pdf/1703.01488.pdf
        prior_mean   = Variable(self.prior_mean).expand_as(posterior_mean) # batch-size x num_topics
        prior_var    = Variable(self.prior_var).expand_as(posterior_mean)
        prior_logvar = Variable(self.prior_logvar).expand_as(posterior_mean)
        var_division    = posterior_var  / prior_var
        diff            = posterior_mean - prior_mean
        diff_term       = diff * diff / prior_var
        logvar_division = prior_logvar - posterior_logvar
        # put KLD together
        KLD = 0.5 * ( (var_division + diff_term + logvar_division).sum(1) - self.net_arch.num_topic )
#         print(KLD.mean())
        # loss
        loss = (NL + KLD)
        # in traiming mode, return averaged loss. In testing mode, return individual loss
        if avg:
            return loss.mean() # averaged over all the documents in the batch (1/batch_size)*sum
        else:
            return loss
          
def train():
    for epoch in range(args.num_epoch):
        all_indices = torch.randperm(doc_term_matrix_tensor.size(0)).split(args.batch_size)
        loss_epoch = 0.0
        model.train()                   # switch to training mode
        for batch_indices in all_indices:
            if not args.nogpu: batch_indices = batch_indices.cuda()
            input = Variable(doc_term_matrix_tensor[batch_indices])
#             print(batch_indices.shape)
#             print(input.shape)
            recon, loss = model(input, compute_loss=True)
            # optimize
            optimizer.zero_grad()       # clear previous gradients
            loss.backward()             # backprop
            optimizer.step()            # update parameters
            # report
            loss_epoch += loss.data[0]    # add loss to loss_epoch, then take the average in the print statement
        if epoch % 5 == 0:
            print('Epoch {}, loss={}'.format(epoch, loss_epoch / len(all_indices)))

In [0]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
# categories = ['talk.politics.guns', 'sci.space', 'soc.religion.christian',
#               'misc.forsale', 'rec.sport.baseball', 'comp.sys.mac.hardware']
categories = ['talk.politics.guns', 'sci.space']
# newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_train = fetch_20newsgroups(subset='train')


In [0]:
vectorizer = CountVectorizer(stop_words = 'english', min_df=.01, max_df=0.9, 
                             token_pattern = u'(?ui)\\b[a-z]{3,}\\b')
count_vecs = vectorizer.fit_transform(newsgroups_train.data)
doc_term_matrix = count_vecs.toarray()
doc_term_matrix.shape # number of documents, number of words (in vocab)
tokenizer = vectorizer.build_tokenizer()

In [41]:
doc_term_matrix.shape


(11314, 1739)

In [17]:
vectorizer.get_feature_names()

['ability',
 'able',
 'absolute',
 'absolutely',
 'academic',
 'accept',
 'acceptable',
 'accepted',
 'access',
 'according',
 'account',
 'accurate',
 'acs',
 'act',
 'action',
 'actions',
 'active',
 'activities',
 'activity',
 'acts',
 'actual',
 'actually',
 'adams',
 'add',
 'added',
 'addition',
 'additional',
 'address',
 'administration',
 'admit',
 'advance',
 'advanced',
 'advantage',
 'advice',
 'afraid',
 'age',
 'agencies',
 'agency',
 'ago',
 'agree',
 'ahead',
 'air',
 'alan',
 'algorithm',
 'alive',
 'allen',
 'allow',
 'allowed',
 'allows',
 'alt',
 'alternative',
 'amendment',
 'america',
 'american',
 'americans',
 'analysis',
 'andrew',
 'andy',
 'angeles',
 'announcement',
 'anonymous',
 'answer',
 'answers',
 'anti',
 'anybody',
 'anymore',
 'apart',
 'apparently',
 'appear',
 'appears',
 'apple',
 'application',
 'applications',
 'applied',
 'apply',
 'appreciate',
 'appreciated',
 'approach',
 'appropriate',
 'apr',
 'april',
 'arab',
 'archive',
 'area',
 'area

In [15]:
vectorizer.vocabulary_

{'umd': 1618,
 'edu': 493,
 'thing': 1564,
 'car': 219,
 'nntp': 1050,
 'posting': 1179,
 'host': 741,
 'university': 1628,
 'college': 290,
 'park': 1111,
 'wondering': 1712,
 'saw': 1361,
 'day': 393,
 'door': 466,
 'looked': 907,
 'late': 856,
 'early': 483,
 'called': 210,
 'really': 1265,
 'small': 1433,
 'addition': 25,
 'separate': 1393,
 'rest': 1321,
 'body': 175,
 'know': 842,
 'model': 995,
 'engine': 509,
 'years': 1733,
 'history': 732,
 'info': 777,
 'looking': 908,
 'mail': 928,
 'thanks': 1562,
 'brought': 194,
 'washington': 1679,
 'guy': 691,
 'clock': 282,
 'final': 598,
 'summary': 1515,
 'reports': 1306,
 'keywords': 834,
 'upgrade': 1632,
 'article': 96,
 'fair': 568,
 'number': 1063,
 'send': 1389,
 'message': 970,
 'speed': 1463,
 'cpu': 364,
 'add': 23,
 'cards': 221,
 'hour': 743,
 'floppy': 609,
 'disk': 451,
 'especially': 526,
 'days': 394,
 'network': 1041,
 'knowledge': 844,
 'base': 133,
 'haven': 710,
 'ecn': 490,
 'purdue': 1235,
 'thomas': 1569,
 'que

In [0]:
doc_term_matrix_tensor = torch.from_numpy(doc_term_matrix).float()

In [21]:
doc_term_matrix_tensor.shape

torch.Size([11314, 1739])

In [0]:
import argparse
from types import SimpleNamespace

In [0]:
args_dict = {"en1_units" : 100, "en2_units" : 100, "num_topic" : 50, 
             "batch_size" : 200, "optimizer" : 80, "learning_rate" : 0.002, 
             "momentum" : 0.99, "num_epoch" : 80, "init_mult" : 1, 
             "variance" : 0.995, "start" : True, "nogpu" : True}
args = SimpleNamespace(**args_dict)
args.num_input = doc_term_matrix_tensor.shape[1]

In [0]:
model = ProdLDA(args)

In [0]:
optimizer = torch.optim.Adam(model.parameters(), args.learning_rate, betas=(args.momentum, 0.999))

In [25]:
train()



Epoch 0, loss=627.4693603515625
Epoch 5, loss=580.0742797851562
Epoch 10, loss=566.3651123046875
Epoch 15, loss=558.7833862304688
Epoch 20, loss=552.7109375
Epoch 25, loss=549.6472778320312
Epoch 30, loss=547.4059448242188
Epoch 35, loss=545.6531372070312
Epoch 40, loss=543.2401733398438
Epoch 45, loss=541.9542846679688
Epoch 50, loss=540.72607421875
Epoch 55, loss=539.7135009765625
Epoch 60, loss=540.2828369140625
Epoch 65, loss=537.5484008789062
Epoch 70, loss=538.0990600585938
Epoch 75, loss=535.3045043945312


In [0]:
associations = {
    'jesus': ['prophet', 'jesus', 'matthew', 'christ', 'worship', 'church'],
    'comp ': ['floppy', 'windows', 'microsoft', 'monitor', 'workstation', 'macintosh', 
              'printer', 'programmer', 'colormap', 'scsi', 'jpeg', 'compression'],
    'car  ': ['wheel', 'tire'],
    'polit': ['amendment', 'libert', 'regulation', 'president'],
    'crime': ['violent', 'homicide', 'rape'],
    'midea': ['lebanese', 'israel', 'lebanon', 'palest'],
    'sport': ['coach', 'hitter', 'pitch'],
    'gears': ['helmet', 'bike'],
    'nasa ': ['orbit', 'spacecraft'],
}
def identify_topic_in_line(line):
    topics = []
    for topic, keywords in associations.items():
        for word in keywords:
            if word in line:
                topics.append(topic)
                break
    return topics
def print_top_words(beta, feature_names, n_top_words=10):
    print('---------------Printing the Topics------------------')
    for i in range(len(beta)):
        line = " ".join([feature_names[j] 
                            for j in beta[i].argsort()[:-n_top_words - 1:-1]])
        topics = identify_topic_in_line(line)
        print('|'.join(topics))
        print('     {}'.format(line))
    print('---------------End of Topics------------------')

In [16]:
sorted(vectorizer.vocabulary_, key = lambda x: x[1])

['car',
 'park',
 'saw',
 'day',
 'late',
 'early',
 'called',
 'mail',
 'washington',
 'fair',
 'cards',
 'days',
 'base',
 'haven',
 'mac',
 'gave',
 'way',
 'market',
 'machine',
 'maybe',
 'make',
 'taking',
 'machines',
 'daily',
 'dangerous',
 'far',
 'harvard',
 'launch',
 'cambridge',
 'basically',
 'values',
 'lawrence',
 'makes',
 'mass',
 'say',
 'hard',
 'gas',
 'hands',
 'says',
 'easily',
 'later',
 'magazine',
 'faster',
 'range',
 'fast',
 'data',
 'facts',
 'said',
 'hardware',
 'fault',
 'want',
 'david',
 'james',
 'main',
 'saying',
 'case',
 'fall',
 'man',
 'wants',
 'hand',
 'faith',
 'happy',
 'water',
 'packard',
 'east',
 'nasa',
 'vax',
 'mark',
 'based',
 'management',
 'lab',
 'language',
 'major',
 'sale',
 'tape',
 'happened',
 'war',
 'caused',
 'making',
 'save',
 'fact',
 'page',
 'takes',
 'handle',
 'rates',
 'cars',
 'paying',
 'taken',
 'rate',
 'pay',
 'dan',
 'damn',
 'law',
 'calls',
 'handling',
 'happen',
 'navy',
 'california',
 'san',
 'date

In [27]:
emb = model.decoder.weight.data.cpu().numpy().T
print("shape of beta is " + str(emb.shape))
print_top_words(emb, vectorizer.get_feature_names())

shape of beta is (50, 1739)
---------------Printing the Topics------------------
polit|nasa 
     mass countries health orbit forces budget land president german continue

     waco laws employer weapons land country batf criminal damage shouldn

     scheme nsa secure proposal enforcement crypto trust keys agencies encryption
comp 
     thanks appreciated nntp advance newsreader originator keywords appreciate windows copy

     afraid soldiers morning told moment come brought turned soviet later
comp 
     advance windows thanks sale brand video appreciated monitor ram electrical
comp 
     monitor sale thanks appreciated brand video purdue windows advance nntp

     max technical flight nasa applications functions design scheme sci additional

     privacy anonymous messages methods expires approach related frequently standards established

     angeles period brown van won goals detroit florida round diego

     nsa keys crypto corporation clipper scheme fred electronics brother wri

In [31]:
beta = model.decoder.weight.detach()#.softmax(0)
print(beta.sum(0))
print(beta.shape)
_, ind = torch.sort(beta, 0)
print(ind.shape)
# ind.numpy()[0:50, 0] - ind.numpy()[0:50, 1]
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 0])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 1])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 2])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 3])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 4])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 5])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 6])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 7])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 8])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 9])

# print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:25, 4])

tensor([691.0438, 749.8492, 650.2081, 758.7484, 738.9827, 847.6075, 663.5479,
        851.0903, 808.6369, 837.2359, 813.2128, 679.8199, 592.9649, 769.5212,
        881.0038, 834.2672, 849.1828, 854.6403, 863.4843, 860.0076, 687.0235,
        715.2632, 702.8358, 867.0287, 775.7951, 790.4180, 842.8163, 840.6353,
        823.6600, 806.1698, 751.1285, 817.8627, 761.5077, 688.9834, 677.0889,
        675.3782, 781.6658, 829.0858, 625.1631, 790.7856, 818.1710, 876.3959,
        644.7657, 797.1841, 760.3964, 860.3668, 661.1928, 840.3171, 735.9488,
        763.0490])
torch.Size([1739, 50])
torch.Size([1739, 50])
['normal' 'protect' 'kill' 'dave' 'function' 'cost' 'unfortunately'
 'years' 'tell' 'normally' 'answer' 'navy' 'background' 'saw' 'room'
 'keeping' 'uses' 'sent' 'brought' 'wait' 'readers' 'issues' 'members'
 'order' 'steve']
['resources' 'equivalent' 'looking' 'errors' 'notice' 'includes' 'size'
 'lives' 'models' 'project' 'face' 'gateway' 'books' 'possibly' 'options'
 'make' 'earth' '

In [28]:
print(model.decoder.weight)
model.decoder.weight.data.cpu()

Parameter containing:
tensor([[ 0.4314,  0.5264,  1.0159,  ...,  0.0784,  0.3509,  0.1671],
        [ 0.3274,  0.4230,  0.8888,  ...,  0.6871,  0.3397,  0.6870],
        [ 0.4052,  0.5001,  0.6305,  ..., -0.3455,  0.1195, -0.1685],
        ...,
        [ 0.7040,  0.5514,  0.0681,  ...,  0.6256,  0.9422,  0.6802],
        [ 0.8155,  0.5323,  0.4090,  ...,  0.1142,  0.6618,  0.3802],
        [ 0.5573,  0.6776,  0.7772,  ...,  0.4308,  0.1695,  0.6157]],
       requires_grad=True)


tensor([[ 0.4314,  0.5264,  1.0159,  ...,  0.0784,  0.3509,  0.1671],
        [ 0.3274,  0.4230,  0.8888,  ...,  0.6871,  0.3397,  0.6870],
        [ 0.4052,  0.5001,  0.6305,  ..., -0.3455,  0.1195, -0.1685],
        ...,
        [ 0.7040,  0.5514,  0.0681,  ...,  0.6256,  0.9422,  0.6802],
        [ 0.8155,  0.5323,  0.4090,  ...,  0.1142,  0.6618,  0.3802],
        [ 0.5573,  0.6776,  0.7772,  ...,  0.4308,  0.1695,  0.6157]])

In [0]:
from itertools import product

def topic_coherence(beta, M, doc_term_matrix):
  K = beta.shape[1] # beta has dim V x K
  coherences = np.zeros(K)
  for t in range(K):
    index = np.argsort(-beta[:, t])[0:M]
    cart_prod = product(list(index), list(index))
    for ind1, ind2 in cart_prod:
      if ind1 == ind2:
        pass
      else:
        d_ind1 = (doc_term_matrix[:, ind1] > 0).sum()
        d_ind12 = ((doc_term_matrix[:, ind1] > 0) & (doc_term_matrix[:, ind2] > 0)).sum()
        coherences[t] += np.log1p(d_ind12) - np.log(d_ind1)

  return coherences

In [32]:
topic_coherence(model.decoder.weight.detach().numpy(), 20, doc_term_matrix)

array([ -940.68350986, -1089.19443133,  -665.23130023, -1047.9917143 ,
        -852.52685953, -1072.6192775 , -1129.08834205,  -986.63665234,
        -829.32375734,  -980.42470808, -1096.39154089, -1029.00389363,
        -957.09954657, -1068.115061  ,  -771.54474273,  -970.96084381,
        -894.44746154,  -922.56248175, -1286.42392919, -1043.45535985,
       -1117.77588538, -1201.08572892,  -913.05262934, -1145.45879259,
       -1170.2156843 ,  -810.48440123, -1052.76522774,  -837.45443347,
       -1013.76198521, -1012.04954999, -1293.52363687, -1222.26438538,
        -801.18408376, -1236.8629379 ,  -620.26362619,  -936.66689171,
       -1186.84567909,  -921.13844451, -1046.67030525,  -890.10525698,
        -904.500521  , -1192.37547215, -1243.65945156, -1144.67759423,
        -937.58385319, -1006.77922387,  -852.35614178,  -999.77922699,
        -886.12318095,  -998.90587616])

In [338]:
sorted(vectorizer.vocabulary_, key = lambda x: x[1])[0]

'car'

In [305]:
vocab = {"hi": 13, "bye": 2, "hello": 3}
foo = zip(*sorted(vocab.items(), key = lambda x: x[1]))
list(foo)

[('bye', 'hello', 'hi'), (2, 3, 13)]

In [0]:
def print_top_words(beta, feature_names, n_top_words=10):
    print('---------------Printing the Topics------------------')
    for i in range(len(beta)): # for all the rows (words in vocab) in beta,
        line = " ".join([feature_names[j] 
                            for j in beta[i].argsort()[:-n_top_words - 1:-1]])
#         topics = identify_topic_in_line(line)
#         print('|'.join(topics))
        print('     {}'.format(line))
    print('---------------End of Topics------------------')

In [63]:
print_top_words(beta.numpy().T, sorted(vectorizer.vocabulary_, key = lambda x: x[1]))

---------------Printing the Topics------------------
     view bear normal false fear waco arms mind english lot
     bit fact choice circuit budget trouble communication technical jews leads
     away mind happened america fear univ pick looking resources changes
     thread equal christ bet claim mil important greatly facts player
     carried store microsoft race helpful tax methods million england logic
     parts suggest works poster distribution german capable circuit serial peter
     trouble electrical bit fact choice field facts hot approach flame
     christ jesus start images face mil purchase toronto exists mike
     written wants newsgroups gas vice refer discussed carnegie brian faster
     face kept christ turkish studies type bother apart start current
     connection carried early tax blue million school large documentation background
     state trouble past bit recognize driving people fact communication accepted
     serial references athos circuit poster lack happen

['car',
 'park',
 'saw',
 'day',
 'late',
 'early',
 'called',
 'mail',
 'washington',
 'fair',
 'cards',
 'days',
 'base',
 'haven',
 'mac',
 'gave',
 'way',
 'market',
 'machine',
 'maybe',
 'make',
 'taking',
 'machines',
 'daily',
 'dangerous',
 'far',
 'harvard',
 'launch',
 'cambridge',
 'basically',
 'values',
 'lawrence',
 'makes',
 'mass',
 'say',
 'hard',
 'gas',
 'hands',
 'says',
 'easily',
 'later',
 'magazine',
 'faster',
 'range',
 'fast',
 'data',
 'facts',
 'said',
 'hardware',
 'fault',
 'want',
 'david',
 'james',
 'main',
 'saying',
 'case',
 'fall',
 'man',
 'wants',
 'hand',
 'faith',
 'happy',
 'water',
 'packard',
 'east',
 'nasa',
 'vax',
 'mark',
 'based',
 'management',
 'lab',
 'language',
 'major',
 'sale',
 'tape',
 'happened',
 'war',
 'caused',
 'making',
 'save',
 'fact',
 'page',
 'takes',
 'handle',
 'rates',
 'cars',
 'paying',
 'taken',
 'rate',
 'pay',
 'dan',
 'damn',
 'law',
 'calls',
 'handling',
 'happen',
 'navy',
 'california',
 'san',
 'date

In [341]:
np.bincount(np.array([4,6,3,6,8,2,6,78,89,5]))

array([0, 0, 1, 1, 1, 1, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1])

In [316]:
"?".join("bsc")

'b?s?c'

In [0]:
doc_term_matrix.