In [10]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer

# Setup text df of text

In [14]:
corpus_dict1 = {'a': 'This is the first document.',
          'b': 'This is the second second document.',
          'c': 'And the third one.',
          'd': 'Is this the first document?'}

df1 = pandas.DataFrame(corpus_dict1.items(), columns=['id', 'text'])

In [15]:
df1['popular'] = True

In [16]:
corpus_dict2 = {'e': 'more words.',
          'f': 'some junk.',
          'g': 'will this  end.',
          'h': 'when is lunch?'}

df2 = pandas.DataFrame(corpus_dict2.items(), columns=['id', 'text'])

In [17]:
df2['popular'] = False

In [21]:
pandas.concat([df1, df2])

Unnamed: 0,id,text,popular
0,a,This is the first document.,True
1,c,And the third one.,True
2,b,This is the second second document.,True
3,d,Is this the first document?,True
0,h,when is lunch?,False
1,e,more words.,False
2,g,will this end.,False
3,f,some junk.,False


In [4]:
# get list of 'text' column
list_of_texts = df['text'].tolist()
print(list_of_texts)

['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?']


# Setup scikit vectorizer

In [5]:
vectorizer = CountVectorizer(min_df=1)
term_doc_matrix = vectorizer.fit_transform(list_of_texts)

In [6]:
vectorizer.get_feature_names()

[u'and',
 u'document',
 u'first',
 u'is',
 u'one',
 u'second',
 u'the',
 u'third',
 u'this']

In [7]:
print(vectorizer.get_feature_names())
print(term_doc_matrix.toarray())

[u'and', u'document', u'first', u'is', u'one', u'second', u'the', u'third', u'this']
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


In [8]:
# Put BoW vectors into a new df
df_bow = pandas.DataFrame(term_doc_matrix.toarray(), columns=vectorizer.get_feature_names())

In [9]:
print(df_bow)

   and  document  first  is  one  second  the  third  this
0    0         1      1   1    0       0    1      0     1
1    0         1      0   1    0       2    1      0     1
2    1         0      0   0    1       0    1      1     0
3    0         1      1   1    0       0    1      0     1


# Merge two dfs

In [10]:
# Important: Make sure the concat() function uses the original id index of the first, text datafram
result = pandas.concat([df, df_bow], axis=1, join_axes=[df.index])

In [12]:
result

Unnamed: 0,id,text,and,document,first,is,one,second,the,third,this
0,1,This is the first document.,0,1,1,1,0,0,1,0,1
1,2,This is the second second document.,0,1,0,1,0,2,1,0,1
2,3,And the third one.,1,0,0,0,1,0,1,1,0
3,4,Is this the first document?,0,1,1,1,0,0,1,0,1
