### Load data

In [52]:
import numpy as np

with open('./sentiment_labelled_sentences/imdb_labelled.txt', 'r') as f:
    lines = f.readlines()
    imdb_pos_comments = []
    imdb_neg_comments = []
    for i in range(len(lines)):
        split = lines[i].split("\t")
        label = int(split[1])
        if label == 0:
            imdb_neg_comments.append(split[0])
        else:
            imdb_pos_comments.append(split[0])

print(len(imdb_pos_comments), len(imdb_neg_comments))

500 500


In [53]:
with open('./sentiment_labelled_sentences/amazon_cells_labelled.txt', 'r') as f:
    lines = f.readlines()
    amazon_pos_comments = []
    amazon_neg_comments = []
    for i in range(len(lines)):
        split = lines[i].split("\t")
        label = int(split[1])
        if label == 0:
            amazon_neg_comments.append(split[0])
        else:
            amazon_pos_comments.append(split[0])

print(len(amazon_pos_comments), len(amazon_neg_comments))

500 500


In [54]:
with open('./sentiment_labelled_sentences/yelp_labelled.txt', 'r') as f:
    lines = f.readlines()
    yelp_pos_comments = []
    yelp_neg_comments = []
    for i in range(len(lines)):
        split = lines[i].split("\t")
        label = int(split[1])
        if label == 0:
            yelp_neg_comments.append(split[0])
        else:
            yelp_pos_comments.append(split[0])

print(len(yelp_pos_comments), len(yelp_neg_comments))

500 500


### Preprocessing for training and testing data

In [56]:
# pool
train_pos_comments = imdb_pos_comments[:400] + amazon_pos_comments[:400] + yelp_pos_comments[:400]
test_pos_comments = imdb_pos_comments[400:] + amazon_pos_comments[400:] + yelp_pos_comments[400:]
train_neg_comments = imdb_neg_comments[:400] + amazon_neg_comments[:400] + yelp_neg_comments[:400]
test_neg_comments = imdb_neg_comments[400:] + amazon_neg_comments[400:] + yelp_neg_comments[400:]

comments = [train_pos_comments, test_pos_comments, train_neg_comments, test_neg_comments]
print([len(comments[i]) for i in range(len(comments))])

[1200, 300, 1200, 300]


In [57]:
# strip punctuation
# to lowercase

import pprint
import string, re
regex = re.compile('[%s]' % re.escape(string.punctuation))
comments = [[regex.sub('', comment.lower()) for comment in comment_group] for comment_group in comments]
pprint.pprint(comments[0][:10])

['the best scene in the movie was when gerardo is trying to find a song that '
 'keeps running through his head  ',
 'saw the movie today and thought it was a good effort good messages for '
 'kids  ',
 'loved the casting of jimmy buffet as the science teacher  ',
 'and those baby owls were adorable  ',
 'the movie showed a lot of florida at its best made it look very appealing  ',
 'the songs were the best and the muppets were so hilarious  ',
 'it was so cool  ',
 'this is a very right on case movie that delivers everything almost right in '
 'your face  ',
 'this review is long overdue since i consider a tale of two sisters to be the '
 'single greatest film ever made  ',
 'ill put this gem up against any movie in terms of screenplay cinematography '
 'acting postproduction editing directing or any other aspect of filmmaking  ']


In [60]:
# lemmatization

from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

lmtzr = WordNetLemmatizer()
lemmatized = [[[lmtzr.lemmatize(word, "v") for word in word_tokenize(s)]
              for s in comment_group] for comment_group in comments]
pprint.pprint(lemmatized[0][:10])

# lemmatizer not removing -ing because it assumes the word to be a noun.
# "s" in "was" was removed becasue it assumes it to be plural form.
# if we treat every word as verb, the "s" will also be removed because it will be considered as third person singular.

[['the',
  'best',
  'scene',
  'in',
  'the',
  'movie',
  'be',
  'when',
  'gerardo',
  'be',
  'try',
  'to',
  'find',
  'a',
  'song',
  'that',
  'keep',
  'run',
  'through',
  'his',
  'head'],
 ['saw',
  'the',
  'movie',
  'today',
  'and',
  'think',
  'it',
  'be',
  'a',
  'good',
  'effort',
  'good',
  'message',
  'for',
  'kid'],
 ['love',
  'the',
  'cast',
  'of',
  'jimmy',
  'buffet',
  'as',
  'the',
  'science',
  'teacher'],
 ['and', 'those', 'baby', 'owls', 'be', 'adorable'],
 ['the',
  'movie',
  'show',
  'a',
  'lot',
  'of',
  'florida',
  'at',
  'its',
  'best',
  'make',
  'it',
  'look',
  'very',
  'appeal'],
 ['the',
  'songs',
  'be',
  'the',
  'best',
  'and',
  'the',
  'muppets',
  'be',
  'so',
  'hilarious'],
 ['it', 'be', 'so', 'cool'],
 ['this',
  'be',
  'a',
  'very',
  'right',
  'on',
  'case',
  'movie',
  'that',
  'deliver',
  'everything',
  'almost',
  'right',
  'in',
  'your',
  'face'],
 ['this',
  'review',
  'be',
  'long',
  '

In [65]:
# eliminate duplicate
# modified the stopwords 

from nltk.corpus import stopwords

filtered = [[[word for word in set(word_list) if word not in stopwords.words('english_mod')] for word_list in comment_group] for comment_group in lemmatized]
pprint.pprint(filtered[:10])

# deleted stopwords: but, no, nor, not, don, don'tain, aren, aren't, couldn, couldn't, didn, didn't, doesn
# doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mustn't, needn
# needn't, shan, shan't, shouldn, shouldn't, wasn, wasn't, weren, weren't, won, won't, wouldn, wouldn't

[[['best',
   'gerardo',
   'song',
   'head',
   'run',
   'find',
   'keep',
   'movie',
   'try',
   'scene'],
  ['today', 'message', 'think', 'effort', 'saw', 'kid', 'good', 'movie'],
  ['science', 'jimmy', 'teacher', 'cast', 'buffet', 'love'],
  ['baby', 'adorable', 'owls'],
  ['lot', 'look', 'show', 'florida', 'appeal', 'make', 'movie', 'best'],
  ['hilarious', 'songs', 'muppets', 'best'],
  ['cool'],
  ['deliver', 'almost', 'everything', 'right', 'face', 'movie', 'case'],
  ['consider',
   'long',
   'single',
   'ever',
   'sisters',
   'review',
   'film',
   'overdue',
   'since',
   'greatest',
   'make',
   'tale',
   'two'],
  ['screenplay',
   'act',
   'movie',
   'cinematography',
   'edit',
   'term',
   'put',
   'ill',
   'direct',
   'postproduction',
   'aspect',
   'filmmaking',
   'gem'],
  ['masterpiece',
   'true',
   'masterpieces',
   'perfect',
   'faux',
   'practically',
   '\x96',
   'sea'],
  ['cinema', 'easily', 'tightly', 'construct', 'film', 'history'

  ['townsend',
   'vampire',
   'stuart',
   'fact',
   'like',
   'attempt',
   'interview',
   'better',
   'cruise',
   'lestat'],
  ['entrance',
   'dance',
   'first',
   'akasha',
   'compel',
   'aailiyah',
   'mini',
   'pretty',
   'good',
   'place',
   'scene'],
  ['mostly',
   'style',
   'fan',
   'anne',
   'sensitivities',
   'big',
   'due',
   'im',
   'series',
   'rice',
   'treatments'],
  ['like', 'believable', 'guess', 'dysfunctionhe', 'detail'],
  ['think', 'act', 'skilled', 'but'],
  ['meredith', 'right', 'better'],
  ['charm', 'film', 'wonderful', 'heart', 'sentiment'],
  ['time',
   'tell',
   'tale',
   'rare',
   'indulgent',
   'fall',
   'filmmaker',
   'moral',
   'trap',
   'take',
   'syrupy',
   'worthy',
   'overly',
   'doesnt',
   'love',
   'care'],
  ['nine', 'film', 'truly', 'lovely', 'ten'],
  ['good',
   'addition',
   'subgenre',
   'giallo',
   'film',
   'early',
   'goremeister',
   'future',
   'lucio',
   'fulci'],
  ['italian', '70s', 'e

   'night'],
  ['service', 'ambiance', 'food', 'phenomenal'],
  ['belly',
   'definitely',
   'venture',
   'time',
   'vegas',
   'return',
   'im',
   'next',
   'worth',
   'strip',
   'pork'],
  ['excellent', 'vodka', 'penne'],
  ['tasty',
   'crispy',
   'selection',
   'massive',
   'include',
   'food',
   'meatloaf',
   'chicken',
   'delish',
   'tuna',
   'melt',
   'sandwich',
   'good',
   'burgers',
   'wrap'],
  ['caper',
   'delicious',
   'lox',
   'bagels',
   'nyc',
   'real',
   'selections',
   'good',
   'even',
   'cream',
   'cheese'],
  ['every',
   'not',
   'meet',
   'fact',
   'great',
   'come',
   'expectations',
   'subway',
   'good'],
  ['solid', 'breakfast', 'seriously'],
  ['bar', 'food', 'vegas', 'one', 'best'],
  ['drink', 'really', 'great', 'never', 'empty', 'menu', 'make', 'suggestions'],
  ['blanket',
   'subpar',
   'moz',
   'nice',
   'like',
   'cover',
   'food',
   'feel',
   'top',
   'but'],
  ['clean', 'decorate', 'bathrooms', 'well', 'p

   'look',
   'cliff',
   'childhood',
   'ue',
   'eighth',
   'wonderfully',
   'hayao',
   'studio',
   'gake',
   'ponyo'],
  ['time',
   'film',
   'seem',
   'tell',
   'miyazaki',
   'stories',
   'refresh',
   'disneypixars',
   'handdrawn',
   'rely',
   'charm',
   'still',
   'animation',
   'traditional',
   'comfort',
   'enchant',
   'cgi',
   'masterpieces',
   'dominate',
   'know'],
  ['enough', 'not', 'film', 'animation', 'remarkable', 'say'],
  ['appearance',
   'style',
   'wonderfully',
   'fanciful',
   'art',
   'crayonpencil',
   'draw',
   'colorful'],
  ['frost',
   'end',
   'depict',
   'great',
   'sea',
   'super',
   'take',
   'still',
   'mighty',
   'make',
   'vessel'],
  ['consider',
   'look',
   'bonuses',
   'solid',
   'film',
   'add',
   'act',
   'excellent',
   'story'],
  ['sundays',
   '2005',
   'time',
   '20th',
   'watch',
   'ago',
   'entire',
   'march',
   'begin',
   'really',
   'enjoy',
   'tap',
   'thing',
   'two'],
  ['tv', '

  ['not', 'film', 'im', 'try', 'sure'],
  ['disturb',
   'guess',
   'turn',
   'memories',
   'succeed',
   'feel',
   'day',
   'good',
   'night'],
  ['force', 'like', 'everything', 'movie'],
  ['lame'],
  ['really', 'pied', 'movie', 'heres'],
  ['get', 'camerawork', 'dont', 'start', 'even', 'jerky'],
  ['think', 'saw', 'sick', 'theater', 'go'],
  ['witty', 'summary', 'werent', 'witticisms'],
  ['plot', 'id', 'let', 'well', 'say', 'one', 'go'],
  ['bad', 'act'],
  ['really', 'bad'],
  ['script', 'couldnt', 'rise', 'billy', 'bob', 'even', 'worse'],
  ['camerawork', 'bad'],
  ['rat', '10', '1'],
  ['watch', 'film', 'vomit', 'literally'],
  ['live',
   'find',
   'im',
   'movie',
   'offensive',
   'first',
   '5year',
   'intelligence',
   'work',
   'translate',
   'experience',
   'movies'],
  ['accurately',
   'reviewer',
   'often',
   'scenes',
   'movie',
   'someone',
   'get',
   'crap',
   'bunch',
   'kind',
   'but',
   'muddle',
   'story',
   'deliver',
   'strive',
   '

   'get',
   'alone',
   'quality',
   'buy',
   'defect',
   'risk',
   'but',
   'would',
   'might'],
  ['small', 'install', 'make', 'difficult', 'case'],
  ['not', 'fit'],
  ['really', 'bad', 'headset', 'nothing', 'theres', 'say'],
  ['bed',
   'wifi',
   'even',
   'leave',
   'go',
   'blue',
   'turn',
   'notice',
   'tooth',
   'morning',
   'charge',
   '20',
   'fully'],
  ['always',
   'card',
   'memory',
   'turn',
   'ive',
   'also',
   'problems',
   'phone',
   'read'],
  ['not', 'hat', 'wear', 'sunglasses', 'good'],
  ['phone', 'horrible'],
  ['battery', 'junk', 'bt50'],
  ['get', 'cant', 'software', 'work', 'computer'],
  ['disappoint'],
  ['clarity', 'voice', 'poor'],
  ['buyers', 'no', 'remorse', 'one'],
  ['accessoryone', 'disappoint'],
  ['phone',
   'carriers',
   'perhaps',
   'change',
   'inexcusable',
   'find',
   'return',
   'probably'],
  ['cumbersome', 'procedure', 'update', 'difficult'],
  ['disappointment', 'hate', 'ear', 'go', 'anything'],
  ['uncom

  ['much', 'price', 'think', 'go', 'rather', 'place', 'would'],
  ['service', 'fair', 'best'],
  ['underservices',
   'kabuki',
   'sushi',
   'overhip',
   'find',
   'but',
   'love',
   'overprice'],
  ['stay', 'favor', 'away', 'dish'],
  ['service', 'poor'],
  ['table', 'no', 'think', 'average', 'food', 'one', 'wait', 'worth'],
  ['pay', 'not', 'server', 'terrible', 'job', 'bill', 'tip', 'felt', 'but'],
  ['much',
   'never',
   'article',
   'food',
   'spice',
   'flavor',
   'focus',
   'bland',
   'consider',
   'read',
   'surprise'],
  ['small', 'food', 'fuck', 'portion', 'way', 'overprice'],
  ['really', 'buck', 'head', 'food', 'expect', 'better', '40'],
  ['back', 'wont'],
  ['not', 'dirty', 'believe', 'oyster', 'could'],
  ['no', 'star', 'place', 'deserve'],
  ['not', 'recommend', 'place', 'would'],
  ['qualify',
   'taste',
   'ever',
   'disbelief',
   'worst',
   'foods',
   'version',
   'dish'],
  ['wash',
   'low',
   'service',
   'people',
   'otherwise',
   'custo

### Construct feature vectors

In [103]:
train_comments = filtered[0] + filtered[2]
train_labels = [1]*1200 + [0]*1200
test_comments = filtered[1] + filtered[3]
test_labels = [1]*300 + [0]*300
print(len(train_comments), len(test_comments))

2400 600


In [72]:
word_set = set()
for comment in train_comments:
    for word in comment:
        word_set.add(word)
print(len(word_set))

3975


In [87]:
import pandas as pd

train_f = np.zeros((len(train_comments), len(word_set)))
test_f = np.zeros((len(test_comments), len(word_set)))

train_df = pd.DataFrame(train_f, columns=word_set)
print(train_df.shape)
for i, comment in enumerate(train_comments):
    for word in comment:
        train_df.iloc[i][word] += 1
train_df = train_df.astype('int32')
print(train_df.iloc[0], train_df.iloc[1])

test_df = pd.DataFrame(test_f, columns=word_set)
print(test_df.shape)
for i, comment in enumerate(test_comments):
    for word in comment:
        if word in test_df.columns:
            test_df.iloc[i][word] += 1
test_df = test_df.astype('int32')

(2400, 3975)
companion      0
crayon         0
crisp          0
reviewer       0
theme          0
whoa           0
moz            0
offend         0
excelent       0
avocado        0
movie          1
span           0
beyond         0
thorn          0
shell          0
closeup        0
brief          0
research       0
chinese        0
anytime        0
speed          0
engage         0
sound          0
candle         0
piano          0
certainly      0
story          0
transform      0
hackneyed      0
hand           0
              ..
luvs           0
constantly     0
shed           0
mind           0
hour           0
role           0
fantastic      0
pander         0
direct         0
amazingrge     0
heres          0
luck           0
option         0
eiko           0
mesquite       0
baklava        0
greenstreet    0
nano           0
fantasy        0
snug           0
drink          0
clean          0
format         0
small          0
capability     0
speaker        0
fraction       0
s

### Postprocess with log norm

In [101]:
train_norm = np.log(train_df.values + 1)
test_norm = np.log(test_df.values + 1)
print(train_norm.shape)

(2400, 3975)


### Logistic regression on bag-of-words model

In [109]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0).fit(train_norm, train_labels)
# clf.predict(test_norm)
lr_score = lr.score(test_norm, test_labels)
print(lr_score)

0.8316666666666667


In [110]:
print(lr.coef_)
lr_top_identifiers = []
for idx in np.argsort(lr.coef_)[:100]:
    lr_top_identifiers.append(test_df.columns[idx])
print(lr_top_identifiers)

[[ 0.09273245 -0.13037811  0.41258218 ...  0.26019976 -0.12338114
   0.23647963]]
[Index(['bad', 'not', 'poor', 'worst', 'terrible', 'didnt', 'awful', 'waste',
       'suck', 'disappointment',
       ...
       'awesome', 'fantastic', 'best', 'amaze', 'good', 'nice', 'excellent',
       'delicious', 'love', 'great'],
      dtype='object', length=3975)]


### Multinomial naive bayes on bag-of-words model

In [111]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB().fit(train_norm, train_labels)
mnb_score = mnb.score(test_norm, test_labels)
print(mnb_score)

mnb_top_identifiers = []
for idx in np.argsort(mnb.coef_)[:100]:
    mnb_top_identifiers.append(test_df.columns[idx])
print(mnb_top_identifiers)

0.825
[Index(['gore', 'z', 'badwellits', 'boob', 'québec', 'lugosi', 'reflect',
       'stereotypically', 'carlys', 'sorry',
       ...
       'well', 'one', 'movie', 'love', 'work', 'but', 'phone', 'film', 'good',
       'great'],
      dtype='object', length=3975)]


### Classification on 2-gram model

In [113]:
gram_set = set()
for comment in train_comments:
    for i in range(1, len(comment)):
        gram_set.add(comment[i-1] + " " + comment[i])
print(len(gram_set))    

11654


In [114]:
gram_train_f = np.zeros((len(train_comments), len(gram_set)))
gram_test_f = np.zeros((len(test_comments), len(gram_set)))

gram_train_df = pd.DataFrame(gram_train_f, columns=gram_set)
print(gram_train_df.shape)

for i, comment in enumerate(train_comments):
    for j in range(1, len(comment)):
        gram = comment[j-1] + " " + comment[j]
        gram_train_df.iloc[i][gram] += 1
gram_train_df = gram_train_df.astype('int32')
print(gram_train_df.iloc[0], gram_train_df.iloc[1])

gram_test_df = pd.DataFrame(gram_test_f, columns=gram_set)
print(gram_test_df.shape)
for i, comment in enumerate(test_comments):
    for j in range(len(comment)):
        gram = comment[j-1] + " " + comment[j]
        if gram in gram_test_df.columns:
            gram_test_df.iloc[i][gram] += 1
gram_test_df = gram_test_df.astype('int32')

(2400, 11654)
part near           0
drain 50            0
shoot use           0
ago like            0
think heist         0
totally burger      0
reduction wine      0
actingwise wasnt    0
william john        0
great value         0
price anyways       0
food meatloaf       0
half lose           0
youll thin          0
like cover          0
stagy attention     0
paradise work       0
make little         0
make grime          0
spacek great        0
horrible pull       0
assistant bakery    0
like live           0
find product        0
bar cant            0
food us             0
leave hungry        0
doesnt asia         0
whine hour          0
use documentary     0
                   ..
every worth         0
wall watch          0
wire long           0
minute important    0
camera balance      0
rochonwas but       0
phone razr          0
time monster        0
may go              0
color costume       0
eye black           0
curry dish          0
care interest       0
even car          

In [115]:
gram_train_norm = np.log(gram_train_df.values + 1)
gram_test_norm = np.log(gram_test_df.values + 1)
print(gram_train_norm.shape)

gram_lr = LogisticRegression(random_state=0).fit(gram_train_norm, train_labels)
gram_lr_score = gram_lr.score(gram_test_norm, test_labels)
print(gram_lr_score)

gram_lr_top_identifiers = []
for idx in np.argsort(gram_lr.coef_)[:100]:
    gram_lr_top_identifiers.append(gram_test_df.columns[idx])
print(gram_lr_top_identifiers)

gram_mnb = MultinomialNB().fit(gram_train_norm, train_labels)
gram_mnb_score = gram_mnb.score(gram_test_norm, test_labels)
print(gram_mnb_score)

gram_mnb_top_identifiers = []
for idx in np.argsort(gram_mnb.coef_)[:100]:
    gram_mnb_top_identifiers.append(gram_test_df.columns[idx])
print(gram_mnb_top_identifiers)

(2400, 11654)
0.605
[Index(['waste time', 'not good', 'phone not', 'ever worst', 'dont go',
       'service bad', 'but place', 'not recommend', 'not back', 'minutes get',
       ...
       'place love', 'waitress friendly', 'ever best', 'price great',
       'give 10', 'recommend highly', 'phone great', 'great film',
       'great work', 'great food'],
      dtype='object', length=11654)]
0.6083333333333333
[Index(['part near', 'turn definitely', 'would probably', 'time warmer',
       'publicly figure', 'obviously customer', 'place anyone', 'help forth',
       'unfortunately high', 'steep worth',
       ...
       'give 10', 'price great', 'dont but', 'recommend highly', 'ever best',
       'but work', 'phone great', 'great work', 'great food', 'great film'],
      dtype='object', length=11654)]


### Classification with PCA

In [131]:
from sklearn.decomposition import PCA

def reconstruct(org, n):
    u, s, vt = np.linalg.svd(org, full_matrices=False)
    return np.dot(np.dot(u[:, :n], np.diag(s)[:n, :n]), vt[:n, :])
    
# u, s, vt = np.linalg.svd(train_norm, full_matrices=False)
# print(np.dot(np.dot(u, np.diag(s)), vt))
# print(train_norm)
# print(PCA(n_components=100).fit_transform(train_norm))

train_r = []
test_r = []

train_r.extend((reconstruct(train_norm, 10), 
               reconstruct(train_norm, 50), 
               reconstruct(train_norm, 100), 
               reconstruct(gram_train_norm, 10),
               reconstruct(gram_train_norm, 50), 
               reconstruct(gram_train_norm, 100)))
test_r.extend((reconstruct(test_norm, 10), 
              reconstruct(test_norm, 50), 
              reconstruct(test_norm, 100), 
              reconstruct(gram_test_norm, 10),
              reconstruct(gram_test_norm, 50), 
              reconstruct(gram_test_norm, 100)))

# u, s, vt = np.linalg.svd(train_norm)
# train_r10 = np.dot(np.dot(u[:, :10], np.diag(s)[:10, :10]), vt[:10,:])
# train_r50 = np.dot(np.dot(u[:, :50], np.diag(s)[:50, :50]), vt[:50,:])
# train_r100 = np.dot(np.dot(u[:, :100], np.diag(s)[:100, :100]), vt[:100,:])

# gram_u, gram_s, gram_vt = np.linalg.svd(gram_train_norm)
# gram_train_r10 = np.dot(np.dot(gram_u[:, :10], np.diag(gram_s)[:10, :10]), gram_vt[:10,:])
# gram_train_r50 = np.dot(np.dot(gram_u[:, :50], np.diag(gram_s)[:50, :50]), gram_vt[:50,:])
# gram_train_r100 = np.dot(np.dot(gram_u[:, :100], np.diag(gram_s)[:100, :100]), gram_vt[:100,:])

In [133]:
def classify(model, train_x, train_y, test_x, test_y, test_df):
    if model == "lr":
        print("lr")
        clf = LogisticRegression(random_state=0).fit(train_x, train_y)
    if model == "mnb":
        print("mnb")
        clf = MultinomialNB().fit(train_x, train_y)
    score = clf.score(test_x, test_y)
    print(score)

    top_identifiers = []
    for idx in np.argsort(clf.coef_)[:100]:
        top_identifiers.append(test_df.columns[idx])
    print(top_identifiers)

test_dfs = [test_df, gram_test_df]
for i in range(len(train_r)):
    if i < 3:
        a = 0
    else:
        a = 1
    classify("lr", train_r[i], train_labels, test_r[i], test_labels, test_dfs[a])
    classify("mnb", train_r[i]+1, train_labels, test_r[i]+1, test_labels, test_dfs[a])

lr
0.6266666666666667
[Index(['not', 'time', 'go', 'but', 'back', 'dont', 'would', 'get', 'bad',
       'even',
       ...
       'quality', 'product', 'really', 'well', 'price', 'food', 'film', 'work',
       'good', 'great'],
      dtype='object', length=3975)]
mnb
0.6066666666666667
[Index(['communications', 'communicate', 'holster', 'selfdiscovery', 'voyage',
       'essentially', 'items', 'distract', 'photo', 'ad',
       ...
       'make', 'food', 'one', 'movie', 'but', 'phone', 'film', 'work', 'good',
       'great'],
      dtype='object', length=3975)]
lr
0.685
[Index(['bad', 'not', 'dont', 'no', 'would', 'much', 'waste', 'ever', 'minutes',
       'disappoint',
       ...
       'product', 'excellent', 'well', 'recommend', 'best', 'good', 'nice',
       'price', 'love', 'great'],
      dtype='object', length=3975)]
mnb
0.665
[Index(['reccommend', 'uneasy', 'veal', 'girlfriends', 'badwellits', 'edible',
       'dialogs', 'shallow', 'insincere', 'victor',
       ...
       'well'