# Code setup same as "final_project" notebook except limited to Unigrams. Jump to "Explore Top Words" section for new code.

In [None]:
# import nltk library
import nltk; nltk.download('punkt')
from nltk import sent_tokenize, pos_tag
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordTokenizer
nltk.download('averaged_perceptron_tagger')

# import stopword libraries
nltk.download('stopwords'); from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

# import other libraries
import pandas as pd
import numpy as np
import string 
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
#from sklearn.grid_search import GridSearchCV
from sklearn.metrics import *
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools


# import clustering library
from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import cosine_distance

from sklearn.cluster import KMeans


# import word embedding library
import glove_helper

# import helper libraries
import collections
from common import utils, vocabulary

[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /content/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


ModuleNotFoundError: ignored

## Load data

load sample data of restaurant businesses 

In [None]:
# load data
# The CSV is read from the current working directory; the "text" and "stars"
# columns are the ones pulled out of sample_df later in the notebook.

sample_df=pd.read_csv("restaurants_only_804868.csv")

## Preprocess data

clean text data, recode variables, identify stop words, create train/test sets

### clean text data

In [None]:
# Sample review; it is only referenced by the commented-out demo calls at the end of this cell.
example_text="""Very pleased with the service. Friendly, attentive, and fast. I had vegetable egg rolls and Pad Thai. 
              Pad Thai was exquisite. Not too oily or too dry, just perfect. Just the right amount of food on the plate, 
              the tofu was baked to perfection and made the flavor stand out. The egg rolls were crispy but not over 
              fried and not oily either. Definitely coming back and recommending it to friends."""

# tokenize
def tokenize_text(input_text, stem=False):
    """
    Split one review into word tokens with punctuation removed.

    Args: 
    input_text: a string representing an 
    individual review
    stem: if True, every token is Snowball-stemmed; 
    the default (False) returns the tokens unstemmed
        
    Returns:
    input_tokens: a list containing the tokens of 
    the review in order, with punctuation removed 
    (and stemmed only when stem=True)
        
    """
    translator=str.maketrans('', '', string.punctuation)
    # build the tokenizer once per call instead of once per sentence
    word_tokenizer=TreebankWordTokenizer()
    # the stemmer is only constructed when it is actually going to be used
    stemmer=SnowballStemmer("english", ignore_stopwords=True) if stem else None

    input_tokens=[]

    # Split sentence, then split word
    for sent in sent_tokenize(input_text):
        # punctuation is stripped before the sentence is split into words
        words=word_tokenizer.tokenize(sent.translate(translator))
        if stemmer is not None:
            words=[stemmer.stem(token) for token in words]
        input_tokens+=words

    return input_tokens


# canonicalize
def canonicalize_tokens(input_tokens):
    """
    Hand the tokens of one review to utils.canonicalize_words.

    Args:
    input_tokens: a list containing the tokens 
    of an individual review

    Returns:
    the result of utils.canonicalize_words for those
    tokens, i.e. the canonicalized tokens of the review

    """
    canonical=utils.canonicalize_words(input_tokens)
    return canonical

# return nouns only

def nouns_only(canonical_tokens):
    """
    POS-tag the tokens of one review with nltk's pos_tag.

    NOTE(review): despite the name, nothing is filtered here -- the
    pos_tag result for every token is returned, not only the nouns.

    Args:
    canonical_tokens: a list containing canonicalized
    tokens for an individual review

    Returns:
    the pos_tag output for canonical_tokens
    """
    return pos_tag(canonical_tokens)



# preprocessor 
def preprocessor(raw_text):
    """
    Turn a raw review into a single space-separated string of
    canonicalized tokens.

    Args:
    raw_text: a string representing an
    individual review

    Returns:
    preprocessed_text: the review after tokenize_text and
    canonicalize_tokens, re-joined with single spaces

    """
    # tokenize -> canonicalize -> rejoin, as one pipeline
    return " ".join(canonicalize_tokens(tokenize_text(raw_text)))

# example data
# input_tokens=tokenize_text(example_text)
# print("\nINPUT TOKENS:", input_tokens)

# canonical_tokens=canonicalize_tokens(input_tokens)
# print("\nCANONICAL TOKENS:", canonical_tokens)

# tagged_tokens = nouns_only(canonical_tokens)
# print("\nTAGGED TOKENS:", tagged_tokens)

# preprocessed_text=preprocessor(example_text) 
# print("\nPREPROCESSED TEXT:", preprocessed_text)


INPUT TOKENS: ['Very', 'pleased', 'with', 'the', 'service', 'Friendly', 'attentive', 'and', 'fast', 'I', 'had', 'vegetable', 'egg', 'rolls', 'and', 'Pad', 'Thai', 'Pad', 'Thai', 'was', 'exquisite', 'Not', 'too', 'oily', 'or', 'too', 'dry', 'just', 'perfect', 'Just', 'the', 'right', 'amount', 'of', 'food', 'on', 'the', 'plate', 'the', 'tofu', 'was', 'baked', 'to', 'perfection', 'and', 'made', 'the', 'flavor', 'stand', 'out', 'The', 'egg', 'rolls', 'were', 'crispy', 'but', 'not', 'over', 'fried', 'and', 'not', 'oily', 'either', 'Definitely', 'coming', 'back', 'and', 'recommending', 'it', 'to', 'friends']

CANONICAL TOKENS: ['very', 'pleased', 'with', 'the', 'service', 'friendly', 'attentive', 'and', 'fast', 'i', 'had', 'vegetable', 'egg', 'rolls', 'and', 'pad', 'thai', 'pad', 'thai', 'was', 'exquisite', 'not', 'too', 'oily', 'or', 'too', 'dry', 'just', 'perfect', 'just', 'the', 'right', 'amount', 'of', 'food', 'on', 'the', 'plate', 'the', 'tofu', 'was', 'baked', 'to', 'perfection', 'and

### recode variables

In [None]:
# get reviews, ratings data
# Both lists are taken from the same frame, so text[i] and labels[i] describe the same review.
# NOTE(review): the element types noted below are not checked here -- TODO confirm the CSV has no missing values.
text=sample_df["text"].tolist() # list of strings
labels=sample_df["stars"].tolist() # list of integers

# recode labels: ratings 1,2 vs. 3 vs. 4,5
def trinary_recode(x):
    """
    Collapse a star rating into one of three classes.

    Args:
    x: a star rating

    Returns:
    0 for ratings 1 and 2, 2 for ratings 4 and 5,
    and 1 for anything else (i.e. rating 3)
    """
    if x in (1, 2):
        return 0 # includes ratings 1,2
    if x in (4, 5):
        return 2 # includes ratings 4,5
    return 1 # includes only rating 3

# one recoded class per review, in the same order as labels
recoded_trinary_labels=[trinary_recode(stars) for stars in labels]

# recode labels: ratings 1,2 vs. 4,5 (exclude rating 3)
#filtered_df=sample_df.drop(sample_df[sample_df.stars==3].index)
#filtered_text=filtered_df["text"].tolist() 
#filtered_labels=filtered_df["stars"].tolist() 

#def binary_recode(x):
    #if x==1 or x==2: x=0 # includes ratings 1,2
    #else: x=1 # includes ratings 4,5    
    #return x

#recoded_binary_labels=list(map(binary_recode, filtered_labels))

# spot-check: show the first raw review exactly as loaded
print(text[0])

I usually LOVE hot n juicy, but today's food was just eh. I got a pound of shrimp, with garlic butter sauce, mild heat. I like my shrimps to be fresh; not all mushy and gross. Half of my shrimp was all mushy and I definitely lost my appetite. When my shrimp came in a bag, it was very securely tied at the top with a layer of two bags; that tells me that it's been in that sauce for way too long. I was super disappointed. The waiters we had were super attentive, although one of them seemed to be in a hurry because she kept grabbing the check when we haven't paid yet and asking if the check was ready when clearly we were still eating. Very unhappy with the hot n juicy on eastern, maybe the one on spring mountain is better. I used to like hot n juicy better than boiling crab but now I'm not so sure.


### identify stopwords

In [None]:
# sklearn stopwords (frozenset)
sklearn_stopwords=stop_words.ENGLISH_STOP_WORDS
print("number of sklearn stopwords: %d" %(len(sklearn_stopwords)))

# nltk stopwords (list)
nltk_stopwords=stopwords.words("english")
print("number of nltk stopwords: %d" %(len(nltk_stopwords)))

# combined sklearn, nltk, other stopwords (set): plain union of the two sources ...
total_stopwords=set(sklearn_stopwords).union(nltk_stopwords)

# ... plus the extra tokens listed here
other_stopwords=["DG", "DGDG"]
total_stopwords.update(other_stopwords)

print("number of total stopwords: %d" %(len(total_stopwords)))

number of sklearn stopwords: 318
number of nltk stopwords: 179
number of total stopwords: 380


### create train, test sets

In [None]:
# using all labels
#train_data, test_data, train_labels, test_labels=train_test_split(text, labels, test_size=.2) 

# using recoded labels
# 80/20 split of the reviews against the three-class labels; random_state is
# fixed so the same split is produced on every run.
train_data, test_data, train_labels, test_labels=train_test_split(text, recoded_trinary_labels, test_size=.2,
                                                                   random_state=101)

# examine train, test shapes
# (data and label counts should match pairwise)
print("train, test set size: %d, %d" %(len(train_data), len(test_data))) 
print("train, test label size: %d, %d" %(len(train_labels), len(test_labels)))
print("")

# examine train set examples
# index 1 is an arbitrary example; its label is the recoded class (0, 1 or 2)
print("example:")
print("text: %s" %(train_data[1]))
print("label: %d" %(train_labels[1]))

train, test set size: 643894, 160974
train, test label size: 643894, 160974

example:
text: Took the girl out for a date night last October.  It's not a cheap date, but it was a very nice atmosphere for a date.  The nights were still reasonably warm, so sitting on the patio was lovely.

The salmon appetizer is to die for.  Yes, I'm using that phrase, to die for.  The seared ahi tuna was like butter . . . amazing.

I went back again in February for happy hour, sat at the outdoor bar.  Bartender was impeccable, drinks were great, service fantastic.
label: 2


## Baseline model: Error Analysis

Based on the baseline model, performance is decent overall, but the model has the greatest difficulty classifying the middle category (3-star reviews, recoded as label 1). The model may be improved by examining class imbalance issues, addressing CountVectorizer limitations, etc.

In [None]:
# check class imbalance
print(Counter(train_labels))

# oversampling w/ replacement (minority labels)
# Only the training split is rebalanced; test_data / test_labels are not touched here.
train_df=pd.DataFrame(
    {'train_text': train_data,
     'train_labels': train_labels})

# one frame per class
train_class0=train_df[train_df.train_labels==0]
train_class1=train_df[train_df.train_labels==1]
train_class2=train_df[train_df.train_labels==2]
# Classes 0 and 1 are drawn with replacement up to the size of class 2, which is
# hard-coded as the target size (it is the largest class in the Counter printed above).
# Both draws use the same fixed random_state, so the resampling is repeatable.
train_oversampled_class0=resample(train_class0, replace=True, n_samples=len(train_class2), random_state=12)
train_oversampled_class1=resample(train_class1, replace=True, n_samples=len(train_class2), random_state=12)
# rows end up grouped by class (0, then 1, then 2); they are not shuffled here
train_oversampled_df=pd.concat([train_oversampled_class0, train_oversampled_class1, train_class2])

# get new train data, labels
train_oversampled_data=train_oversampled_df["train_text"].tolist()
train_oversampled_labels=train_oversampled_df["train_labels"].tolist()

# check class balance
print(Counter(train_oversampled_labels))

Counter({2: 419707, 0: 134640, 1: 89547})
Counter({0: 419707, 1: 419707, 2: 419707})


## Final model: Logistic Regression - Unigrams only

using tf-idf, train Logistic Regression model to identify words with largest weights for each rating category

In [None]:
# tf-idf
# The notebook's own preprocessor() is plugged in as the vectorizer's preprocessor,
# and the vocabulary is capped at 10000 features.
#vec=TfidfVectorizer(preprocessor=preprocessor, ngram_range=(1,2), stop_words=total_stopwords, max_features=10000) #1-2grams
vec=TfidfVectorizer(preprocessor=preprocessor, ngram_range=(1,1), stop_words=total_stopwords, max_features=10000) #unigram only
vec_train_data=vec.fit_transform(train_oversampled_data) # training w/ oversampled data
# the test split is only transformed with the vocabulary fitted on the training data
vec_test_data=vec.transform(test_data) 

# train Logistic Regression
logit=LogisticRegression(penalty='l2')
logit.fit(vec_train_data, train_oversampled_labels) # training w/ oversampled data
pred_labels=logit.predict(vec_test_data)
    
# assess model
# scores are computed on the test split, which was not oversampled
f1=f1_score(test_labels, pred_labels, average="weighted") 
accuracy=accuracy_score(test_labels, pred_labels)
confusion=confusion_matrix(test_labels, pred_labels)
print("logistic regression f1 score: %.3f" %(f1))
print("logistic regression accuracy score: %.3f" %(accuracy))
print("logistic regression confusion matrix:")
print(confusion)

logistic regression f1 score: 0.811
logistic regression accuracy score: 0.797
logistic regression confusion matrix:
[[27133  5494  1046]
 [ 4594 13509  4355]
 [ 2735 14499 87609]]


In [None]:
# examine features w/ greatest weights

def top_features(coefs, num_feats, vectorizer=None):
    """
    Print and collect the highest-weighted features of every rating category.

    Args: 
    coefs: array of shape (num_labels, vocab_size)
    num_feats: number of top features
    vectorizer: fitted vectorizer whose feature names line up with the
    columns of coefs; defaults to the notebook-level `vec`
    
    Prints:
    top num_feats features with greatest weights by
    rating category, one "feature weight" pair per line
    
    Returns:
    feats: one list of feature names per rating category,
    ordered from lowest to highest weight
    """   
    if vectorizer is None:
        vectorizer=vec

    # Fetch the feature names once: each call rebuilds the whole list, so doing
    # it inside the loops repeats that work for every printed feature.
    # get_feature_names_out is preferred where it exists because newer sklearn
    # releases no longer provide get_feature_names.
    get_names=getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feat_names=[str(name) for name in get_names()]

    coefs=np.asarray(coefs)
    # clamp so that num_feats=0 yields nothing (a bare [-0:] slice would keep everything)
    n_top=min(max(int(num_feats), 0), coefs.shape[1])

    feats=[]
    
    # identify top coefs per rating category
    order=np.argsort(coefs, axis=1)
    top_indices=order[:, order.shape[1]-n_top:] # (num_labels, n_top), ascending weight
    
    # display feature, weight
    for r in range(top_indices.shape[0]):
        label_feats=[]
        print("rating category %d" %(r))
        for idx in top_indices[r]:
            feat=feat_names[idx]
            label_feats.append(feat)
            weight=round(coefs[r, idx], 2)
            print(feat, weight)
        feats.append(label_feats)
        print("")
        
    return feats

# feats[r] holds the 20 top-weighted feature names of rating category r; later cells index into it
feats=top_features(logit.coef_, num_feats=20)

rating category 0
nie 5.01
poor 5.09
bland 5.11
flavorless 5.14
kalt 5.16
inedible 5.19
disappointing 5.19
sick 5.26
unacceptable 5.31
disappointment 5.57
cockroach 5.64
tasteless 5.79
zero 5.85
awful 6.11
disgusting 6.37
waste 6.42
terrible 6.44
horrible 6.85
poisoning 7.39
worst 9.74

rating category 1
necessarily 3.46
good 3.48
average 3.71
allerdings 3.75
insgesamt 3.76
serviceable 3.8
pinch 3.95
alright 4.34
foodwise 4.46
okay 4.53
torn 4.55
stars 4.82
ok 4.93
feelings 5.21
decent 5.34
conflicted 5.41
aokay 6.09
3star 6.97
3stars 8.68
aok 14.7

rating category 2
heaven 5.29
hooked 5.54
heavenly 5.54
love 5.62
disappoint 5.65
wonderful 5.69
gem 5.79
highly 5.8
incredible 5.9
perfectly 5.94
fantastic 6.33
best 6.4
perfect 6.61
awesome 6.71
perfection 6.9
excellent 7.0
great 7.0
exceeded 7.71
amazing 8.11
delicious 8.99



## Explore top words

Use GloVe word embeddings to explore top features

In [None]:
# Install a few python packages using pip
# NOTE(review): utils is already imported in the first cell, so this import is a repeat.
from common import utils
utils.require_package("wget")      # for fetching dataset

In [None]:
# get word embeddings
# 100-dimensional vectors; later cells use hands.W, hands.vocab and hands.get_vector
hands=glove_helper.Hands(ndim=100)

Loading vectors from data/glove/glove.6B.zip
Parsing file: data/glove/glove.6B.zip:glove.6B.100d.txt
Found 400,000 words.
Parsing vectors... Done! (W.shape = (400003, 100))


In [None]:
# compute cosine similarity

def find_nn_cos(v, Wv, k=10):
    """
    Rank the rows of an embedding matrix by cosine similarity to a vector.

    Args:
      v: (d-dimensional vector) word vector of interest
      Wv: (V x d matrix) word embeddings
      k: (int) number of neighbors to return
    
    Returns (nns, similarities), where:
      nns: (k-dimensional vector of int), row indices of nearest neighbors, 
      which may include the given word
      similarities: (k-dimensional vector of float), cosine similarity of each 
      neighbor in nns
      
    """
    v = np.asarray(v)
    Wv = np.asarray(Wv)

    # compute cosine similarity against every row in one shot
    # (one matrix-vector product instead of a Python loop over all V rows).
    # A zero-norm row or vector gives 0/0 = nan; the warning is silenced because
    # argsort places nan last, so such rows simply never rank as neighbors.
    with np.errstate(divide="ignore", invalid="ignore"):
        cosin_sim = Wv.dot(v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis=1))

    # negate so that the largest similarity sorts first
    nns = np.argsort(-cosin_sim)[:k]
    similarities = cosin_sim[nns]

    return (nns, similarities)


# identify nearest neighbors

def show_nns(hands, word, k=5):
    """
    Print the nearest neighbors of a word within the full embedding table.

    Args:
    hands: word embeddings
    word: string (lower-cased before lookup)
    k: number of nearest neighbors

    Prints:
    a header line, then one "similarity : 'word'" line for each
    of the k nearest neighbors, then a blank line

    """
    query=word.lower()
    print("nearest neighbors for '{:s}'".format(query))
    neighbor_ids, neighbor_sims=find_nn_cos(hands.get_vector(query), hands.W, k)
    for word_id, score in zip(neighbor_ids, neighbor_sims):
        print("{:.03f} : '{:s}'".format(score, hands.vocab.id_to_word[word_id]))
    print("")
    

In [None]:
# identify nearest neighbors for every word in feats
# (feats is a list of per-category word lists; walk them as one flat stream)
for word in itertools.chain.from_iterable(feats):
    show_nns(hands, word)
        

## Comparisons using GloVe

In [None]:
## Update these variables to the list(s) of words you're interested in for the following sections
# As written: the last ten top words of rating category 1 are compared against
# the top words of rating category 2.
words_of_interest = feats[1][10:] # words you want to compare
compare = feats[2] # words you want to compare them to
# NOTE(review): the bare expression below is not the last line of the cell, so it displays nothing.
compare
# compare = ['torn', 'stars', 'ok', 'feelings', 'decent', 'conflicted', 'okay', '3star', '3stars', 'aok']
# compare = ['never', 'poor', 'bland', 'flavorless', 'cold', 'inedible', 'disappointing', 'sick',  'unacceptable', 
#            'disappointment', 'cockroach','tasteless', 'zero', 'awful', 'disgusting', 'waste', 'terrible', 'horrible',
#            'poisoning', 'worst']

# Pull vectors and IDs for the compare words from hands (no need to update)
# compare_vectors has one row per entry of compare, in the same order
compare_vectors = np.array([hands.get_vector(word) for word in compare])
compare_ids =hands.vocab.words_to_ids(compare)

In [None]:
# find nns given a comparison list

def show_nns_compare_list(hands_words, hands_vectors, word, k=5):
    """
    Print the entries of a chosen word list that lie closest to a word.

    Args:
    hands_words: list of the words being investigated (strings)
    hands_vectors: word embeddings (vectors only) of that list; row i
    is looked up as hands_words[i]
    word: string (lower-cased before lookup)
    k: number of nearest neighbors

    Prints:
    k nearest neighbors of specified word in a specified list

    Note: the query vector itself comes from the notebook-level `hands`.

    """
    query=word.lower()
    print("nearest neighbors for '{:s}'".format(query))
    neighbor_rows, neighbor_sims=find_nn_cos(hands.get_vector(query), hands_vectors, k)
    for row, score in zip(neighbor_rows, neighbor_sims):
        print("{:.03f} : '{:s}'".format(score, hands_words[row]))
    print("")
    

In [None]:
## Compare a list of words to another list of words - find nearest neighbors
# each word of interest is ranked against the `compare` list only (default k=5 neighbors)
for word in words_of_interest:
    show_nns_compare_list(compare, compare_vectors, word)

nearest neighbors for 'nie'
1.000 : 'nie'
0.199 : 'kalt'
0.147 : 'cockroach'
0.135 : 'flavorless'
0.083 : 'inedible'

nearest neighbors for 'poor'
1.000 : 'poor'
0.569 : 'worst'
0.569 : 'sick'
0.498 : 'disappointing'
0.457 : 'terrible'

nearest neighbors for 'bland'
1.000 : 'bland'
0.457 : 'tasteless'
0.415 : 'disgusting'
0.402 : 'awful'
0.307 : 'poor'

nearest neighbors for 'flavorless'
1.000 : 'flavorless'
0.529 : 'inedible'
0.461 : 'tasteless'
0.302 : 'bland'
0.198 : 'disgusting'

nearest neighbors for 'kalt'
1.000 : 'kalt'
0.199 : 'nie'
0.147 : 'flavorless'
0.107 : 'inedible'
0.082 : 'tasteless'

nearest neighbors for 'inedible'
1.000 : 'inedible'
0.529 : 'flavorless'
0.480 : 'tasteless'
0.316 : 'disgusting'
0.301 : 'bland'

nearest neighbors for 'disappointing'
1.000 : 'disappointing'
0.621 : 'disappointment'
0.568 : 'worst'
0.517 : 'awful'
0.498 : 'poor'

nearest neighbors for 'sick'
1.000 : 'sick'
0.569 : 'poor'
0.514 : 'terrible'
0.494 : 'awful'
0.490 : 'horrible'

nearest neig

In [None]:
def similar_dissimilar_words(hands_words, Wv):
    """
    Find the most and the least similar pair of words in a list.

    Args:
      hands_words: list of the words being investigated (strings)
      Wv: (V x d matrix) word embeddings, row i belonging to hands_words[i]
    
    Returns (most_similar_words, most_similar_score), (least_similar_words, least_similar_score)
      where each *_words is a 2-tuple of words and each *_score is the
      cosine similarity of that pair
      
    Raises:
      ValueError: if hands_words and Wv differ in length, or fewer than
      two words are given (no pair can be formed)
      
    """
    Wv = np.asarray(Wv)
    if len(hands_words) != Wv.shape[0]:
        raise ValueError("hands_words and Wv must have the same number of entries")
    if len(hands_words) < 2:
        raise ValueError("need at least two words to form a pair")

    # each vector's norm is needed in many pairs; compute it once
    norms = np.linalg.norm(Wv, axis=1)

    # create each combination of 2 words, as index pairs, so that the vectors
    # and the words of a pair can never get out of step
    pairs = list(itertools.combinations(range(len(hands_words)), 2))

    #calculate cosine similarity for each combo of 2 words
    cosin_sim = np.array([np.dot(Wv[i], Wv[j]) / (norms[i] * norms[j]) for i, j in pairs])

    #identify most similar (highest cosine similarity) and dissimilar (lowest cosine similarity)
    # Take element [0] before int(): converting a 1-element array to int is
    # deprecated in recent NumPy. argsort also keeps nan scores at the end.
    most_similar_idx = int(np.argsort(-cosin_sim)[0])
    first, second = pairs[most_similar_idx]
    most_similar_words = (hands_words[first], hands_words[second])
    most_similar_score = cosin_sim[most_similar_idx]

    least_similar_idx = int(np.argsort(cosin_sim)[0])
    first, second = pairs[least_similar_idx]
    least_similar_words = (hands_words[first], hands_words[second])
    least_similar_score = cosin_sim[least_similar_idx]

    return ((most_similar_words, most_similar_score), (least_similar_words, least_similar_score))

In [None]:
# NOTE(review): the saved output of this cell names words from rating category 1, while
# `compare` is assigned feats[2] above -- presumably left over from an earlier run; re-run to refresh.
print(similar_dissimilar_words(compare, compare_vectors))

((('feelings', '3star'), 0.92197144), (('torn', 'conflicted'), -0.039125863))


In [None]:
def most_unique_words(hands_words, Wv):
    """
    Pick the words of a list that are least like the rest of the list.

    Args:
      hands_words: list of the words being investigated (strings)
      Wv: (V x d matrix) word embeddings, row i belonging to hands_words[i]

    
    Returns (sum_smallest, max_smallest), where:
      sum_smallest: word where the sum of cosine similarity of all words in list is smallest 
      max_smallest: word where the max of all the cosine similarities in the list is smallest
      (similarities to entries spelled the same as the word itself are left out of the max)
      
    Raises:
      ValueError: if hands_words and Wv differ in length, or the list holds
      fewer than two distinct words
      
    """
    Wv = np.asarray(Wv)
    if len(set(hands_words)) < 2:
        raise ValueError("need at least two distinct words to compare")
    if len(hands_words) != Wv.shape[0]:
        raise ValueError("hands_words and Wv must have the same number of entries")

    # full pairwise cosine-similarity table, computed once and shared by both measures
    norms = np.linalg.norm(Wv, axis=1)
    sims = Wv.dot(Wv.T) / np.outer(norms, norms)

    cosin_sim = dict()        # word -> sum of its similarities (self included)
    cosin_sim_maxes = dict()  # word -> largest similarity to any other word
    for i, word in enumerate(hands_words):
        cosin_sim[word] = sims[i].sum()
        cosin_sim_maxes[word] = max(sims[i, r] for r, other in enumerate(hands_words) if other != word)

    # sum together all cosine similarities - return word with smallest sum
    sum_smallest = min(cosin_sim, key=cosin_sim.get)

    # take max of all cosine similarities - return word with smallest max 
    max_smallest = min(cosin_sim_maxes, key=cosin_sim_maxes.get)

    return (sum_smallest, max_smallest)

In [None]:
# prints the (sum_smallest, max_smallest) pair for the current `compare` list
print(most_unique_words(compare,compare_vectors))

('exceeded', 'disappoint')


## K-means clustering

In [None]:
# create matrix
def create_mat(words, embeds):
    """
    Stack the embedding vectors of a list of words into one matrix.

    Args:
    words: list of words to cluster
    embeds: word embeddings; must expose .shape (its second entry is the
    embedding dimension). When it also offers get_vector(word) it is used
    for the lookups; otherwise the notebook-level `hands` is used.
    
    Returns:
    mat: (words_len, embed_dim) matrix
    of word embeddings, row r holding the vector of words[r]
    """
    # Look vectors up in the embeddings that were passed in, not a global,
    # falling back to `hands` only for callers that pass a bare matrix.
    source=embeds if hasattr(embeds, "get_vector") else hands

    mat=np.zeros((len(words), embeds.shape[1])) # e.g. (20,100)

    for r, word in enumerate(words):
        mat[r]=source.get_vector(word)
        
    return mat


def show_clusters(words, clusters):
    """
    Print the words of each cluster.

    Args:
    words: list of words that were clustered
    clusters: cluster label of each word, aligned with words

    Prints:
    for every distinct label, in sorted order, a "cluster N" header
    (N counts from 1), the words carrying that label, and a blank line
    """
    # Group by the label itself rather than indexing a list by it, so labels
    # that skip a value (e.g. 0, 2, 3) no longer raise IndexError.
    members={}
    for i, label in enumerate(clusters):
        members.setdefault(label, []).append(words[i])

    for position, label in enumerate(sorted(members)):
        print("cluster %d" %(position+1))
        for w in members[label]:
            print(w)
        print("")

    

In [None]:
#redefining compare to test different things       
#compare = [word for n in range(len(feats)) for word in feats[n]]


In [None]:
# choose # of clusters, create embeddings
num_clusters=4
# one embedding row per word of the current `compare` list; both clustering cells reuse this matrix
word_embeds=create_mat(compare, hands)

In [None]:
# k-means clustering with nltk
import random # only needed here, to seed the clusterer

# A seeded rng fixes the randomly chosen initial means, so the clusters printed
# below are the same on every run instead of changing between executions.
kclusterer=KMeansClusterer(num_clusters, distance=cosine_distance, repeats=300, rng=random.Random(101))
assigned_clusters=kclusterer.cluster(word_embeds, assign_clusters=True)

show_clusters(compare, assigned_clusters)

cluster 1
disappointing
unacceptable
disappointment
terrible

cluster 2
poor
bland
zero
worst

cluster 3
never
cold
sick
awful
disgusting
waste
horrible
poisoning

cluster 4
flavorless
inedible
cockroach
tasteless



In [None]:
# k-means clustering with sklearn

# random_state pins the centroid initialisation so the printed clusters are
# reproducible from run to run.
skmeans_model = KMeans(init='k-means++',n_clusters=num_clusters, n_init=5000, random_state=101)
skmeans_model.fit(word_embeds)

# one label per row of word_embeds, i.e. per word of compare
cluster_labels = skmeans_model.labels_

show_clusters(compare, cluster_labels)

cluster 1
cold
sick
zero
waste
poisoning

cluster 2
unacceptable
awful
disgusting
terrible
horrible

cluster 3
never
poor
disappointing
disappointment
worst

cluster 4
bland
flavorless
inedible
cockroach
tasteless



# POS Tagging

In [None]:
#create dictionary of POS tags in corpus, using nltk's pos_tag 

# tag_dict[word][tag] = number of times `word` received `tag` across all reviews.
# Built through the `collections` module imported in the first cell: the bare
# name `defaultdict` is never imported in this notebook and raised a NameError.
tag_dict = collections.defaultdict(collections.Counter)
for review in text:
    # tag the raw whitespace-split tokens so the tagger sees each word in context
    for raw_token, tag in pos_tag(review.split()):
        input_token = tokenize_text(raw_token)
        # a token made only of punctuation tokenizes to nothing; skip it
        if input_token:
            can_token = canonicalize_tokens(input_token)[0]
            tag_dict[can_token][tag] += 1



In [None]:
def likely_POS(word, tags=None):
    '''Return count-based likely POS from tag_dict
    If word missing from corpus, use pos_tag to tag (out of context)

    Args:
    word: string to look up
    tags: mapping word -> {POS tag: count}; defaults to the notebook-level tag_dict

    Returns:
    the tag with the highest count for word, or, for a word that has
    no entry, the tag pos_tag gives the word on its own
    '''
    if tags is None:
        tags = tag_dict

    # membership test first: indexing a defaultdict would insert an empty entry
    if word in tags:
        counts = tags[word]
        return max(counts, key=counts.get)
    return pos_tag([word])[0][1]

In [None]:
#Return top features that are just a specified part of speech

def top_noun_features(coefs, num_feats, pos_prefixes=("NN",), vectorizer=None):
    """
    Among each rating category's top-weighted features, keep those whose
    likely part of speech matches.

    Args: 
    coefs: array of shape (num_labels, vocab_size)
    num_feats: number of top-weighted features examined per rating
    category (the filtered lists can be shorter than this)
    pos_prefixes: two-letter POS tag prefixes to keep; the default ("NN",)
    keeps nouns, ("VB",) keeps verbs, ("JJ", "RB") keeps adjectives and adverbs
    vectorizer: fitted vectorizer whose feature names line up with the
    columns of coefs; defaults to the notebook-level `vec`
    
    Prints:
    likely POS tag, feature and weight of every kept feature,
    by rating category 
    
    Returns:
    feats: one list of kept feature names per rating category,
    ordered from lowest to highest weight
    """   
    if vectorizer is None:
        vectorizer=vec
    if isinstance(pos_prefixes, str):
        pos_prefixes=(pos_prefixes,) # allow a single prefix such as "VB"

    # Fetch the feature names once instead of once per examined feature;
    # get_feature_names_out is preferred where it exists because newer sklearn
    # releases no longer provide get_feature_names.
    get_names=getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feat_names=[str(name) for name in get_names()]

    coefs=np.asarray(coefs)
    # clamp so that num_feats=0 examines nothing (a bare [-0:] slice would keep everything)
    n_top=min(max(int(num_feats), 0), coefs.shape[1])

    feats=[]
    
    # identify top coefs per rating category
    order=np.argsort(coefs, axis=1)
    top_indices=order[:, order.shape[1]-n_top:] # (num_labels, n_top), ascending weight
    
    # display feature, weight
    for r in range(top_indices.shape[0]):
        label_feats=[]
        print("rating category %d" %(r))
        for idx in top_indices[r]:
            feat=feat_names[idx]
            tag=likely_POS(feat) # tag once, reuse for the filter and the printout
            if tag[0:2] in pos_prefixes:
                label_feats.append(feat)
                weight=round(coefs[r, idx], 2)
                print(tag, feat, weight)
        feats.append(label_feats)
        print("")
        
    return feats

# the 70 top-weighted features of each category are examined; only those that pass the POS filter are kept
noun_feats=top_noun_features(logit.coef_, num_feats=70)

In [None]:
#POS counts using just nltk 

# tag each category's word list in one pos_tag call
tagged_feats = [pos_tag(label_feats) for label_feats in feats]

# keep only the tags, category by category
pos_list = [tag for tagged in tagged_feats for _, tag in tagged]
pos_set = set(pos_list)

# how often each tag occurs among the top features
cnt = Counter(pos_list)

cnt 

In [None]:
# POS counter using revised tag_dict

tagged_feats2 = []
# all top features of all categories as one flat list
flat_feats = [feat for label_feats in feats for feat in label_feats]

# tally the likely POS of every top feature
count2 = Counter(likely_POS(feat) for feat in flat_feats)
    
count2