# Polarizing Words

In [1]:
import pickle
import dill
import pandas as pd
from heapq import nlargest, nsmallest
import matplotlib.pyplot as plt

In [2]:
def most_polarizing_words(model, countVectorizer, n):
    """Print and return the n words with the largest and smallest coefficients.

    Parameters
    ----------
    model : object exposing a 2-D ``coef_`` array (rows x features).
        If ``coef_`` has a single row, that row is ranked; otherwise row 1
        is ranked.
    countVectorizer : object exposing ``get_feature_names_out()`` or the
        legacy ``get_feature_names()``; entry ``i`` of the returned sequence
        is used as the word for coefficient ``i``.
    n : int
        Number of words to report at each end of the ranking.

    Returns
    -------
    (strongest_positive, strongest_negative) : tuple of two lists
        Each list holds ``(word, coefficient)`` pairs. The first is ordered
        from the largest coefficient downwards, the second from the smallest
        coefficient upwards.
    """
    print('coefficient shape', model.coef_.shape)
    row = 0 if model.coef_.shape[0] == 1 else 1
    class_coefficients = model.coef_[row]
    # Preview the first ten values of the row that is actually ranked;
    # slicing coef_ itself would slice rows and dump the whole matrix.
    print('coefficients', class_coefficients[:10])
    coefficients = list(enumerate(class_coefficients))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back so older vectorizers keep working.
    if hasattr(countVectorizer, 'get_feature_names_out'):
        id2word = countVectorizer.get_feature_names_out()
    else:
        id2word = countVectorizer.get_feature_names()

    def by_coefficient(pair):
        return pair[1]

    strongest_positive = [(id2word[i], coef)
                          for i, coef in nlargest(n, coefficients, key=by_coefficient)]
    strongest_negative = [(id2word[i], coef)
                          for i, coef in nsmallest(n, coefficients, key=by_coefficient)]
    print('\n Most positive words were:')
    for (word, coef) in strongest_positive:
        print('{}: {}'.format(word, coef))
    print('\n Most negative words were:')
    for (word, coef) in strongest_negative:
        print('{}: {}'.format(word, coef))
    return strongest_positive, strongest_negative

In [3]:
# Load the pickled model and the two dill-serialized vectorizer objects.
# NOTE: pickle and dill can execute arbitrary code while deserializing;
# only load these files if they come from a trusted source.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)
# Use context managers so the file handles are closed right after loading.
with open('countVectorizer.pk', 'rb') as f:
    countVectorizer = dill.load(f)
with open('tfidfVectorizer.pk', 'rb') as f:
    tfidfVectorizer = dill.load(f)
print(countVectorizer)
print(tfidfVectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=10000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)
TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)


In [5]:
import math
import numpy as np

# Rank vocabulary words by the ratio of the model's per-class feature
# log-probabilities (row 0 divided by row 1), keeping only words whose total
# count across classes exceeds `threshold`.
# NOTE(review): assuming feature_log_prob_ holds log-probabilities (negative
# values), a larger ratio means the word is relatively more likely under
# row 1 than under row 0 -- confirm which label each row corresponds to.
n = 20
threshold = 0
counts = np.sum(model.feature_count_, axis=0)
frequent_mask = counts > threshold

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back so older vectorizers keep working.
if hasattr(countVectorizer, 'get_feature_names_out'):
    feature_names = countVectorizer.get_feature_names_out()
else:
    feature_names = countVectorizer.get_feature_names()

id2word = np.array(feature_names)[frequent_mask]
coefficients = (model.feature_log_prob_[0] / model.feature_log_prob_[1])[frequent_mask]
# Ascending by ratio; sorted() is stable, so ties keep vocabulary order.
sorted_coef = sorted(zip(id2word, coefficients), key=lambda x: x[1])

# Print the n largest ratios (largest first), then display the n smallest.
for word in sorted_coef[-n:][::-1]:
    print(word)
sorted_coef[:n]

('india', 1.8572213605885772)
('modi', 1.698891131513202)
('delhi', 1.6528593577811521)
('minister', 1.6382838348175985)
('indiafightscorona', 1.6245966578459727)
('total', 1.602195243876359)
('pakistan', 1.5973227569728095)
('washyourhands', 1.5930055454912195)
('indians', 1.585537999142344)
('deaths', 1.5681793789843586)
('against', 1.5654016285583336)
('coronavid19', 1.5520839635756414)
('coronavirusoutbreak', 1.5487192837681087)
('religion', 1.5451295944119556)
('nigeria', 1.5412998278420997)
('coronavirusupdates', 1.5314208643971856)
('maharashtra', 1.529854639434887)
('muslims', 1.5289848930921794)
('govt', 1.5257559003217753)
('hai', 1.5210578695352168)


[('ima', 0.6168695580738436),
 ('bitch', 0.6214128806929923),
 ('shit', 0.6431218727875281),
 ('niggas', 0.645962809317269),
 ('bday', 0.6501839164502041),
 ('tattoo', 0.6542194944724838),
 ('fuck', 0.6583893750551915),
 ('im', 0.6607261573393413),
 ('summer', 0.6619628791609673),
 ('ruining', 0.662686771177112),
 ('allergies', 0.6629529723451634),
 ('cardi', 0.664393537029292),
 ('imma', 0.6657470976544845),
 ('nigga', 0.6661107583072404),
 ('ruined', 0.6703907990363203),
 ('dawg', 0.6720469169750934),
 ('homies', 0.6736999025348349),
 ('tryna', 0.6763186827998814),
 ('birthday', 0.6766771450440043),
 ('aint', 0.6766789944374383)]

# Analysis using model

In [None]:

# store vectorizers and model
# Context managers ensure every file is flushed and closed before it is
# read back below.
with open('countVectorizer.pk', 'wb') as f:
    dill.dump(countVectorizer, f)
with open('tfidfVectorizer.pk', 'wb') as f:
    dill.dump(tfidfVectorizer, f)
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# load vectorizers and model
# NOTE: pickle and dill can execute arbitrary code while deserializing;
# only load files that come from a trusted source.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)
with open('countVectorizer.pk', 'rb') as f:
    countVectorizer = dill.load(f)
with open('tfidfVectorizer.pk', 'rb') as f:
    tfidfVectorizer = dill.load(f)