In [3]:
import re
import numpy as np
from common import utils
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import *

In [2]:
#from author who provided us with dataset: import data, show some examples
import _pickle as cPickle

if __name__ == '__main__':
    [lib, con, neutral] = cPickle.load(open('ibcData.pkl', 'rb'))

    # how to access sentence text
    print('Liberal examples (out of ', len(lib), ' sentences): ')
    for tree in lib[0:5]:
        print(tree.get_words())

    print('\nConservative examples (out of ', len(con), ' sentences): ')
    for tree in con[0:5]:
        print(tree.get_words())

    print('\nNeutral examples (out of ', len(neutral), ' sentences): ')
    for tree in neutral[0:5]:
        print(tree.get_words())

    # how to access phrase labels for a particular tree
    ex_tree = lib[0]

    print('\nPhrase labels for one tree: ')

    # see treeUtil.py for the tree class definition
    for node in ex_tree:

        # remember, only certain nodes have labels (see paper for details)
        if hasattr(node, 'label'):
            print(node.label, ': ', node.get_words())

Liberal examples (out of  2025  sentences): 
Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most .
Because it would not be worthwhile to bring a case for $ 30.22 , the arbitration clause would , as a practical matter , deny the Concepcions any relief and , more important , eliminate a class action that might punish AT&T for its pattern of fraudulent behavior .
Indeed , Lind argues that high profits and high wages reinforce each other because workers then have the wherewithal to buy the products they are making .
In fairness , it should be noted that he devotes an entire chapter to New York Times political columnist Maureen Dowd , a liberal who makes much of the outsized rivalries , jealousies , and personalities that dominate American politics .
Psychological tactics are social control techniques that operate at the level of the mind , with th

In [3]:
#two arrays - sentences and lib/con/neu labels

sentences = []
labels = []

for i in range(len(lib)):
    sentences.append(lib[i].get_words())
    labels.append('liberal')
    
for i in range(len(con)):
    sentences.append(con[i].get_words())
    labels.append('conservative')
    
for i in range(len(neutral)):
    sentences.append(neutral[i].get_words())
    labels.append('neutral')

In [4]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, labels, random_state=2)
print(len(train_sentences))
print(len(test_sentences))

3244
1082


In [5]:
cv = CountVectorizer(min_df=10, ngram_range=(0,3))
cv_train_sentences = cv.fit_transform(train_sentences)
cv_test_sentences = cv.transform(test_sentences)

In [6]:
tv = TfidfVectorizer()
tv_train_sentences = tv.fit_transform(train_sentences)
tv_test_sentences = tv.transform(test_sentences)

In [7]:
lr = LogisticRegression()
lr.fit(cv_train_sentences, train_labels)
preds = lr.predict(cv_test_sentences)

In [8]:
lr2 = LogisticRegression()
lr2.fit(tv_train_sentences, train_labels)
preds2 = lr2.predict(tv_test_sentences)

In [9]:
mnb = MultinomialNB(.001)
mnb.fit(cv_train_sentences, train_labels)
preds3 = mnb.predict(cv_test_sentences)

In [10]:
mnb2 = MultinomialNB(.001)
mnb2.fit(tv_train_sentences, train_labels)
preds4 = mnb2.predict(tv_test_sentences)

In [13]:
parameters = {'alpha':[.0001,.001,.01,.1,1,10,100]}
parameters2 = {'C':[1, 10, 100]}
mnb = MultinomialNB()
lr = LogisticRegression()
gs_mnb = GridSearchCV(mnb, parameters)
gs_mnb.fit(cv_train_sentences, train_labels)
#print(gs_mnb.cv_results_)
print(gs_mnb.best_params_)
gs_lr = GridSearchCV(lr, parameters2)
gs_lr.fit(cv_train_sentences, train_labels)
#print(gs_lr.cv_results_)
print(gs_lr.best_params_)

{'alpha': 1}
{'C': 1}


In [12]:
print(classification_report(test_labels, preds))
print(classification_report(test_labels, preds2))
print(classification_report(test_labels, preds3))
print(classification_report(test_labels, preds4))

              precision    recall  f1-score   support

conservative       0.51      0.54      0.52       424
     liberal       0.61      0.61      0.61       526
     neutral       0.23      0.18      0.20       132

 avg / total       0.52      0.53      0.52      1082

              precision    recall  f1-score   support

conservative       0.56      0.53      0.55       424
     liberal       0.59      0.76      0.66       526
     neutral       0.33      0.01      0.01       132

 avg / total       0.55      0.58      0.54      1082

              precision    recall  f1-score   support

conservative       0.53      0.55      0.54       424
     liberal       0.64      0.63      0.63       526
     neutral       0.30      0.26      0.28       132

 avg / total       0.55      0.56      0.55      1082

              precision    recall  f1-score   support

conservative       0.51      0.50      0.51       424
     liberal       0.57      0.68      0.62       526
     neutral      

In [8]:
cv_featurenames = cv.get_feature_names()

largestWeightedWords_dem = []
largestWeightedWords_rep = []

weightIndeces_con = np.argsort(lr.coef_[0])[-10:]
weightIndeces_lib = np.argsort(lr.coef_[1])[-10:]
weightIndeces_neu = np.argsort(lr.coef_[2])[-10:]

print('Top 10 features for conservative:')
for index in weightIndeces_con: 
    print(cv_featurenames[index])

print('\nTop 10 features for liberal:')
for index in weightIndeces_lib: 
    print(cv_featurenames[index])
    
print('\nTop 10 features for neutral:')
for index in weightIndeces_neu: 
    print(cv_featurenames[index])

Top 10 features for conservative:
liberal
payments
we have
morality
businesses
real
agencies
illegal
nuclear
liberals

Top 10 features for liberal:
corporations
republican
bottom
ecological
to protect
ignore
bankers
consumer
way that
have the

Top 10 features for neutral:
hostile
drive
happy
for an
nature
fix
theory
secure
require
found


In [11]:
print(train_sentences[1])
print(cv_train_sentences[1])

And while multiple embryo transfer is no more likely to result in a baby for women under 36 -- it 's just more likely to result in multiples -- for older women it raises what can be pretty bleak odds , making the client happy and bolstering the clinic 's success rates .
  (0, 0)	47
  (0, 105)	2
  (0, 148)	1
  (0, 236)	1
  (0, 331)	1
  (0, 332)	1
  (0, 767)	2
  (0, 894)	1
  (0, 988)	2
  (0, 1076)	1
  (0, 1080)	1
  (0, 1088)	2
  (0, 1112)	1
  (0, 1184)	2
  (0, 1185)	2
  (0, 1232)	1
  (0, 1293)	2
  (0, 1296)	2
  (0, 1340)	1
  (0, 1429)	1
  (0, 1580)	1
  (0, 1661)	1
  (0, 1725)	2
  (0, 1726)	2
  (0, 1899)	1
  (0, 1991)	2
  (0, 2187)	2
  (0, 2289)	1
  (0, 2400)	1
  (0, 2419)	1
  (0, 2452)	2
