In [1]:
import glob
import re
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import *
import time
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from scipy import sparse
from sklearn import metrics
import _pickle as cPickle
import pandas as pd
import matplotlib.pyplot as plt
import csv



In [39]:
def preprocess(sentence):
    #take out "\n", which snuck its way into every downloaded sentence
    sentence = re.sub('\n', '', sentence)
    sentence = re.sub(r"[\W+_]", ' ', sentence)
    sentence = re.sub(r"[^a-zA-Z0-9]", ' ', sentence)
    #limit sentence to 100 words
    if len(sentence.split(' ')) > 100:
        sentence = ' '.join(sentence.split(' ')[0:99])
    if len(sentence.split()) > 2:
        return sentence
    else:
        return None

In [40]:
#neutral, liberal, conservative
sentences_separate = [[], []]
folders = ['../../reddit/lib_posts/', '../../reddit/con_posts/']
article_lengths = []

for i in range(2):
    folder = folders[i]
    
    #max number of sentences in article
    article_length_limit = 10000
    
    count = 1
    for fname in glob.iglob(folder+'*.txt', recursive=True):
        
        article_length = 0
        for line in open(fname):
            for s in line.split('.'):
                s = preprocess(s)
                if s is not None:
                    sentences_separate[i].append(s)
                    count += 1
                    article_length += 1
        article_lengths.append(article_length)

In [41]:
lib, con = sentences_separate[0], sentences_separate[1]
print(len(lib))
print(len(con))

98966
285676


In [42]:
np.random.seed(0)
shuffle=np.random.permutation(np.arange(20000))
con_short = list(np.array(con)[shuffle])
lib_short = list(np.array(lib)[shuffle])
len(lib_short)
len(con_short)

20000

In [43]:
sentences = []
labels = []

for i in range(len(lib_short)):
    sentences.append(lib_short[i])
    labels.append('liberal')
    
for i in range(len(con_short)):
    sentences.append(con_short[i])
    labels.append('conservative')
    
#for i in range(len(neu)):
#    sentences.append(neu[i])
#    labels.append('neutral')

In [44]:
lens = [len(x) for x in sentences]
print(min(lens))
print(max(lens))

6
626


In [45]:
len(sentences)

40000

## Split into training/test set, vectorize, and run LR model

In [46]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, labels, random_state=2)
print(len(train_sentences))
print(len(test_sentences))

30000
10000


In [47]:
train_labels[2432]

'liberal'

In [48]:
cv = CountVectorizer(min_df=5, ngram_range=(0,2))
cv_train_sentences = cv.fit_transform(train_sentences)
cv_test_sentences = cv.transform(test_sentences)
print(len(cv.get_feature_names()))

23746


In [49]:
for C in [.5]:
    start = time.time()
    lr_base = LogisticRegression(C = C)
    lr_base.fit(cv_train_sentences, train_labels)
    preds = lr_base.predict(cv_test_sentences)
    print(C)
    print(classification_report(test_labels, preds))
    print("\nf1=score: "+str(metrics.f1_score(test_labels, preds, average='weighted')))
    print((time.time()-start), 's')

0.5
              precision    recall  f1-score   support

conservative       0.67      0.67      0.67      4995
     liberal       0.67      0.66      0.67      5005

 avg / total       0.67      0.67      0.67     10000


f1=score: 0.666598879774
2.2331795692443848 s


## Examine model weights, scores, etc.

In [16]:
cv_featurenames = cv.get_feature_names()



weightIndeces_con = np.argsort(lr_base.coef_[0])[-20:]
weightIndeces_lib = np.argsort(lr_base.coef_[0])[0:20]
#weightIndeces_neu = np.argsort(lr_base.coef_[2])[-20:]
#weightIndeces_neu_neg = np.argsort(lr_base.coef_[2])[0:20]

print('Top 10 features for liberal:')
for index in weightIndeces_con: 
    print(cv_featurenames[index])

print('\nTop 10 features for conservative:')
for index in weightIndeces_lib:
    print(cv_featurenames[index])
    


Top 10 features for liberal:
asthma
alec
raphel
nader
stein
slager
march 2018
rsvp
tigerswan
arendt
deace
coon
walden
bundy
flint
marijuana
maher
irv
ailes
kobach

Top 10 features for conservative:
ddt
tb
venezuela
rahami
patman
law number
malaria
beck
espn
mattis
boehner
merkel
haley
lopez
fillon
kasparov
wnd
caroline
kashiwagi
authoritarians


## attempting to apply model to IBC data

In [51]:
ibc_frame=pd.read_csv('../full_ibc_sentences.csv')


In [52]:
ibc_frame.head()

Unnamed: 0,label,sentence
0,Liberal,Forcing middle-class workers to bear a greater...
1,Liberal,Because it would not be worthwhile to bring a ...
2,Liberal,"Indeed , Lind argues that high profits and hig..."
3,Liberal,"In fairness , it should be noted that he devot..."
4,Liberal,Psychological tactics are social control techn...


In [53]:
ibc_frame = ibc_frame[ibc_frame['label']!='Neutral']

In [54]:
ibc_frame.groupby('label').count()

Unnamed: 0_level_0,sentence
label,Unnamed: 1_level_1
Conservative,1701
Liberal,2025


In [55]:
ibc_labels = list(ibc_frame['label'])
ibc_labels = [label.lower() for label in ibc_labels]
ibc_sentences = list(ibc_frame['sentence'])
#ibc_sentences = [preprocess(sentence) for sentence in ibc_sentences]
cv_ibc_sentences = cv.transform(ibc_sentences)
print(len(ibc_labels))
print(len(ibc_sentences))

3726
3726


In [56]:
IBC_preds = lr_base.predict(cv_ibc_sentences)
print(classification_report(ibc_labels, IBC_preds))
print("\nf1=score: "+str(metrics.f1_score(ibc_labels, IBC_preds, average='weighted')))
print(sum(IBC_preds==ibc_labels)/len(ibc_labels))

              precision    recall  f1-score   support

conservative       0.50      0.49      0.49      1701
     liberal       0.58      0.60      0.59      2025

 avg / total       0.54      0.55      0.54      3726


f1=score: 0.544652749084
0.545356951154


In [74]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(ibc_sentences, ibc_labels, random_state=1)
print(len(train_sentences))
print(len(test_sentences))

2794
932


In [75]:
#IBC itself
cv = CountVectorizer(min_df=2, ngram_range=(0,1))
cv_train_sentences = cv.fit_transform(train_sentences)
cv_test_sentences = cv.transform(test_sentences)
print(len(cv.get_feature_names()))

6023


In [76]:

start = time.time()
lr_base = LogisticRegression()
lr_base.fit(cv_train_sentences, train_labels)
preds = lr_base.predict(cv_test_sentences)
print(C)
print(classification_report(test_labels, preds))
print("\nf1=score: "+str(metrics.f1_score(test_labels, preds, average='weighted')))
print((time.time()-start), 's')

0.5
              precision    recall  f1-score   support

conservative       0.57      0.55      0.56       420
     liberal       0.64      0.66      0.65       512

 avg / total       0.61      0.61      0.61       932


f1=score: 0.606504277422
0.06196928024291992 s


In [77]:
np.mean(preds==test_labels)

0.60729613733905574