In [1]:
import os
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import svm
import pandas as pd
from pandas import DataFrame, Series

# Paths are resolved relative to the directory the notebook is launched from.
ROOT_DIR = os.getcwd()
DATA_DIR = os.path.join(ROOT_DIR, 'Raw_debates')
# Directory holding one "<CANDIDATE>.txt" file per candidate (see get_raw_text).
RESULT_DIR = os.path.join(ROOT_DIR, 'candidate_lines')

# Seed numpy's global RNG so the random train/test split below is
# reproducible under "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)

candidates = {'CLINTON', 'SANDERS', 'TRUMP', 'RUBIO', 'CRUZ'}
# Sorted copy: a candidate's position in this list is used as its class label.
cand_list = sorted(candidates)

# English stop words, kept as a set for fast membership tests.
stops = set(stopwords.words("english"))

In [2]:
# Hand-written normalisation table: maps an inflected or derived word to the
# single canonical form it should be counted as. Keys are lower-case because
# the lookup is applied to text that has already been lower-cased.
word_dic = {
    'action': 'act',
    'agreement': 'agree',
    'americans': 'american',
    'asked': 'ask',
    'asking': 'ask',
    'going': 'go',
    'states': 'state',
    'working': 'work',
    'millions': 'million',
    'bringing': 'bring',
    'businesses': 'business',
    'candidates': 'candidate',
    'children': 'child',
    'comes': 'come',
    'coming': 'come',
    'companies': 'company',
    'countries': 'country',
    'deals': 'deal',
    'economic': 'economy',
    'families': 'family',
    'fighting': 'fight',
    'gets': 'get',
    'getting': 'get',
    'goes': 'go',
    'got': 'get',
    'groups': 'group',
    'guns': 'gun',
    'happened': 'happen',
    'happening': 'happen',
    'helped': 'help',
    'issues': 'issue',
    'knows': 'know',
    'laws': 'law',
    'lives': 'live',
    'living': 'live',
    'making': 'make',
    'needs': 'need',
    'passed': 'pass',
    'problems': 'problem',
    'putting': 'put',
    'really': 'real',
    'republicans': 'republican',
    'running': 'run',
    'saying': 'say',
    'said': 'say',
    'seeing': 'see',
    'seen': 'see',
    'started': 'start',
    'supported': 'support',
    'taking': 'take',
    'talked': 'talk',
    'talking': 'talk',
    'terrorists': 'terrorist',
    'terrorism': 'terrorist',
    'things': 'thing',
    'trying': 'try',
    'used': 'use',
    'using': 'use',
    'voted': 'vote',
    'wages': 'wage',
    'wanted': 'want',
    'wants': 'want',
    'building': 'build',
    'called': 'call',
    'came': 'come',
    'communities': 'community',
    'costs': 'cost',
    # Was the misspelled pair 'deffence' -> 'deffend', which could never match
    # a real token; both the British and American spellings are covered now.
    'defence': 'defend',
    'defense': 'defend',
    'difference': 'different',
    'drugs': 'drug',
    'gave': 'give',
    'given': 'give',
    'gone': 'go',
    'higher': 'high',
    'highest': 'high',
    'interests': 'interest',
    'jobs': 'job',
    'longer': 'long',
    'looked': 'look',
    'looking': 'look',
    'lost': 'lose',
    'made': 'make',
    'means': 'mean',
    'paying': 'pay',
    'planned': 'plan',
    'programs': 'program',
    'raising': 'raise',
    'reasons': 'reason',
    'ringing': 'ring',
    'says': 'say',
    'saw': 'see',
    'wealthy': 'wealth',
    'worked': 'work',
    'years': 'year',
}

In [3]:
def get_raw_text(cand_name):
    """Return the full contents of ``<RESULT_DIR>/<cand_name>.txt``.

    The file is opened in binary mode, so line endings are returned exactly
    as stored on disk.
    """
    transcript_path = os.path.join(RESULT_DIR, '{0}.txt'.format(cand_name))
    with open(transcript_path, 'rb') as handle:
        contents = handle.read()
    return contents

def raw_to_blocks(cand_name, raw_text, delimiter='\r\n\r\n', min_len=200):
    """Split a transcript into paragraphs and keep only the long ones.

    Every occurrence of ``"<cand_name>:"`` is stripped from ``raw_text``,
    the remainder is split on ``delimiter``, and only paragraphs strictly
    longer than ``min_len`` characters are returned (in original order).
    """
    speaker_tag = cand_name + ':'
    paragraphs = raw_text.replace(speaker_tag, '').split(delimiter)

    long_paragraphs = []
    for para in paragraphs:
        # Strict comparison: a paragraph of exactly min_len is dropped
        if len(para) > min_len:
            long_paragraphs.append(para)
    return long_paragraphs

# Collapse words that are virtually the same into a single canonical word.
def hard_code_process(text, mapping=None):
    """Replace each space-separated word of ``text`` that is a key of ``mapping``.

    Parameters
    ----------
    text : str
        Words separated by single spaces.
    mapping : dict, optional
        Word -> replacement table. Defaults to the module-level ``word_dic``.

    Returns
    -------
    str
        The rewritten text, padded with one leading and one trailing space
        (the padding matches what this function has always returned).

    Notes
    -----
    Each word is looked up exactly once, so only whole words are replaced
    and a replacement is never itself replaced again. The previous
    implementation used ``str.replace`` on space-padded keys, which skipped
    the second of two adjacent identical words (``"going going"``) because
    the matches would have to share the separating space; it also relied on
    ``dict.iteritems``, which does not exist on Python 3.
    """
    if mapping is None:
        mapping = word_dic

    # Split on a literal space (not arbitrary whitespace) so the spacing of
    # the input is reproduced exactly by the join below.
    tokens = text.split(' ')
    replaced = ' '.join(mapping.get(token, token) for token in tokens)
    return ' {0} '.format(replaced)

def process_paragraph(paragraph):
    """Clean one paragraph into a space-separated string of normalised words.

    Steps: every character that is not an ASCII letter becomes a space, the
    text is lower-cased and split on whitespace, words found in ``stops``
    are dropped, and the remaining words are joined with single spaces and
    passed through ``hard_code_process``.
    """
    # Anything outside a-z / A-Z is turned into a separator
    alpha_only = re.sub("[^a-zA-Z]", " ", paragraph)

    kept = []
    for token in alpha_only.lower().split():
        if token in stops:
            continue
        kept.append(token)

    return hard_code_process(' '.join(kept))

def get_processed_lines(cand_name, delimiter='\r\n\r\n', min_len=200):
    """Load a candidate's transcript and return its cleaned paragraphs.

    The raw text is read with ``get_raw_text``, split into paragraphs longer
    than ``min_len`` by ``raw_to_blocks`` (using ``delimiter``), and each
    paragraph is cleaned with ``process_paragraph``.

    Returns a list, one cleaned string per kept paragraph. A real list is
    built (rather than ``map(...)``, which is a one-shot iterator on
    Python 3) because callers take ``len()`` of the result and iterate over
    it more than once.
    """
    raw_text = get_raw_text(cand_name)
    blocks = raw_to_blocks(cand_name, raw_text,
                           delimiter=delimiter, min_len=min_len)
    return [process_paragraph(block) for block in blocks]

def get_vectorizer(cand_lines, max_features=1000):
    """Fit a bag-of-words ``CountVectorizer`` on every candidate's paragraphs.

    Parameters
    ----------
    cand_lines : dict
        Candidate name -> iterable of cleaned paragraph strings.
    max_features : int
        Passed to ``CountVectorizer(max_features=...)``.

    Returns
    -------
    CountVectorizer
        The fitted vectorizer.
    """
    # Tokenizer, preprocessor and stop_words are left at None because the
    # paragraphs have already been cleaned by process_paragraph.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=max_features)

    # Pool all candidates' paragraphs into one corpus; the candidate names
    # are irrelevant for fitting. ``values()`` works on Python 2 and 3
    # (the former ``iteritems()`` is Python 2 only).
    corpus = []
    for lines in cand_lines.values():
        corpus.extend(lines)

    vectorizer.fit(corpus)
    return vectorizer

In [5]:
# Examples
# Cleaned paragraphs for every candidate, keyed by candidate name.
cand_lines = {c: get_processed_lines(c) for c in candidates}
# Vocabulary fitted on ALL paragraphs (no train/test split in this example).
vectorizer = get_vectorizer(cand_lines, 500)

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() - confirm which scikit-learn version is targeted.
vocab = vectorizer.get_feature_names()
# Dense word-count matrix for Clinton's paragraphs, one row per paragraph.
clinton_train = vectorizer.transform(cand_lines['CLINTON']).toarray()

In [7]:
# Split training data and test data
def test_train_split(cand_lines, test_ratio=0.25):
    cands = cand_lines.keys()
    test_data = dict.fromkeys(cands)
    train_data = dict.fromkeys(cands)
    for c in cands:
        lines = cand_lines[c]
        l = len(lines)
        test_len = int(l * test_ratio)
        test_indices = set(np.random.choice(l, test_len, replace=False))
        train_indices = set(range(l)) - test_indices
        test_data[c] = list(np.take(lines, list(test_indices)))
        train_data[c] = list(np.take(lines, list(train_indices)))
    return test_data, train_data

def normalize(data):
    """Scale every row of each candidate's 2-D array so that it sums to 1.

    ``data`` maps a key to a 2-D numeric array. A new dict with the same
    keys is returned; each array is converted to float and every row is
    divided by its own sum. Rows whose sum is 0 are divided by 1 instead,
    so they come back unchanged rather than as NaN.
    """
    normalized = {}
    for cand, counts in data.items():
        as_float = counts.astype(float)
        row_totals = as_float.sum(axis=1)
        # Swap zero totals for 1 to avoid dividing by zero
        divisors = np.where(row_totals == 0, 1, row_totals)
        normalized[cand] = as_float / divisors[:, np.newaxis]
    return normalized

In [14]:
# Process the data

def _stack_by_candidate(data_by_cand):
    """Stack per-candidate feature matrices into one matrix plus a label list.

    Rows follow the iteration order of ``data_by_cand``; the label of each
    row is its candidate's index in ``cand_list``. Returns ``(None, [])``
    when ``data_by_cand`` is empty.
    """
    matrices, labels = [], []
    for cand in data_by_cand:
        matrix = data_by_cand[cand]
        matrices.append(matrix)
        labels += [cand_list.index(cand)] * len(matrix)
    # One concatenate call instead of re-copying the growing array per candidate
    stacked = np.concatenate(matrices) if matrices else None
    return stacked, labels


cand_lines = {c: get_processed_lines(c) for c in candidates}
# test_train_split returns (test, train), in that order
test_lines, train_lines = test_train_split(cand_lines, 0.25)

# Fit the vocabulary on the training paragraphs only
vectorizer = get_vectorizer(train_lines, 500)

# Word counts per paragraph, then each row scaled to sum to 1
train_data = normalize({c: vectorizer.transform(train_lines[c]).toarray()
                        for c in train_lines})
test_data = normalize({c: vectorizer.transform(test_lines[c]).toarray()
                       for c in test_lines})

train_x, train_y = _stack_by_candidate(train_data)
test_x, test_y = _stack_by_candidate(test_data)

In [13]:
# Sanity check: the distinct class labels (indices into cand_list) present
# in the training labels.
set(train_y)

{0, 1, 2, 3, 4}

In [9]:
# (number of training paragraphs, number of vocabulary features)
# NOTE(review): the saved output below shows 100 feature columns while the
# processing cell requests max_features=500, and this cell's execution count
# is lower than that cell's - presumably a stale output; re-run to refresh.
train_x.shape

(1553, 100)

In [15]:
# Optimize C and gamma: fit one SVC per (C, gamma) pair and record its
# accuracy on the training and the test data.
# NOTE(review): the pairs are compared on test_x/test_y, so the best test
# score is an optimistic estimate - consider a separate validation split.

_C, _gamma, _train, _test = [], [], [], []
param_grid = [0.01, 0.1, 1, 10, 100]

for C in param_grid:
    for gamma in param_grid:
        clf = svm.SVC(C=C, gamma=gamma)
        clf.fit(train_x, train_y)

        # Parallel lists: position k of each list describes the k-th fit
        _C.append(C)
        _gamma.append(gamma)
        _train.append(clf.score(train_x, train_y))
        _test.append(clf.score(test_x, test_y))

In [16]:
# Tabulate the grid-search results: one row per (C, gamma) pair with its
# training and test accuracy.
DataFrame({'C': _C, 'gamma': _gamma, 'train_score': _train, 'test_score': _test})

Unnamed: 0,C,gamma,test_score,train_score
0,0.01,0.01,0.21165,0.21056
1,0.01,0.1,0.21165,0.21056
2,0.01,1.0,0.21165,0.21056
3,0.01,10.0,0.21165,0.21056
4,0.01,100.0,0.21165,0.21056
5,0.1,0.01,0.21165,0.21056
6,0.1,0.1,0.21165,0.21056
7,0.1,1.0,0.21165,0.21056
8,0.1,10.0,0.32233,0.364456
9,0.1,100.0,0.21165,0.21056


In [17]:
# Show the fitted vocabulary.
# NOTE(review): `vocab` was taken from the vectorizer in the "Examples" cell
# (fitted on all paragraphs); the processing cell later rebinds `vectorizer`
# but not `vocab`, so this may not be the vocabulary behind train_x - confirm.
vocab

[u'able',
 u'absolutely',
 u'across',
 u'act',
 u'actually',
 u'add',
 u'administration',
 u'advantage',
 u'affordable',
 u'african',
 u'ago',
 u'agree',
 u'air',
 u'allies',
 u'allow',
 u'almost',
 u'along',
 u'already',
 u'also',
 u'always',
 u'amendment',
 u'america',
 u'american',
 u'amnesty',
 u'another',
 u'answer',
 u'anybody',
 u'anyone',
 u'anything',
 u'applause',
 u'army',
 u'around',
 u'ask',
 u'assad',
 u'attack',
 u'attacks',
 u'away',
 u'back',
 u'bad',
 u'banks',
 u'barack',
 u'based',
 u'become',
 u'begin',
 u'behind',
 u'believe',
 u'bell',
 u'best',
 u'better',
 u'big',
 u'biggest',
 u'bill',
 u'billion',
 u'bit',
 u'border',
 u'bottom',
 u'break',
 u'bring',
 u'budget',
 u'build',
 u'built',
 u'bush',
 u'business',
 u'buy',
 u'call',
 u'campaign',
 u'candidate',
 u'cannot',
 u'care',
 u'case',
 u'century',
 u'certainly',
 u'chance',
 u'change',
 u'chief',
 u'child',
 u'china',
 u'choice',
 u'chris',
 u'city',
 u'class',
 u'clear',
 u'clinton',
 u'coalition',
 u'coll