# W266 Term Project

## Identification of Future Societal Events

### John Chiang, Vincent Chu

In [85]:
import nltk 
from nltk import tokenize
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords

import numpy as np
import os

# Regular expression
import re

# Numerical manipulation libraries.
from scipy import stats
import scipy.optimize

# Helper libraries
import segment
import utils
import vocabulary

# Plotly imports.
import plotly.offline as plotly
plotly.offline.init_notebook_mode()
import plotly.graph_objs as go

# scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [11]:
#############################################################
# Globals
#############################################################

root_dir = '/home/vslchu/w266/project/data/eventstatus_eng/'

In [3]:
##############################################################
# Create overall corpus for all the annotated files in 
# English
##############################################################

eventstatus_corpus = PlaintextCorpusReader(root_dir, ".*\.txt")
eventstatus_corpus_text  = nltk.Text(eventstatus_corpus.words())

In [4]:
##############################################################
# EDA - Using code from Week 2 Materials
##############################################################

# "canonicalize_word" performs a few tweaks to the token stream of
# the corpus.  For example, it replaces digits with DG allowing numbers
# to aggregate together when we count them below
token_feed = (utils.canonicalize_word(w) for w in eventstatus_corpus.words())

# Collect counts of tokens and assign wordids
vocab = vocabulary.Vocabulary(token_feed)
print "Vocabulary size: %d" % vocab.size

# Print out some (debugging) statistics to make sure everything went
# as we expected.
print "Most common unigrams:"
for word, count in vocab.unigram_counts.most_common(10):
    print "\"%s\": %d" % (word, count)

Vocabulary size: 50960
Most common unigrams:
",": 117118
"the": 106509
".": 82224
"DG": 56611
"to": 45134
"=": 43996
"of": 43618
"a": 39062
"in": 38773
"and": 37846


In [5]:
##############################################################
# EDA (cont') - Using code from Week 2 Materials
##############################################################

words, counts = zip(*vocab.unigram_counts.most_common(20))
data = [go.Bar(x=words, y=counts, name='Counts')]
plotly.iplot(data)

In [6]:
##############################################################
# EDA (cont') - Using code from Week 2 Materials
##############################################################

# This next line splits the pairs of <word, count> in the vocabulary into two lists:
# 1.  a list of words (types)
# 2.  a list of counts (per type)
# with the property that the ith word in the list has its corresponding count in the ith counts.
words, counts = zip(*vocab.unigram_counts.most_common(vocab.size))
counts = np.array(counts, dtype=float)  # Avoid integer math.
rank = 1 + np.arange(len(counts))  # rank is an array of [1, 2, 3, 4, ..., num_types]
N = np.sum(counts)  # N = total # of tokens seen.
p = counts / N  # p is an array the length of `words`.  #_times_word_seen / total_#_words

# Fit a power law curve to the histogram above.
# Optimize least-squares in log space.
fit_func = lambda (a, b): (np.log(a*p) - np.log(a * rank**b))
(a,b), _ = scipy.optimize.leastsq(fit_func, np.array([p[0], -1.0]))
print u"Power law exponent: \u03B2 = %.02f" % b
p_pred = (a * rank**b) / sum(a * rank**b)  # predict probabilities
c_pred = N * p_pred  # predict counts

# Plot counts, with fit curve.
nplot = 100
data = [go.Bar(x=words[:nplot], y=counts[:nplot], name='Counts'),
        go.Scatter(x=words[:nplot], y=np.round(c_pred)[:nplot], name="Zipf Fit")]
layout=go.Layout(yaxis=dict(range=[0,1.2*max(counts)]))
fig = go.Figure(data=data, layout=layout)
plotly.iplot(fig)

Power law exponent: β = -1.36


In [13]:
##############################################################
# Functions
##############################################################

def process_tokenized_annotated_file(tokens):
    events=[]
    events_tags=[]

    chunk=[]
    chunk_tag=[]

    chunk_on=False
    bars_on=False

    for i in range(len(tokens)):        
        if tokens[i]=='>':
            if tokens[i-1]=='CHUNK':
                chunk_on=True
            elif tokens[i-1]=='/CHUNK':
                chunk_on=False
                chunk=chunk[:-2]
                events.append(chunk)
                events_tags.append(chunk_tag)
                chunk=[]
                chunk_tag=[]
        elif chunk_on==True:
            if tokens[i]=='|||':
                bars_on=True
            elif tokens[i].find("NO=") > -1:
                bars_on=False
            elif bars_on:  
                if tokens[i-1]=='|||':
                    chunk_tag.append(tokens[i])
            else:
                chunk.append(tokens[i])
    
    return (events, events_tags)

def process_annotated_file(file_path):
    f=open(file_path,'rU')
    raw=f.read()
    tokens=tokenize.word_tokenize(raw)
    return process_tokenized_annotated_file(tokens)

def process_annotated_files_dir(dir):
    corpus_text=[]
    corpus_tags=[]
    corpus_files=[]

    for dirName, subdirList, fileList in os.walk(dir):
        for fname in fileList:
            try:
                temp_text,temp_tags=process_annotated_file(dir + fname)
                corpus_text.append(temp_text)
                corpus_tags.append(temp_tags)
                corpus_files.append(fname)
            except UnicodeDecodeError:
                continue
    return (corpus_text,corpus_tags,corpus_files)

In [14]:
process_annotated_file('/home/vslchu/w266/project/data/eventstatus_eng/ltw_eng_199708_doc_813.txt')

([['A',
   'more',
   'feasible',
   'problem',
   'for',
   'the',
   'nuclear',
   'industry',
   ',',
   'according',
   'to',
   'some',
   'analysts',
   ',',
   'could',
   'be',
   'organized',
   'demonstrations',
   'that',
   'disrupt',
   'or',
   'halt',
   'fuel',
   'shipments',
   '.',
   'Shipping',
   'of',
   'spent-fuel',
   'casks',
   'has',
   'become',
   'a',
   'major',
   'political',
   'issue',
   'in',
   'Germany',
   ',',
   'where',
   'earlier',
   'this',
   'year',
   'thousands',
   'protested',
   'against',
   'a',
   'shipment',
   'of',
   'six',
   'casks',
   'by',
   'rail',
   'and',
   'truck',
   'to',
   'a',
   'storage',
   'facility',
   'in',
   'the',
   'northern',
   'farm',
   'town',
   'of',
   'Gorleben',
   '.',
   'There',
   'were',
   'about',
   '500',
   'arrests',
   ',',
   'and',
   'some',
   '150',
   'demonstrators',
   'and',
   '20',
   'police',
   'officers',
   'were',
   'injured',
   '.']],
 [['PA']])

In [42]:
corpus_text,corpus_tags,corpus_files=process_annotated_files_dir(root_dir)

print "len(corpus_text) = ", len(corpus_text)
print "len(corpus_tags) = ", len(corpus_tags)
print "len(corpus_files) = ", len(corpus_files)

len(corpus_text) =  2953
len(corpus_tags) =  2953
len(corpus_files) =  2953


In [55]:
##############################################################
# Functions v2
##############################################################

def process_tokenized_annotated_file_v2(tokens):

    chunk=[]
    chunk_tag=[]

    events_matrix=[]
    
    chunk_on=False
    bars_on=False

    for i in range(len(tokens)):        
        if tokens[i]=='>':
            if tokens[i-1]=='CHUNK':
                chunk_on=True
            elif tokens[i-1]=='/CHUNK':
                chunk_on=False
                chunk=chunk[:-2]
                
                for j in range(len(chunk_tag)):                    
                    events_matrix.append([chunk, chunk_tag[j]])
                    
                chunk=[]
                chunk_tag=[]
        elif chunk_on==True:
            if tokens[i]=='|||':
                bars_on=True
            elif tokens[i].find("NO=") > -1:
                bars_on=False
            elif bars_on:  
                if tokens[i-1]=='|||':
                    chunk_tag.append(tokens[i])
            else:
                chunk.append(tokens[i])
    
    return events_matrix #(events, events_tags)

def process_annotated_file_v2(file_path):
    f=open(file_path,'rU')
    raw=f.read()
    tokens=tokenize.word_tokenize(raw)
    return process_tokenized_annotated_file_v2(tokens)

def process_annotated_files_dir_v2(dir):
    
    events_matrix=[]
    
    for dirName, subdirList, fileList in os.walk(dir):
        for fname in fileList:
            try:
                temp_matrix=process_annotated_file_v2(dir + fname)                

                for i in range(len(temp_matrix)):
                    temp_matrix[i].append(fname)                                            
                
                events_matrix=events_matrix+temp_matrix
            except UnicodeDecodeError:
                continue
    return events_matrix 

def process_chunk(chunk):    
    chunk_sent = " ".join(chunk)    
    return re.sub("[^a-zA-Z']", " ", chunk_sent)

In [56]:
temp_list = process_annotated_file_v2('/home/vslchu/w266/project/data/eventstatus_eng/apw_eng_199603_doc_4538.txt')
print np.array(temp_list).shape
print temp_list

(4, 2)
[[['Thousands', 'of', 'Palestinians', ',', 'many', 'of', 'them', 'high', 'school', 'students', ',', 'protested', 'against', 'the', 'closure', 'Tuesday', 'in', 'five', 'West', 'Bank', 'rallies', 'organized', 'by', 'Arafat', "'s", 'Palestinian', 'Authority', '.'], 'PA'], [['Thousands', 'of', 'Palestinians', ',', 'many', 'of', 'them', 'high', 'school', 'students', ',', 'protested', 'against', 'the', 'closure', 'Tuesday', 'in', 'five', 'West', 'Bank', 'rallies', 'organized', 'by', 'Arafat', "'s", 'Palestinian', 'Authority', '.'], 'PA'], [['``', 'No', 'to', 'the', 'closure', ',', 'no', 'to', 'violence', 'and', 'yes', 'to', 'peace', ',', "''", 'read', 'a', 'banner', 'at', 'a', 'Bethlehem', 'protest', 'rally', '.'], 'PA'], [['``', 'No', 'to', 'the', 'closure', ',', 'no', 'to', 'violence', 'and', 'yes', 'to', 'peace', ',', "''", 'read', 'a', 'banner', 'at', 'a', 'Bethlehem', 'protest', 'rally', '.'], 'PA']]


In [58]:
events = process_annotated_files_dir_v2(root_dir)

In [59]:
print events[0][0]
print process_chunk(events[0][0])

['France', "'s", 'Stephane', 'Peterhansel', 'wrapped', 'up', 'his', 'second', 'successive', 'Dakar', 'Rally', 'title', 'on', 'Sunday', 'while', 'compatriot', 'Cyril', 'Despres', 'made', 'it', 'a', 'double', 'French', 'celebration', 'by', 'taking', 'the', 'motorcycle', 'honours', 'in', 'the', 'gruelling', 'event', '.']
France 's Stephane Peterhansel wrapped up his second successive Dakar Rally title on Sunday while compatriot Cyril Despres made it a double French celebration by taking the motorcycle honours in the gruelling event  


In [65]:
print "Cleaning and parsing the training set movie reviews...\n"
clean_chunks=[]
temporal_states=[]
event_files=[]

for i in xrange(len(events)):
    # If the index is evenly divisible by 100, print a message
    if((i+1)%500 == 0):
        print "Review %d of %d" % (i+1, len(events))                                                                    
    clean_chunks.append(process_chunk(events[i][0]))
    temporal_states.append(events[i][1])
    event_files.append(events[i][2])
    
print
print clean_chunks
print temporal_states
print event_files

Cleaning and parsing the training set movie reviews...

Review 500 of 5621
Review 1000 of 5621
Review 1500 of 5621
Review 2000 of 5621
Review 2500 of 5621
Review 3000 of 5621
Review 3500 of 5621
Review 4000 of 5621
Review 4500 of 5621
Review 5000 of 5621
Review 5500 of 5621

['NA', 'NA', 'NA', 'PA', 'PA', 'PA', 'PA', 'PA', 'NA', 'NA', 'NA', 'NA', 'NA', 'OG', 'FP', 'PA', 'NA', 'NA', 'PA', 'NA', 'NA', 'NA', 'NA', 'NA', 'PA', 'PA', 'XX', 'PA', 'PA', 'XX', 'PA', 'PA', 'NA', 'NA', 'NA', 'PA', 'NA', 'FT', 'XX', 'NA', 'NA', 'NA', 'NA', 'PA', 'FP', 'PA', 'NA', 'OG', 'OG', 'OG', 'PA', 'PA', 'PA', 'PA', 'PA', 'PA', 'PA', 'PA', 'PA', 'PA', 'PA', 'FT', 'XX', 'PA', 'NA', 'NA', 'NA', 'PA', 'PA', 'NA', 'NA', 'PA', 'NA', 'NA', 'PA', 'NA', 'NA', 'NA', 'PA', 'NA', 'NA', 'FP', 'FP', 'FP', 'FP', 'PA', 'PA', 'PA', 'NA', 'NA', 'NA', 'PA', 'PA', 'PA', 'OG', 'PA', 'PA', 'PA', 'PA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'PA', 'PA', 'PA', 'OG', 'PA', 'OG', 'NA', 'PA', 'PA', 'PA', 'PA', 'NA', 'NA', 'NA', 'PA', 'PA

In [68]:
print "Creating the bag of words...\n"

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_chunks)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

print "Completed the bag of words...\n"
print train_data_features.shape

Creating the bag of words...

Completed the bag of words...

(5621, 5000)


In [69]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print vocab



In [70]:
# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print count, tag

8 abacha
5 abandon
9 abandoned
13 abbas
4 abdel
6 abdullah
8 abhisit
5 abide
4 abidjan
7 ability
8 abiola
4 abkhazia
18 able
9 aboard
20 abortion
363 about
24 above
10 abroad
4 absence
5 absolute
4 absolutely
17 abu
4 abubakar
14 abuse
10 abuses
20 accept
6 accepted
11 access
4 accession
8 accident
17 accord
79 according
4 accords
5 account
8 accounts
5 accusations
53 accused
7 accuses
7 accusing
4 aceh
5 achieve
7 achieved
6 acknowledged
4 acquired
4 acronym
124 across
36 act
7 acting
77 action
26 actions
17 activist
99 activists
11 activities
7 activity
8 actors
8 acts
6 actual
9 actually
11 ad
6 add
39 added
33 adding
14 addition
12 additional
18 address
6 addressing
18 adds
4 adhere
4 adjacent
50 administration
8 administrative
7 administrators
7 admitted
4 admitting
4 adopted
4 ads
17 advance
9 advantage
6 advertising
4 advised
9 advocates
5 aerial
5 affair
13 affairs
10 affect
16 affected
5 affecting
27 afghan
55 afghanistan
5 afghans
32 afp
6 afraid
28 africa
37 african
4 africa

In [74]:
### Train a Random Forest classifier using 5000 out of 5621 event news chunks

# Initialize a Random Forest classifier with 100 trees
rf = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
rf = forest.fit(train_data_features[:5000], temporal_states[:5000])

In [78]:
### Generate predictions using the ramdom forest model for the remaining 621 
### event news cuhunks

pred_temporal_states = forest.predict(train_data_features[5001:])

print pred_temporal_states

['NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA'
 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'PA' 'NA' 'PA' 'PA'
 'NA' 'NA' 'PA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'PA' 'PA' 'PA' 'PA' 'NA' 'NA'
 'NA' 'NA' 'NA' 'NA' 'PA' 'PA' 'NA' 'NA' 'PA' 'NA' 'NA' 'NA' 'NA' 'PA' 'PA'
 'PA' 'NA' 'NA' 'PA' 'PA' 'NA' 'NA' 'NA' 'PA' 'PA' 'NA' 'NA' 'NA' 'OG' 'OG'
 'PA' 'PA' 'NA' 'PA' 'NA' 'NA' 'NA' 'PA' 'NA' 'NA' 'NA' 'NA' 'NA' 'PA' 'PA'
 'PA' 'PA' 'PA' 'PA' 'PA' 'NA' 'NA' 'PA' 'PA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA'
 'PA' 'NA' 'NA' 'PA' 'PA' 'NA' 'NA' 'PA' 'PA' 'PA' 'PA' 'NA' 'PA' 'PA' 'PA'
 'PA' 'PA' 'PA' 'PA' 'NA' 'PA' 'PA' 'PA' 'PA' 'PA' 'NA' 'NA' 'PA' 'NA' 'PA'
 'NA' 'NA' 'PA' 'OG' 'OG' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'PA' 'PA' 'PA'
 'NA' 'NA' 'NA' 'NA' 'NA' 'PA' 'PA' 'PA' 'PA' 'PA' 'NA' 'NA' 'PA' 'PA' 'NA'
 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'PA' 'NA' 'PA' 'PA' 'NA' 'NA' 'NA'
 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'PA' 'PA' 'NA' 'PA' 'PA' 'NA' 'NA' 'NA' 'NA'
 'NA' 'NA' '

In [89]:
### Evaluate Performance of model

print "F1 Score = %f" % f1_score(temporal_states[5001:], pred_temporal_states, average='weighted')
print "Precision Score = %f" % precision_score(temporal_states[5001:], pred_temporal_states, average='weighted')
print "Recall Score = %f" % recall_score(temporal_states[5001:], pred_temporal_states, average='weighted')

F1 Score = 0.685337
Precision Score = 0.670272
Recall Score = 0.733871



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.

