In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import string
from collections import Counter
import numpy as np
#pd.set_option('chained_assignment',None)

In [2]:
# Training data: tab-separated file without a header row; the two columns are named "id" and "text".
hrc_train = pd.read_csv("HRC_train.tsv", sep="\t", header=None, names=["id", "text"])


# Boilerplate fragments to strip from every email. Each entry is handed to re.sub as a
# regular-expression pattern (see clean), so every "." here matches any single character.
bad_strings2 = ["unclassified u.s. department of state", "case no. ............", 
                "doc no. c........", "date: ..........","state dept. . produced to house select", "comm.",
                "subject to agreement on sensitive information & redactions.","no foia waiver state...........",
                "no foia waiver.",  "unclassified us department of state"]



def remove_beginning(text, i):
    """Drop everything that precedes the first occurrence of "sent" in an email.

    Parameters:
        text: the email text to trim.
        i: row position of the email; only used in the diagnostic message.

    Returns:
        The text starting at the first "sent". If "sent" does not occur, a message
        is printed and the text is returned unchanged.

    Note: "sent" is matched as a plain substring, so it can also match inside a
    longer word that happens to contain those four letters.
    """
    start = text.find("sent")
    if start == -1:
        # Single-argument print call: same output as the old print statement on
        # Python 2, and valid syntax on Python 3.
        print("sent not found in row " + str(i))
        return text
    return text[start:]


# Shared by stop_and_stem: an English Snowball stemmer and the set of English stop words
# from the NLTK stopwords corpus (a set, so membership tests are cheap).
sno = nltk.stem.SnowballStemmer('english')
stop = set(stopwords.words('english'))

def stop_and_stem(string):
    """Remove stop words from a text and stem the words that remain.

    The text is split on whitespace, tokens found in the module-level `stop` set
    are discarded, each remaining token is stemmed with the module-level `sno`
    stemmer, and the stems are joined back together with single spaces.
    """
    content_words = (token for token in string.split() if token not in stop)
    return " ".join(sno.stem(token) for token in content_words)

def remove_numbers(text):
    """Return `text` without the whitespace-separated tokens that start with a digit.

    Tokens that merely contain a digit later on (e.g. "b6") are kept. The
    surviving tokens are joined with single spaces.
    """
    digits = "0123456789"
    kept = [token for token in text.split() if token[0] not in digits]
    return " ".join(kept)
    

def remove_punctuation(text):
    """Strip punctuation from `text`, turning hyphens into spaces first.

    Hyphens become spaces so hyphenated words split into separate tokens; every
    other character in string.punctuation is deleted.
    """
    spaced = text.replace("-", " ")
    # Inside the character class the backslash from string.punctuation only escapes
    # the "]" that follows it, so backslashes themselves survive this substitution...
    stripped = re.sub("["+string.punctuation+"]", "", spaced)
    # ...and are removed separately here.
    return stripped.replace("\\", "")


def clean(hrc_data, bad_strings):
    """Return a copy of `hrc_data` whose "text" column has been cleaned.

    Parameters:
        hrc_data: DataFrame with a "text" column; it is not modified.
        bad_strings: iterable of regular-expression patterns whose matches are
            deleted from every email.

    Returns:
        A copy of `hrc_data` with each text passed, in order, through
        remove_beginning, removal of the `bad_strings` patterns,
        remove_punctuation, stop_and_stem and remove_numbers.

    The cleaned texts are collected in row order and written back as a whole
    column. The earlier version read rows by position (iloc) but wrote them by
    label (loc), which is only correct when the index is exactly 0..n-1.
    """
    hrc_copy = hrc_data.copy()
    cleaned_texts = []
    for i, new_text in enumerate(hrc_copy["text"]):
        # remove the us department of state material that precedes "sent"
        new_text = remove_beginning(new_text, i)
        # remove undesirable sets of words (each entry is a regex pattern)
        for bad in bad_strings:
            new_text = re.sub(bad, "", new_text)
        # remove punctuation
        new_text = remove_punctuation(new_text)
        # remove stop words and stem what is left
        new_text = stop_and_stem(new_text)
        # remove words starting with a number
        new_text = remove_numbers(new_text)
        cleaned_texts.append(new_text)
    # Assigning a list is positional, so this works for any index.
    hrc_copy["text"] = cleaned_texts
    return hrc_copy

# Run the cleaning pipeline over the training emails. remove_beginning prints one
# message for every row in which "sent" is not found.
cleaned = clean(hrc_train, bad_strings2)



def unique(df):
    """Return the distinct whitespace-separated words in the "text" column of `df`.

    The result is a list built from a set, so its order is arbitrary.
    """
    vocabulary = set()
    for position in range(len(df)):
        vocabulary.update(df.iloc[position].text.split())
    return list(vocabulary)


def get_sender(df, index):
    """Return the "id" value of the row at position `index` in `df`."""
    row = df.iloc[index]
    return row.id

def counts(text):
    """Return a Counter mapping each whitespace-separated word of `text` to its count."""
    tally = Counter()
    tally.update(text.split())
    return tally


def feature_matrix(df):
    """Build a word-count matrix from the "text" column of `df`.

    Returns a DataFrame with one row per row of `df` and one column per word;
    each cell holds that word's count in that text as an int, with 0 for words
    that do not occur in it.
    """
    per_email_counts = [counts(df.iloc[row].text) for row in range(len(df))]
    matrix = pd.DataFrame(per_email_counts)
    # Words missing from an email come out of the constructor as NaN.
    return matrix.fillna(value=0).astype(dtype=int)

# Marks the end of the setup cell. Written as a call so it runs on Python 2 and 3 alike.
print("done")


sent not found in row 819
sent not found in row 2451
done


In [3]:
# Spot-check the cleaned text of the email at position 100.
cleaned.iloc[100].text

u'sent subject cdstategov friday septemb pm fw dni statement sent friday septemb pm nuland victoria peiton erin schmaier traci opa b6 dean nsd subject fw dni statement septemb statement director public affair nation intellig shawn turner intellig relat terrorist attack us consul benghazi libya aftermath terrorist attack us personnel facil benghazi libya intellig niti launch comprehens effort determin circumst surround assault ident mpetrat also review avail intellig determin might follow attack peopl facil libya elsewher world intellig niti collect analyz inform relat attack understand event continu evolv immedi aftermath inform led us assess attack began spontan follow protest earlier day embassi cairo provid initi assess execut branch offici member congress use inform discuss attack public provid updat becam avail throughout investig continu emphas inform gather preliminari evolv learn attack revis initi assess reflect new inform indic deliber organ terrorist attack carri extremist r

In [4]:
# Vocabulary of the cleaned training emails.
uniques = unique(cleaned)
# One pre-formatted string reproduces the spacing of the old comma-separated print
# statement exactly, and is valid on both Python 2 and Python 3.
print("There are  %d  unique words." % len(uniques))


There are  24769  unique words.


In [5]:
# Word-count matrix for the cleaned training emails: one row per email, one column per word.
fm = feature_matrix(cleaned)
#fm.to_csv("feature_matrix.csv")
fm


Unnamed: 0,a01,a06,a1,a15,a18,a19,a1pac,a1u,a25,a4,...,zuiu,zuluaga,zuma,zurcher,zurich,zuu9,zuvi,zwelakh,zziti,zzq
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#now we attempt to remove columns that aren't that useful. 

# Keep only the columns (words) whose total count over all emails is greater than n.
def remove_uncommon_words(df, n):
    """Drop the columns of `df` whose column sum is `n` or less.

    Parameters:
        df: word-count DataFrame (rows are emails, columns are words).
        n: threshold; a column is kept only if its sum is strictly greater than n.

    Returns:
        A DataFrame containing the kept columns in their original order.
    """
    col_counts = df.sum(axis=0)
    # One vectorised comparison replaces a Python-level loop with a label lookup
    # per word, which was the slow part for a vocabulary of tens of thousands.
    keeps = col_counts.index[(col_counts > n).values]
    return df[keeps]

# Earlier inline version of the same filter, kept for reference
# (for integer counts, >= 6 selects the same words as > 5).
#a = fm.sum(axis=0)
#b = fm.sum(axis=0).index.tolist()
#[i for i in b if a[i]>=6]
# Keep only the words that occur more than 5 times in total across the training emails.
fm_reduced = remove_uncommon_words(fm, 5)
fm_reduced

Unnamed: 0,aaron,ababa,abandon,abba,abc,abd,abdel,abdic,abduct,abdul,...,zebari,zelaya,zero,ziglam,zimbabw,zintan,zionism,zionist,zone,zuckerman
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Power Feature Ideas
1. Length of the email (number of words)
2. Average word length
3. Standard deviation of word length
4. Presence of a phone number (?)
5. Whether the email contains a certain phrase
    

In [7]:
# Hand-crafted ("power") features, one row per cleaned training email.
power_features = pd.DataFrame()

# Split every cleaned email on single spaces once and reuse the tokens for all three features.
tokenized_emails = [cleaned.iloc[row].text.split(" ") for row in range(len(cleaned))]
word_lengths = [[len(token) for token in tokens] for tokens in tokenized_emails]

# length of email, measured in tokens
len_emails = [len(tokens) for tokens in tokenized_emails]
power_features["len_email"] = len_emails

# mean and standard deviation of the token lengths of each email
av_word_len = [np.mean(lengths) for lengths in word_lengths]
sd_word_len = [np.std(lengths) for lengths in word_lengths]
power_features["avg_word_len"] = av_word_len
power_features["sd_word_len"] = sd_word_len

# Indicator feature: 1 for every email whose text contains the phrase, 0 otherwise.
def find_phrase(df, phrase):
    """Return a list with one 0/1 flag per row of `df`.

    The flag is 1 when `phrase` occurs as a substring of that row's "text"
    value and 0 when it does not.
    """
    return [1 if phrase in df.iloc[row].text else 0 for row in range(len(df))]

# Phrase indicators are computed on the raw training text (hrc_train), not on `cleaned`:
# remove_punctuation deletes the "." characters that "guardian.co" and "state.gov" contain.
power_features["huffingtonpost"] = find_phrase(hrc_train, "huffingtonpost")
power_features["guardian"] = find_phrase(hrc_train, "guardian.co")
power_features["state_gov"] = find_phrase(hrc_train, "state.gov")

In [8]:
# Inspect the hand-crafted feature table; the CSV export below is disabled.
power_features
#power_features.to_csv("power_features.csv", index=False)

Unnamed: 0,len_email,avg_word_len,sd_word_len,huffingtonpost,guardian,state_gov
0,24,5.541667,2.040816,0,0,0
1,33,5.606061,1.686657,0,0,0
2,118,5.050847,2.520585,0,0,1
3,433,5.445727,1.780686,0,0,0
4,285,4.859649,1.835695,0,0,0
5,35,5.342857,2.341463,0,0,0
6,120,5.166667,2.392117,0,0,0
7,72,5.527778,2.061366,0,0,0
8,50,5.200000,2.400000,0,0,0
9,106,5.349057,3.322429,0,0,0


In [9]:
# Read the test data and build its feature matrix over exactly the training columns.
hrc_test = pd.read_csv("HRC_test.tsv", sep="\t", header=None, names=["text"])
hrc_test_cleaned = clean(hrc_test, bad_strings2)

training_colnames = list(fm_reduced.columns.values)

# reindex does the whole alignment in one step: test-only words are dropped, training
# words that never occur in the test emails become all-zero columns, and the columns
# end up in training order. This replaces a per-column drop/insert loop that scanned
# a Python list for every word.
hrc_test_fm = feature_matrix(hrc_test_cleaned).reindex(columns=training_colnames, fill_value=0)
hrc_test_fm

Unnamed: 0,aaron,ababa,abandon,abba,abc,abd,abdel,abdic,abduct,abdul,...,zebari,zelaya,zero,ziglam,zimbabw,zintan,zionism,zionist,zone,zuckerman
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Sanity check on the aligned test matrix: `a` collects any test column that is not a
# training column.
test_colnames = list(hrc_test_fm.columns.values)
train_colnames = list(fm_reduced.columns.values)

a = [word for word in test_colnames if word not in train_colnames]
a  # not the last expression of the cell, so the notebook does not display it
# Export the test matrix twice: once with a header row and once without.
hrc_test_fm.to_csv("hrc_test_fm.csv", index=False)
hrc_test_fm.to_csv("hrc_test_fm_no_colnames.csv", index=False, header=False)

In [11]:
# Condense a per-email feature matrix into five rows, one per sender bucket.

def condense(fm, df):
    """Sum the rows of `fm` per sender.

    Parameters:
        fm: per-email feature matrix; row k describes the email at position k of `df`.
        df: DataFrame with an "id" column holding the sender of each email.

    Returns:
        A DataFrame indexed 1..5 with the columns of `fm`, holding integer column
        totals per sender. As before, any sender id other than 1-4 is counted in
        row 5, and a sender without emails gets a row of zeros.

    The previous version called fillna(...).astype(int) without keeping the
    result, so the integer conversion never took effect; it is applied here.
    """
    sender_ids = [1, 2, 3, 4, 5]
    n_rows = fm.shape[0]
    # Pair emails with matrix rows by position, independent of either index.
    senders = df["id"].iloc[:n_rows].reset_index(drop=True)
    # Everything that is not sender 1-4 lands in the last bucket.
    buckets = senders.where(senders.isin(sender_ids[:-1]), sender_ids[-1])
    totals = fm.groupby(buckets.values).sum()
    # reindex adds an all-NaN row for every bucket that received no email.
    p = totals.reindex(sender_ids).fillna(value=0).astype(dtype=int)
    return p

# Per-sender word totals over the reduced vocabulary; the CSV export below is disabled.
condensed_fm = condense(fm_reduced, cleaned)
# condensed_fm.to_csv("condensed_reduced_fm.csv")
condensed_fm


Unnamed: 0,aaron,ababa,abandon,abba,abc,abd,abdel,abdic,abduct,abdul,...,zebari,zelaya,zero,ziglam,zimbabw,zintan,zionism,zionist,zone,zuckerman
1,1,1,5,24,2,19,20,0,1,9,...,0,18,0,8,4,20,0,0,24,0
2,5,0,8,4,17,0,3,5,15,3,...,0,77,4,0,2,0,0,0,20,0
3,6,4,11,59,2,8,2,0,5,2,...,13,80,2,0,3,0,0,1,10,5
4,0,1,1,5,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,0,0,34,16,6,0,1,2,1,3,...,0,7,22,0,0,0,27,30,14,2


In [62]:
# Number of training emails per sender id.
num_emails = Counter(cleaned["id"])
num_emails  # not the last expression of the cell, so the notebook does not display it
# Total occurrences of the token "pm" per sender.
condensed_fm["pm"]

1     708
2    1437
3     846
4    1329
5     250
Name: pm, dtype: int64

In [66]:
# 7051 features may be too many; rank the words by how much their usage differs between
# senders so that the least predictive ones can be thrown out.
num_emails = Counter(cleaned["id"])
# Float copy of the per-sender totals, so the rescaled values below are stored as floats.
condensed_fm2 = condensed_fm.copy().astype(float)
for i in [1,2,3,4,5]:
    # Rescale sender i's totals to occurrences per 100 of that sender's emails.
    condensed_fm2.loc[i,] = condensed_fm2.loc[i,].apply(lambda el: 100*el/num_emails[i])
# Variance of every word's rescaled count across the five senders, largest first.
variance_list = condensed_fm2.apply(np.var, axis=0).sort_values(ascending=False)

def n_highest_variance_features(variance_list, n):
    """Return the first `n` index labels of `variance_list` as a list.

    `variance_list` is expected to be sorted already (highest variance first);
    no sorting happens here.
    """
    leading_labels = variance_list.index[:n]
    return leading_labels.tolist()

# The ten words whose rescaled counts vary most between senders.
n_highest_variance_features(variance_list, 10)

[u'pm',
 u'obama',
 u'would',
 u'american',
 u'parti',
 u'us',
 u'presid',
 u'offic',
 u'one',
 u'republican']

In [12]:
"guardian" in condensed_fm.columns.values
condensed_fm["guardian"]

1    13
2    10
3     3
4     0
5    50
Name: guardian, dtype: int64

In [13]:
#find which emails contain specific phone number. 202-647-9533 for example.

# Return the senders of all emails whose text contains a given string.
def find_string(df, text):
    """Find the emails in `df` that contain `text` and report who sent them.

    Parameters:
        df: DataFrame with "id" (sender) and "text" columns.
        text: substring to look for in each email's text.

    Returns:
        A list with the sender id of every matching email, in row order (a sender
        appears once per matching email). The row positions of the matches are
        printed as a side effect.
    """
    senders = []
    indices = []
    for i in range(len(df)):
        # Fetch the row once and read both fields from it.
        row = df.iloc[i]
        if text in row["text"]:
            senders.append(row["id"])
            indices.append(i)
    # Pre-formatted so the output matches the old comma-separated print statement
    # exactly while also being valid on Python 3.
    print("%s  found in emails: %s" % (text, indices))
    return senders

#phone1 = find_string(hrc_train, "202-647-9533")
#print Counter(phone1)

#http = find_string(hrc_train, "http")
#print Counter(http)

# Senders of the raw training emails that mention "state.gov", tallied per sender.
clinton = find_string(hrc_train, "state.gov")
# Call form of print: same output on Python 2, valid syntax on Python 3.
print(Counter(clinton))

state.gov  found in emails: [2, 17, 21, 22, 28, 29, 34, 48, 58, 66, 68, 74, 84, 86, 100, 104, 107, 108, 114, 115, 123, 125, 128, 129, 130, 136, 143, 145, 146, 156, 160, 163, 164, 166, 167, 169, 174, 176, 180, 182, 188, 199, 203, 208, 210, 211, 216, 220, 225, 231, 232, 233, 234, 239, 247, 253, 256, 276, 279, 281, 284, 285, 288, 289, 294, 313, 315, 322, 343, 352, 362, 363, 364, 366, 383, 397, 401, 405, 418, 421, 430, 433, 440, 449, 458, 467, 486, 488, 495, 508, 512, 516, 521, 526, 532, 536, 552, 556, 558, 563, 564, 565, 566, 571, 572, 575, 593, 595, 596, 609, 613, 619, 620, 621, 638, 647, 649, 651, 652, 659, 661, 667, 669, 680, 681, 682, 686, 688, 693, 696, 699, 700, 702, 704, 706, 710, 713, 714, 716, 717, 718, 719, 720, 723, 724, 725, 733, 734, 736, 751, 758, 759, 769, 771, 775, 776, 784, 785, 787, 790, 807, 808, 815, 818, 820, 821, 835, 838, 846, 848, 850, 852, 855, 866, 868, 869, 872, 880, 883, 884, 894, 897, 898, 907, 915, 929, 930, 931, 932, 933, 946, 952, 968, 973, 976, 978, 979, 9

In [14]:
# Find the 100 most commonly used words across all cleaned emails.
mega_string = "".join(cleaned.iloc[row].text + " " for row in range(len(cleaned)))
Counter(mega_string.split()).most_common()[:100]

#hrc_train.iloc[83].text

[(u'sent', 8624),
 (u'subject', 8223),
 (u'pm', 4570),
 (u'us', 3355),
 (u'state', 3246),
 (u'call', 3208),
 (u'messag', 3011),
 (u'said', 2941),
 (u'origin', 2589),
 (u'would', 2538),
 (u'presid', 2268),
 (u'secretari', 2237),
 (u'fw', 2162),
 (u'obama', 2043),
 (u'time', 1922),
 (u'govern', 1874),
 (u'one', 1769),
 (u'offic', 1755),
 (u'meet', 1730),
 (u'work', 1709),
 (u'new', 1700),
 (u'american', 1637),
 (u'may', 1568),
 (u'also', 1488),
 (u'say', 1438),
 (u'secur', 1430),
 (u'like', 1390),
 (u'talk', 1373),
 (u'report', 1360),
 (u'depart', 1359),
 (u'want', 1338),
 (u'clinton', 1336),
 (u'year', 1328),
 (u'nation', 1311),
 (u'peopl', 1308),
 (u'get', 1227),
 (u'need', 1211),
 (u'right', 1203),
 (u'support', 1199),
 (u'know', 1199),
 (u'countri', 1195),
 (u'go', 1168),
 (u'make', 1168),
 (u'unit', 1155),
 (u'polit', 1148),
 (u'minist', 1144),
 (u'foreign', 1139),
 (u'hous', 1131),
 (u'offici', 1128),
 (u'issu', 1126),
 (u'parti', 1116),
 (u'see', 1093),
 (u'two', 1089),
 (u'today'

In [15]:
# Helpers that split the raw TSV files into a labels file and original/cleaned text files.
def write_labels(filepath,new_name):
    """Read a tab-separated id/text file and write its "id" column to `new_name`.

    The input is read without a header row; the output is written without the index.
    """
    frame = pd.read_csv(filepath, sep="\t", header=None, names=["id", "text"])
    frame["id"].to_csv(new_name, index=False)

#write_labels("HRC_train.tsv", "training_labels.csv")

def _write_original_and_cleaned(filepath, new_name, column_names):
    """Read a headerless TSV file and write its original and cleaned "text" columns."""
    data = pd.read_csv(filepath, sep="\t", header=None, names=column_names)
    # Clean before writing anything, so a failure in clean leaves no partial output.
    cleaned_data = clean(data, bad_strings2)
    data["text"].to_csv("original_" + new_name, index=False, sep="\t")
    cleaned_data["text"].to_csv("cleaned_" + new_name, index=False, sep="\t")


def write_train_data(filepath, new_name):
    """Write the original and cleaned text of a training file (columns: id, text).

    Produces "original_<new_name>" and "cleaned_<new_name>", each holding only
    the text column, written without the index.
    """
    _write_original_and_cleaned(filepath, new_name, ["id", "text"])


def write_test_data(filepath, new_name):
    """Write the original and cleaned text of a test file (single column: text).

    Produces "original_<new_name>" and "cleaned_<new_name>", each holding only
    the text column, written without the index.
    """
    _write_original_and_cleaned(filepath, new_name, ["text"])

                                                               
                                                               
                                                               
#write_train_data("HRC_train.tsv", "hrc_train_text.tsv")
#write_test_data("HRC_test.tsv", "hrc_test_text.tsv")

