In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import string
from collections import Counter
import numpy as np

In [2]:
#load the training emails: tab-separated file with no header row; the two columns are named id and text
hrc_train = pd.read_csv(
    "HRC_train.tsv",
    sep="\t",
    header=None,
    names=["id", "text"],
)


#boilerplate to strip from every email. Each entry is handed to re.sub by clean(),
#so it is a regular expression and every "." matches any single character.
bad_strings2 = [
    "unclassified u.s. department of state",
    "case no. ............",
    "doc no. c........",
    "date: ..........",
    "state dept. . produced to house select benghazi comm.",
    "subject to agreement on sensitive information & redactions.",
    "no foia waiver state...........",
    "no foia waiver.",
    "unclassified us department of state",
]



def remove_beginning(text, i):
    """Drop everything that comes before the first occurrence of "sent".

    text: the email body to trim.
    i:    row number, used only in the diagnostic message.

    Returns the text starting at the first "sent". If "sent" does not occur,
    a message is printed and the text is returned unchanged.

    NOTE(review): this is a plain substring search, so the first match may be
    inside a longer word (e.g. "consent"); behaviour is kept as-is.
    """
    start = text.find("sent")
    if start == -1:
        #single-argument parenthesised print behaves the same on Python 2 and Python 3
        print("sent not found in row " + str(i))
        return text
    return text[start:]


#English Snowball stemmer instance; stop_and_stem calls sno.stem on each kept word
sno = nltk.stem.SnowballStemmer('english')
#NLTK English stop-word list held as a set; stop_and_stem tests membership against it
stop = set(stopwords.words('english'))

def stop_and_stem(string):
    """Remove stop words from a text and stem what is left.

    The text is split on whitespace, words found in the module-level `stop`
    set are discarded, and each remaining word is passed through the
    module-level `sno` stemmer. The stems are returned joined by single spaces.

    Note: the parameter name hides the imported `string` module inside this
    function body.
    """
    stems = [sno.stem(token) for token in string.split() if token not in stop]
    return " ".join(stems)

def remove_numbers(text):
    """Return the text without any whitespace-separated word that starts with a digit 0-9.

    The surviving words are re-joined with single spaces.
    """
    digits = "0123456789"
    kept = [token for token in text.split() if token[0] not in digits]
    return " ".join(kept)
    

def remove_punctuation(text):
    """Replace hyphens with spaces, then delete every other punctuation character.

    The set of punctuation is string.punctuation. It is passed through
    re.escape before being placed in a character class, so every character
    (including the backslash and the square brackets) is matched literally and
    a single substitution removes them all.
    """
    #hyphens become spaces so hyphenated words split into separate tokens
    text = re.sub("-", " ", text)
    text = re.sub("[" + re.escape(string.punctuation) + "]", "", text)
    return text


def clean(hrc_data, bad_strings):
    """Return a copy of hrc_data whose "text" column has been cleaned.

    hrc_data:    DataFrame with a "text" column; it is not modified.
    bad_strings: iterable of regular-expression patterns to delete from each text.

    Each text goes through, in order: remove_beginning, deletion of every
    pattern in bad_strings, remove_punctuation, stop_and_stem, remove_numbers.

    Rows are read and written purely by position, so the result is correct
    whatever index the frame carries.
    """
    hrc_copy = hrc_data.copy()
    #compile once instead of once per row
    bad_patterns = [re.compile(bad) for bad in bad_strings]
    cleaned_texts = []
    for position, new_text in enumerate(hrc_copy["text"]):
        #remove us department of state stuff at beginning.
        new_text = remove_beginning(new_text, position)
        #remove undesirable sets of words
        for pattern in bad_patterns:
            new_text = pattern.sub("", new_text)
        #remove punctuation
        new_text = remove_punctuation(new_text)
        #remove stop words and stem the rest
        new_text = stop_and_stem(new_text)
        #remove words starting with a number.
        new_text = remove_numbers(new_text)
        cleaned_texts.append(new_text)
    #assigning a list fills the column in row order, independent of index labels
    hrc_copy["text"] = cleaned_texts
    return hrc_copy

#run the cleaning pipeline on the training emails; clean() works on a copy, so hrc_train keeps the raw text
cleaned = clean(hrc_train, bad_strings2)



def unique(df):
    """Return the distinct whitespace-separated tokens found in df's text column.

    The result is a list built from a set, so its order is not meaningful.
    """
    all_tokens = [
        token
        for row in range(len(df))
        for token in df.iloc[row].text.split()
    ]
    return list(set(all_tokens))


def get_sender(df, index):
    """Return the `id` value of the row at integer position `index` in df."""
    row = df.iloc[index]
    return row.id

def counts(text):
    """Return a Counter mapping each whitespace-separated token of text to its frequency."""
    tokens = text.split()
    tally = Counter(tokens)
    return tally


def feature_matrix(df):
    """Build a word-count matrix from df's text column.

    Row k holds the token counts of the k-th text (as produced by counts());
    there is one column per token seen anywhere. Tokens absent from a text get
    0 and the whole frame is cast to int.
    """
    per_email_counts = [counts(df.iloc[row].text) for row in range(len(df))]
    matrix = pd.DataFrame(per_email_counts)
    return matrix.fillna(value=0).astype(dtype=int)

#progress marker: printed once every statement in this cell has run
print "done"


sent not found in row 819
sent not found in row 2451
done


In [4]:
#spot-check: show the cleaned text of the row at position 4
cleaned.iloc[4].text



u'sent sunday march subject fw abz hbj libya west bankgaza fyi b6 sent sunday march mcdonough deni r burn william well alic g schrepel dawn schlicher ronald l sanderson janet wail roebuck william v bondi steven c tsou lesli sutphin paul r talwar pune prern g rudman mara mitchel georg kumar hale david sachar alon neaipa olson richard g lebaron joseph e cretz gene polaschlk joan bark ruggl erica steven john c subject abz hb3 libya west bankgaza sbu back back phone call morn qatari pmfm hamad bin jassim uae foreign minist abdullah bin zay content virtual ident su est either rare coincid like unusu level uae qatari coordin yesterday arab leagu meet main point most libya relat one palestinian relat follow sbu fli zone arab leagu call nfz unsc need act syria last hold resolut pass time wast hbj abz made right kind nois still less solid commit would kind qatari emirati particip intern effort chicken egg question go support nfz talk particip sbu recogn council urg us note arab leagu statement 

In [None]:
#collect the distinct tokens across all cleaned emails and report how many there are
uniques = unique(cleaned)
print "There are ", len(uniques), " unique words."


In [146]:
#build the word-count matrix (one row per cleaned email, one column per token) and display it
fm = feature_matrix(cleaned)
fm


Unnamed: 0,a01,a06,a1,a15,a18,a19,a1pac,a1u,a25,a4,...,zuiu,zuluaga,zuma,zurcher,zurich,zuu9,zuvi,zwelakh,zziti,zzq
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
#now we attempt to remove columns that aren't that useful. 

#remove all columns where the word shows up n or fewer times across all the emails (only totals strictly greater than n are kept). Beware this can take a long time on a large matrix.
def remove_uncommon_words(df, n):
    """Return df restricted to the columns whose total is strictly greater than n.

    df: count matrix with one column per word.
    n:  threshold; a column is dropped when its sum over all rows is n or less.

    Column order is preserved. The selection uses one boolean mask over the
    column totals instead of a Python-level lookup per word.
    """
    col_counts = df.sum(axis=0)
    keep_mask = (col_counts > n).values
    return df.loc[:, keep_mask]

#scratch version of the same filter, kept for reference (a total >= 6 is the same as > 5 for integer counts):
#a = fm.sum(axis=0)
#b = fm.sum(axis=0).index.tolist()
#[i for i in b if a[i]>=6]
#keep only the words whose total count over all emails is greater than 5, then display the result
fm_reduced = remove_uncommon_words(fm, 5)
fm_reduced

Unnamed: 0,aaron,ababa,abandon,abba,abc,abd,abdel,abdic,abduct,abdul,...,zebari,zelaya,zero,ziglam,zimbabw,zintan,zionism,zionist,zone,zuckerman
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [159]:
#condense fm into five rows where each row is a sender

def condense(fm, df):
    """Sum the rows of a count matrix per sender.

    fm: count matrix; row k corresponds to the row at position k of df.
    df: DataFrame with an `id` column holding the sender of each row.

    Returns an integer DataFrame indexed 1..5 with the same columns as fm.
    Rows whose id is 1, 2, 3 or 4 are added to that sender; every other id is
    added to row 5. A sender with no rows gets all zeros.
    """
    n_rows = fm.shape[0]
    known_senders = (1, 2, 3, 4)
    #anything that is not one of the first four senders is pooled under 5
    labels = np.array(
        [int(sender) if sender in known_senders else 5
         for sender in df["id"].values[:n_rows]]
    )
    #grouping by an array groups the rows of fm by position
    totals = fm.groupby(labels).sum()
    #make sure all five senders are present, in order, even when one has no emails
    totals = totals.reindex([1, 2, 3, 4, 5], fill_value=0)
    #the cast result must be kept: fillna/astype return new frames
    return totals.fillna(value=0).astype(dtype=int)

#sum the reduced word counts per sender (senders are looked up in `cleaned`) and display the result
condensed_fm = condense(fm_reduced, cleaned)
condensed_fm


Unnamed: 0,aaron,ababa,abandon,abba,abc,abd,abdel,abdic,abduct,abdul,...,zebari,zelaya,zero,ziglam,zimbabw,zintan,zionism,zionist,zone,zuckerman
1,1,1,5,24,2,19,20,0,1,9,...,0,18,0,8,4,20,0,0,24,0
2,5,0,8,4,17,0,3,5,15,3,...,0,77,4,0,2,0,0,0,20,0
3,6,4,11,59,2,8,2,0,5,2,...,13,80,2,0,3,0,0,1,10,5
4,0,1,1,5,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,0,0,34,16,6,0,1,2,1,3,...,0,7,22,0,0,0,27,30,14,2


In [158]:
#save the per-sender matrix to disk
condensed_fm.to_csv("condensed_reduced_fm.csv")

In [164]:
#read the saved per-sender matrix back, using the first column as the index, and display it
#(the name `test` is reassigned by a later cell)
test = pd.read_csv("condensed_reduced_fm.csv",index_col=0)
test

Unnamed: 0,aaron,ababa,abandon,abba,abc,abd,abdel,abdic,abduct,abdul,...,zebari,zelaya,zero,ziglam,zimbabw,zintan,zionism,zionist,zone,zuckerman
1,1,1,5,24,2,19,20,0,1,9,...,0,18,0,8,4,20,0,0,24,0
2,5,0,8,4,17,0,3,5,15,3,...,0,77,4,0,2,0,0,0,20,0
3,6,4,11,59,2,8,2,0,5,2,...,13,80,2,0,3,0,0,1,10,5
4,0,1,1,5,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,0,0,34,16,6,0,1,2,1,3,...,0,7,22,0,0,0,27,30,14,2


In [22]:
#put the full (unreduced) feature matrix into a csv file, then print a completion marker.
fm.to_csv("feature_matrix.csv")
print "done"

done


In [72]:
#inspect a slice of the feature matrix's column names (the tokens) by position
#hrc_train.iloc[92].text
a = fm.columns.values
print a[1000:6100]

[u'095503' u'095524' u'095548' ..., u'9842' u'985' u'98510']


In [122]:
#find which emails contain specific phone number. 202-647-9533 for example.

#return list of senders for string
def find_string(df, text):
    """Find the emails that contain `text` as a whole whitespace-separated token.

    df:   DataFrame with `id` and `text` columns.
    text: token to look for; it must equal a token exactly, substrings of
          longer tokens do not count.

    Prints the positions of the matching rows and returns the list of their
    `id` values (one entry per matching row, in row order).
    """
    senders = []
    indices = []
    for i in range(len(df)):
        row = df.iloc[i]
        if text in row.text.split():
            senders.append(row.id)
            indices.append(i)
    #one pre-formatted string prints identically on Python 2 and Python 3
    print("%s  found in emails: %s" % (text, indices))
    return senders

#earlier searches on the raw (uncleaned) emails, currently disabled:
#phone1 = find_string(hrc_train, "202-647-9533")
#print Counter(phone1)

#http = find_string(hrc_train, "http")
#print Counter(http)

#senders of the cleaned emails containing the token "clinton", then a tally of emails per sender
clinton = find_string(cleaned, "clinton")
print Counter(clinton)

clinton  found in emails: [12, 13, 28, 31, 35, 37, 38, 46, 52, 64, 75, 78, 81, 83, 87, 88, 92, 96, 105, 121, 126, 129, 148, 154, 158, 164, 165, 176, 178, 209, 236, 237, 240, 248, 253, 269, 270, 273, 275, 277, 283, 299, 336, 343, 345, 373, 399, 417, 430, 438, 440, 441, 443, 456, 457, 460, 461, 489, 524, 536, 555, 557, 558, 559, 566, 567, 588, 611, 635, 658, 663, 664, 688, 689, 691, 706, 710, 712, 713, 714, 715, 719, 720, 756, 759, 770, 772, 775, 790, 791, 792, 800, 812, 815, 837, 838, 848, 850, 859, 864, 905, 934, 943, 1002, 1006, 1011, 1017, 1018, 1025, 1028, 1041, 1052, 1053, 1056, 1058, 1059, 1061, 1068, 1099, 1131, 1135, 1143, 1153, 1154, 1202, 1216, 1224, 1229, 1230, 1233, 1266, 1276, 1280, 1304, 1308, 1316, 1353, 1354, 1364, 1365, 1368, 1380, 1383, 1389, 1402, 1419, 1451, 1458, 1464, 1484, 1544, 1578, 1584, 1588, 1589, 1591, 1592, 1594, 1595, 1597, 1598, 1600, 1624, 1626, 1627, 1635, 1637, 1640, 1656, 1663, 1712, 1713, 1718, 1719, 1726, 1733, 1743, 1744, 1745, 1758, 1762, 1767, 17

[5]

In [77]:
#find most commonly used in all the emails. 
#concatenate every cleaned email (each followed by one space) into a single string
mega_string = "".join(cleaned.iloc[i].text + " " for i in range(len(cleaned)))
#count the tokens and show the 100 most frequent ones
Counter(mega_string.split()).most_common()[:100]



[(u'sent', 8624),
 (u'subject', 8223),
 (u'2010', 5019),
 (u'pm', 4570),
 (u'2009', 3928),
 (u'us', 3355),
 (u'state', 3246),
 (u'call', 3208),
 (u'messag', 3011),
 (u'said', 2941),
 (u'origin', 2589),
 (u'would', 2538),
 (u'presid', 2268),
 (u'secretari', 2237),
 (u'fw', 2162),
 (u'obama', 2043),
 (u'time', 1922),
 (u'govern', 1874),
 (u'one', 1769),
 (u'offic', 1755),
 (u'meet', 1730),
 (u'work', 1709),
 (u'new', 1700),
 (u'american', 1637),
 (u'may', 1568),
 (u'also', 1488),
 (u'say', 1438),
 (u'secur', 1430),
 (u'like', 1390),
 (u'talk', 1373),
 (u'report', 1360),
 (u'depart', 1359),
 (u'want', 1338),
 (u'clinton', 1336),
 (u'year', 1328),
 (u'nation', 1311),
 (u'peopl', 1308),
 (u'get', 1227),
 (u'need', 1211),
 (u'right', 1203),
 (u'support', 1199),
 (u'know', 1199),
 (u'countri', 1195),
 (u'go', 1168),
 (u'make', 1168),
 (u'unit', 1155),
 (u'polit', 1148),
 (u'minist', 1144),
 (u'foreign', 1139),
 (u'hous', 1131),
 (u'offici', 1128),
 (u'issu', 1126),
 (u'parti', 1116),
 (u'see'

In [23]:
#split the training data into a labels file and text files (original and cleaned); the test data is read as text only, so only its text files are written.
def write_labels(filepath,new_name):
    """Read a two-column (id, text) tab-separated file and write its id column to new_name.

    The input is read without a header row; the output is written without the index.
    """
    frame = pd.read_csv(filepath, sep="\t", header=None, names=["id", "text"])
    frame["id"].to_csv(new_name, index=False)

#write_labels("HRC_train.tsv", "training_labels.csv")

def _write_original_and_cleaned(data, new_name):
    """Write data's text column twice: untouched and after clean().

    The output paths are "original_" + new_name and "cleaned_" + new_name;
    the prefix is prepended to new_name exactly as given. Both files are
    written tab-separated and without the index.
    """
    cleaned_data = clean(data, bad_strings2)
    #clean() works on a copy, so data["text"] is still the original text here
    data["text"].to_csv("original_"+new_name, index=False,sep="\t")
    cleaned_data["text"].to_csv("cleaned_"+new_name, index=False,sep="\t")

def write_train_data(filepath, new_name):
    """Export the text of a training file (columns id, text) in original and cleaned form.

    filepath: tab-separated input file without a header row.
    new_name: base name of the two output files (see _write_original_and_cleaned).
    """
    data = pd.read_csv(filepath, sep="\t", header=None, names=["id", "text"])
    _write_original_and_cleaned(data, new_name)

def write_test_data(filepath, new_name):
    """Export the text of a test file (single column, text) in original and cleaned form.

    filepath: tab-separated input file without a header row.
    new_name: base name of the two output files (see _write_original_and_cleaned).
    """
    data = pd.read_csv(filepath, sep="\t", header=None, names=["text"])
    _write_original_and_cleaned(data, new_name)

                                                               
                                                               
                                                               
#the training export is disabled; only the test export runs here,
#writing original_hrc_test_text.tsv and cleaned_hrc_test_text.tsv
#write_train_data("HRC_train.tsv", "hrc_train_text.tsv")
write_test_data("HRC_test.tsv", "hrc_test_text.tsv")



In [25]:
#load the cleaned test text written by write_test_data; this rebinds the name `test` used in an earlier cell
test = pd.read_csv("cleaned_hrc_test_text.tsv", sep="\t", header=None,names=["text"])
#the file is already cleaned, so cleaning again is disabled
#test = clean(test, bad_strings2)
test

Unnamed: 0,text
0,sent saturday septemb subject fw nalbandian st...
1,sent monday may pm subject lavrov per discuss ...
2,sent sunday august subject strobe strobe talbo...
3,sent monday januari subject offic autorepli np...
4,sent monday march subject fw haiti recoveri bl...
5,sent thursday june pm subject fw kurt campbel ...
6,sent saturday march pm subject abu mazen yup a...
7,sent monday septemb subject fw thank sent sund...
8,sent j nstategov saturday august pm h abedin h...
9,sent friday novemb subject offic need want catch
