In [1]:
import theseus
import os
import hydraseq as hd

In [2]:
! mkdir data
! mv *.txt data/
! ls data/

mkdir: data: File exists
mv: rename *.txt to data/*.txt: No such file or directory
[32mbackground.txt[m[m [32measy_ham.txt[m[m   [32mhard_ham.txt[m[m   [32mspam.txt[m[m


In [3]:
def load_documents(path_file):
    """Read a corpus file and return one token list per line.

    Every line of ``path_file`` is decoded as UTF-8, lower-cased, stripped and
    split on whitespace; the first token of each line is then dropped
    (presumably a label/id column -- TODO confirm against the data files).
    Blank lines therefore yield an empty token list.

    Lines that are not valid UTF-8 are skipped (best-effort loading). Only
    ``UnicodeDecodeError`` is suppressed, so unrelated failures and
    ``KeyboardInterrupt`` are no longer silently swallowed.

    Args:
        path_file: Path of the text file to read.

    Returns:
        list[list[str]]: one list of tokens per successfully decoded line.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    documents = []
    with open(path_file, 'rb') as source:
        for raw_line in source:
            try:
                text = raw_line.decode('utf-8')
            except UnicodeDecodeError:
                # Undecodable line: skip it and keep loading the rest.
                continue
            documents.append(text.lower().strip().split()[1:])
    return documents

! ls data/
spam = load_documents('data/spam.txt')
easy_ham = load_documents('data/easy_ham.txt')
hard_ham = load_documents('data/hard_ham.txt')
background = spam + easy_ham + hard_ham

[32mbackground.txt[m[m [32measy_ham.txt[m[m   [32mhard_ham.txt[m[m   [32mspam.txt[m[m


In [4]:
# Build one theseus Node per corpus (same construction order as the corpora
# are listed: background, spam, easy ham, hard ham).
n_background, n_spam, n_easy_ham, n_hard_ham = map(
    theseus.Node, (background, spam, easy_ham, hard_ham)
)

In [5]:
# Optional visual comparison, left disabled:
# theseus.node.visualize(n_spam, n_background, axis_lims=(0.0, 0.3), magnification=10.0)
idx = 6  # index of the spam document to inspect
x, y, keys = theseus.node.create_xy_table(
    theseus.Node(spam[idx:idx+1]),
    n_spam,
    cutoff1=100,
    cutoff2=100,
    ratio=0.20,
)
# Show only the keys whose second (y) value is non-zero.
for key, point in zip(keys, zip(x, y)):
    if point[1] != 0:
        print((key, point))

('fortune', (0.14285714285714285, 0.0021116138763197585))
('500', (0.14285714285714285, 0.0021116138763197585))
('company', (0.14285714285714285, 0.0018099547511312218))
('at', (0.14285714285714285, 0.005429864253393665))
('home', (0.14285714285714285, 0.0048265460030165915))
('reps.', (0.14285714285714285, 0.0018099547511312218))


In [6]:
def count_hits(group, target, friendly_name, limit, ratio):
    """Print the share of documents in ``group`` that count as hits against ``target``.

    Each document of ``group`` is wrapped in its own single-document
    ``theseus.Node`` and compared with ``target`` through
    ``theseus.node.create_xy_table``. A document is a hit when more than
    ``limit`` keys of the resulting table have a non-zero ``y`` value
    (presumably the key's weight in ``target`` -- TODO confirm against theseus).

    Args:
        group: Sequence of documents (token lists) to score.
        target: ``theseus.Node`` the documents are compared against.
        friendly_name: Label used in the printed summary line.
        limit: A document is a hit only with strictly more than this many
            non-zero keys.
        ratio: Passed through to ``create_xy_table``.

    Returns:
        None. The summary is printed; an empty ``group`` reports 0.0 percent.
    """
    lgroup = len(group)
    hits = 0
    # Iterate over the group being scored. The previous version looped over
    # len(background), a global, which fed empty out-of-range slices to
    # theseus and made the function depend on notebook state.
    for idx in range(lgroup):
        x, y, keys = theseus.node.create_xy_table(
            theseus.Node(group[idx:idx+1]), target, cutoff1=100, cutoff2=100, ratio=ratio
        )
        nonzero = sum(1 for _key, (_x_val, y_val) in zip(keys, zip(x, y)) if y_val != 0)
        if nonzero > limit:
            hits += 1
    # Guard the empty-group case instead of raising ZeroDivisionError.
    percent = float(hits) / lgroup * 100 if lgroup else 0.0
    print(percent, " percent predicted are ", friendly_name, " has ", hits, " hits out of ", lgroup)

# Scoring parameters shared by all three runs.
limit, ratio = 4, 0.4
target = n_spam
# Score each corpus against the spam node, in the same order as before.
for docs, label in ((spam, "spam"), (hard_ham, "hard_ham"), (easy_ham, "easy_ham")):
    count_hits(docs, target, label, limit, ratio)

17.06827309236948  percent predicted are  spam  has  85  hits out of  498
1.76678445229682  percent predicted are  hard_ham  has  5  hits out of  283
1.9700839109813937  percent predicted are  easy_ham  has  54  hits out of  2741


In [63]:
# Feed every background document into a fresh Hydraseq as a space-joined string.
hdr = hd.Hydraseq('')
for text in map(" ".join, background):
    hdr.full_insert(text)
# Display the number of columns held after all inserts.
len(hdr.columns)

50831

In [64]:
# Print the third background document, then for each of its convolutions
# list the '_'-prefixed entries with their sequence nodes.
sample_text = " ".join(background[2])
print(sample_text)
print('---------------------------------------------------')
for convo in hdr.convolutions(sample_text):
    for column_key in filter(lambda cons: cons.startswith('_'), convo[2]):
        print(column_key, "\t", hdr.columns[column_key][0].get_sequence_nodes())

guaranteed to lose 10-12 lbs in 30 days 11.150
---------------------------------------------------
_49 	 [[guaranteed]]
_50 	 [[guaranteed], [to]]
_65 	 [[to]]
_51 	 [[guaranteed], [to], [lose]]
_66 	 [[to], [lose]]
_79 	 [[lose]]
_52 	 [[guaranteed], [to], [lose], [10-12]]
_67 	 [[to], [lose], [10-12]]
_80 	 [[lose], [10-12]]
_91 	 [[10-12]]
_53 	 [[guaranteed], [to], [lose], [10-12], [lbs]]
_68 	 [[to], [lose], [10-12], [lbs]]
_81 	 [[lose], [10-12], [lbs]]
_92 	 [[10-12], [lbs]]
_101 	 [[lbs]]
_54 	 [[guaranteed], [to], [lose], [10-12], [lbs], [in]]
_69 	 [[to], [lose], [10-12], [lbs], [in]]
_82 	 [[lose], [10-12], [lbs], [in]]
_93 	 [[10-12], [lbs], [in]]
_102 	 [[lbs], [in]]
_109 	 [[in]]
_55 	 [[guaranteed], [to], [lose], [10-12], [lbs], [in], [30]]
_70 	 [[to], [lose], [10-12], [lbs], [in], [30]]
_83 	 [[lose], [10-12], [lbs], [in], [30]]
_94 	 [[10-12], [lbs], [in], [30]]
_103 	 [[lbs], [in], [30]]
_110 	 [[in], [30]]
_115 	 [[30]]
_56 	 [[guaranteed], [to], [lose], [10-12], [l

In [62]:
# Ad-hoc inspection: display the `nexts` of the first `lasts` entry of the
# first item stored under column key '_35'.
# NOTE(review): this cell carries execution count 62 although it sits after the
# cells numbered 63/64 that build `hdr`, so it was run out of order; confirm
# the output still holds after Restart Kernel -> Run All.
hdr.columns['_35'][0].lasts[0].nexts

[_1265, 10-12, _35]