In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
website_data = pd.read_table('data/links_annotated.txt', header=0, dtype='str', encoding='utf-8', sep='\t')

In [3]:
website_data = website_data[website_data['title'].notnull()]

Unnamed: 0,group,title,link
0,KIN,Marleen Huysman,http://www.kinresearch.nl/person/marleen-huysman/
1,KIN,Bart van den Hooff,http://www.kinresearch.nl/person/bart-van-den-...
2,KIN,Han Gerrits,http://www.kinresearch.nl/person/han-gerrits/
3,KIN,Raghu Garud,http://www.kinresearch.nl/person/raghu-garud/
4,KIN,Samer Faraj,http://www.kinresearch.nl/samer-faraj/


In [99]:
def make_soup(url):
    try:
        r = requests.get(url)
        html = r.content
        data = BeautifulSoup(html, 'lxml')
    except Exception as e:
        print(e)
    else:
        return data

In [100]:
#get link bodies
websites = {}

for i,d in website_data.iterrows():
    
    print('searching: ', i)
    soup = make_soup(d['link'])
    
    if soup != None:
        
        #remove css <style> and javascript <script> elements from soup
        for script in soup(["script", "style"]):
            script.extract()
        
        #get text from soup
        text = soup.get_text()
        #split lines, strip and remove nulls (blank lines)
        lines = [line.strip() for line in text.splitlines() if line]
        #join it back
        text = '\n'.join(lines)
        
        #add link name to websites
        websites[d['title']] = {}
        #add link to object
        websites[d['title']]['link'] = d['link']
        #add link group to object
        websites[d['title']]['group'] = d['group']
        #add link text to link object
        websites[d['title']]['text'] = text
        
print('FINISHED')

searching:  0
searching:  1
searching:  2
searching:  3
searching:  4
searching:  5
searching:  6
searching:  7
searching:  8
searching:  9
searching:  10
searching:  11
searching:  12
searching:  13
searching:  14
searching:  15
searching:  16
searching:  17
searching:  18
searching:  19
searching:  20
searching:  21
searching:  22
searching:  23
searching:  24
searching:  25
searching:  26
searching:  27
searching:  28
searching:  29
searching:  30
searching:  31
searching:  32
searching:  33
searching:  34
searching:  35
searching:  36
searching:  37
searching:  38
searching:  39
searching:  40
searching:  41
searching:  42
searching:  43
searching:  44
searching:  45
searching:  46
searching:  47
searching:  48
searching:  49
searching:  50
searching:  51
searching:  52
searching:  53
searching:  54
searching:  55
searching:  56
searching:  57
searching:  58
searching:  59
searching:  60
searching:  61
searching:  62
searching:  63
searching:  64
searching:  65
searching:  67
searc

In [101]:
json.dump(websites, open('data/website_text.json','w'), default=str, sort_keys=True, indent=4)

In [7]:
with open('data/website_text.json') as f:
    websites = json.load(f)

In [8]:
#add length of website text
for site in websites:
    websites[site]['length'] = len(websites[site]['text'])

In [9]:
#filter
websites_filtered = {}

for site in websites:
    if websites[site]['length'] > 4000:
        websites_filtered.update({site:websites[site]})

In [10]:
#save all text documents as list
all_text = [link['text'] for link in websites_filtered.values()]

In [112]:
count_vect = CountVectorizer()
train_matrix_cnt = count_vect.fit_transform(all_text)

In [113]:
#explore sparse matrix
print('sparse matrix shape:', train_matrix_cnt.shape)
print('size:', (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))
print('non-zeros:', train_matrix_cnt.getnnz())
print('sparsity: %.2f%%' % (100.0 * (((train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]) - train_matrix_cnt.getnnz()) / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))))
print('density: %.2f%%' % (100.0 * train_matrix_cnt.getnnz() / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1])))

sparse matrix shape: (95, 13662)
size: 1297890
non-zeros: 45264
sparsity: 96.51%
density: 3.49%


In [114]:
#Fit and Transform count sparse matrix to normalized tf-idf sparse matrix
train_matrix_tfidf = TfidfTransformer().fit_transform(train_matrix_cnt)

In [139]:
#construct square doc matrix, of cosine-similarity values, using pandas dataframe
X = pd.DataFrame()

for i in range(len(websites_filtered)):
    
    complete = round((i/len(websites_filtered))*100)
    if complete % 2 == 0:
        print('%s%% complete' % complete)
    
    #compute dot product cosine similarity of indexed document with all others
    cosine_similarities = linear_kernel(train_matrix_tfidf[i], train_matrix_tfidf).flatten()
    s = pd.Series(cosine_similarities)
    df = pd.DataFrame(s, index=list(s.index)).T
    X = X.append(df, ignore_index=True)

print('100%% complete')

0% complete
2% complete
4% complete
6% complete
8% complete
12% complete
14% complete
16% complete
18% complete
20% complete
22% complete
24% complete
26% complete
28% complete
32% complete
34% complete
36% complete
38% complete
40% complete
42% complete
44% complete
46% complete
48% complete
52% complete
54% complete
56% complete
58% complete
60% complete
62% complete
64% complete
66% complete
68% complete
72% complete
74% complete
76% complete
78% complete
80% complete
82% complete
84% complete
86% complete
88% complete
92% complete
94% complete
96% complete
98% complete
100%% complete


In [11]:
#write matrix cols
with open('d3matrix_datacols.txt', 'w') as f:
    for i,doc in website_data.iterrows():
        if doc['title'] in websites_filtered.keys():
            f.write(doc['title'] + '\t' + doc['group'] + '\t' + doc['link'] + '\n')

In [141]:
header = ['link1', 'link2', 'value']
h = pd.DataFrame(columns=header)
h.to_csv('d3matrix_data.tsv', mode='w', sep='\t', encoding='utf-8', index=False, header=True)

#l1, l2 are viz matrix indices
#index1, index2 are matrix indices

keys = list(websites_filtered.keys())
orderedtitles = list(website_data['title'])

t1 = 1
for title1 in orderedtitles:
    if title1 in websites_filtered.keys():
        index1 = keys.index(title1)
        t2 = 1
        for title2 in orderedtitles:
            if title2 in websites_filtered.keys():
                index2 = keys.index(title2)
                co = (round(X.iloc[index1][index2], 2) if title1 != title2 else 0)
                r = [t1, t2, co]
                row = pd.Series(r, index=header)
                row = pd.DataFrame(r).T
                row.to_csv('d3matrix_data.tsv', mode='a', sep='\t', encoding='utf-8', index=False, header=False)
                t2+=1
        t1+=1