In [148]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

In [200]:
website_data = pd.read_table('data/links.txt', header=0, dtype='str', encoding='utf-8', sep='\t')

In [203]:
#take sample of 50 websites for testing (smaller matrix)
sample50 = website_data.sample(50)

In [204]:
def make_soup(url):
    try:
        r = requests.get(url)
        html = r.content
        data = BeautifulSoup(html, 'lxml')
    except Exception as e:
        print(e)
    else:
        return data

In [206]:
#get link bodies
websites = {}

for i,d in sample50.iterrows():
    
    print('searching: ', i)
    soup = make_soup(d['link'])
    
    if soup != None:
        
        #remove css <style> and javascript <script> elements from soup
        for script in soup(["script", "style"]):
            script.extract()
        
        #get text from soup
        text = soup.get_text()
        #split lines, strip and remove nulls (blank lines)
        lines = [line.strip() for line in text.splitlines() if line]
        #join it back
        text = '\n'.join(lines)
        
        #add link to websites
        websites[d['link']] = {}
        #add link title to link object
        websites[d['link']]['title'] = (soup.title.text if soup.title else '')
        #add link text to link object
        websites[d['link']]['text'] = text
        
print('FINISHED')

searching:  81
searching:  137
searching:  10
searching:  60
searching:  18
searching:  133
searching:  44
searching:  78
searching:  92
searching:  145
searching:  195
searching:  79
searching:  4
searching:  45
searching:  127
searching:  50
searching:  53
searching:  192
searching:  46
searching:  40
searching:  19
searching:  51
searching:  129
searching:  167
searching:  31
searching:  138
searching:  74
searching:  39
searching:  178
searching:  180
searching:  95
searching:  111
searching:  114
searching:  85
searching:  136
searching:  142
searching:  156
searching:  26
searching:  130
searching:  9
searching:  194
searching:  196
searching:  175
searching:  1
searching:  56
searching:  109
searching:  140
searching:  82
searching:  184
searching:  70
FINISHED


In [207]:
#save all text documents as list
all_text = [link['text'] for link in websites.values()]

In [208]:
count_vect = CountVectorizer()
train_matrix_cnt = count_vect.fit_transform(all_text)

In [209]:
#explore sparse matrix
print('sparse matrix shape:', train_matrix_cnt.shape)
print('size:', (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))
print('non-zeros:', train_matrix_cnt.getnnz())
print('sparsity: %.2f%%' % (100.0 * (((train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]) - train_matrix_cnt.getnnz()) / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))))
print('density: %.2f%%' % (100.0 * train_matrix_cnt.getnnz() / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1])))

sparse matrix shape: (50, 5885)
size: 294250
non-zeros: 14711
sparsity: 95.00%
density: 5.00%


In [210]:
#Fit and Transform count sparse matrix to normalized tf-idf sparse matrix
train_matrix_tfidf = TfidfTransformer().fit_transform(train_matrix_cnt)

In [211]:
#construct square doc matrix, of cosine-similarity values, using pandas dataframe
X = pd.DataFrame()

for i in range(len(websites)):
    
    complete = round((i/len(websites))*100)
    if complete % 2 == 0:
        print('%s%% complete' % complete)
    
    #compute dot product cosine similarity of indexed document with all others
    cosine_similarities = linear_kernel(train_matrix_tfidf[i], train_matrix_tfidf).flatten()
    s = pd.Series(cosine_similarities)
    df = pd.DataFrame(s, index=list(s.index)).T
    X = X.append(df, ignore_index=True)

print('100%% complete')

0% complete
2% complete
4% complete
6% complete
8% complete
10% complete
12% complete
14% complete
16% complete
18% complete
20% complete
22% complete
24% complete
26% complete
28% complete
30% complete
32% complete
34% complete
36% complete
38% complete
40% complete
42% complete
44% complete
46% complete
48% complete
50% complete
52% complete
54% complete
56% complete
58% complete
60% complete
62% complete
64% complete
66% complete
68% complete
70% complete
72% complete
74% complete
76% complete
78% complete
80% complete
82% complete
84% complete
86% complete
88% complete
90% complete
92% complete
94% complete
96% complete
98% complete
100%% complete


In [None]:
#construct document similarity matrix

In [212]:
#write matrix cols
with open('website_matrix/d3matrix_datacols.txt', 'w') as f:
    for link in websites.keys():
        f.write(link + '\n')

In [214]:
header = ['link1', 'link2', 'value']
h = pd.DataFrame(columns=header)
h.to_csv('website_matrix/d3matrix_data.tsv', mode='w', sep='\t', encoding='utf-8', index=False, header=True)

#l1, l2 are viz matrix indices
#index1, index2 are sparse matrix indices

keys = list(websites.keys())

l1 = 1
for link1 in websites.keys():
    index1 = keys.index(link1)
    l2 = 1
    for link2 in websites.keys():
        index2 = keys.index(link2)
        co = (X.iloc[index1][index2] if link1 != link2 else 0)
        r = [l1, l2, co]
        row = pd.Series(r, index=header)
        row = pd.DataFrame(r).T
        row.to_csv('website_matrix/d3matrix_data.tsv', mode='a', sep='\t', encoding='utf-8', index=False, header=False)
        l2+=1
    l1+=1