In [1]:
from pathlib import Path
import sqlite3
from collections import namedtuple

import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Lightweight record for one row of the `article` table. The SELECT below is
# built from these field names, so the record and the query cannot drift apart.
Article = namedtuple('Article', ['article_id', 'source_id', 'headline', 'excerpt', 'full_text', 'image_url', 'article_url'])

path_data = Path('sql')
path_data.mkdir(exist_ok=True)
db_file = path_data / 'db.sqlite'

# `with sqlite3.connect(...)` only wraps a transaction (commit/rollback); it
# does not close the connection. Close it explicitly once the rows are loaded.
con = sqlite3.connect(str(db_file))
try:
    cur = con.cursor()
    # Order by id so that list position is deterministic: `insert_into` later
    # maps position i in the similarity matrix to article_id i + 1.
    cur.execute(f"select {', '.join(Article._fields)} from article order by article_id")
    articles = [Article._make(row) for row in cur.fetchall()]
finally:
    con.close()

print(articles[0])



In [3]:
# TF-IDF over the full article text, one row per article.
articles_contents = [a.full_text for a in articles]
tfidf = TfidfVectorizer().fit_transform(articles_contents)

# Row-by-row dot products. With TfidfVectorizer's default L2 row normalisation
# this is the cosine similarity between articles.
# Use `@` rather than `*`: `*` means matrix product only on the legacy
# scipy.sparse matrix classes and is elementwise on sparse arrays.
pairwise_similarity = tfidf @ tfidf.T

# `.toarray()` is the documented densification call; the `.A` shorthand does
# not exist on SciPy's sparse array classes.
pairwise_similarity.toarray()[0]

array([ 1.        ,  0.23715057,  0.4271562 ,  0.48693993,  0.214574  ,
        0.23120266,  0.3273588 ,  0.38856303,  0.24804797,  0.34432177,
        0.37117256,  0.3788596 ,  0.71673963,  0.26611566,  0.25907154,
        0.26826971,  0.27121292,  0.17211084,  0.23508159,  0.27572193,
        0.23616226,  0.22149308,  0.25428791,  0.33868341,  0.22623896,
        0.22276032,  0.18370107,  0.20475804,  0.22844708,  0.21129283,
        0.36714056,  0.35443661,  0.34877499,  0.39361897,  0.23373107,
        0.62257276,  0.32282954,  0.41299974,  0.24190328,  0.2831942 ,
        0.08338115,  0.27042393,  0.22076406,  0.29011022,  0.22383537,
        0.19864927,  0.2379626 ,  0.22660397,  0.28525043,  0.33749335,
        0.38256757,  0.3471241 ,  0.23588415,  0.        ,  0.44399643,
        0.20917786,  0.3096101 ,  0.31745805,  0.15793594,  0.30392421,
        0.23983503,  0.20510225,  0.25222782,  0.22232153,  0.30276103,
        0.10881813,  0.30440694,  0.33862037,  0.31487419,  0.33

In [4]:
# Spot-check a handful of pairs, including mirrored (i, j) / (j, i) entries.
# Index the sparse result directly: going through `.A` would rebuild the whole
# dense matrix once per printed value.
for row in (0, 12, 35):
    for col in (0, 12, 35, 36):
        if row != col:
            print(pairwise_similarity[row, col])

0.716739626591
0.622572762128
0.32282954171
0.716739626591
0.7324392116
0.364319882804
0.622572762128
0.7324392116
0.326540979509


In [5]:
# Statement that back-fills the `similarities` table: for every pair of
# articles with a1.article_id > a2.article_id (so each unordered pair appears
# once, larger id first, and never an article paired with itself) it inserts a
# row with permid, sklearn_headline and sklearn_text all set to 0, unless a
# row for that (article_id_1, article_id_2) pair is already present.
ensure_pairs_exist_sql = """
INSERT INTO similarities (article_id_1, article_id_2, permid, sklearn_headline, sklearn_text)
  SELECT
    a1.article_id,
    a2.article_id,
    0,
    0,
    0
  FROM article a1
    INNER JOIN article a2
      ON a1.article_id > a2.article_id
  WHERE NOT EXISTS(
      SELECT *
      FROM similarities s
      WHERE s.article_id_1 == a1.article_id
            AND s.article_id_2 == a2.article_id
  );
"""

def clear_table(cur):
    """Remove every row from the ``similarities`` table via cursor ``cur``.

    The caller is responsible for committing the change.
    """
    delete_all_rows = 'DELETE FROM similarities'
    cur.execute(delete_all_rows)

def update_db(cur, id_1, id_2, value, column):
    """Store one score for an existing article pair in ``similarities``.

    Args:
        cur: sqlite3 cursor on a database that contains ``similarities``.
        id_1: value matched against ``article_id_1``.
        id_2: value matched against ``article_id_2``.
        value: score written into ``column``.
        column: name of the ``similarities`` column to overwrite.

    Raises:
        ValueError: if ``column`` is not a plain identifier.
    """
    # Column names cannot be bound as `?` parameters, so the name has to be
    # spliced into the statement text. Only accept a bare identifier (and
    # quote it) so the argument can never carry additional SQL.
    if not (isinstance(column, str) and column.isidentifier()):
        raise ValueError(f'invalid column name: {column!r}')
    cur.execute(f'UPDATE similarities SET "{column}" = ? WHERE article_id_1 = ? AND article_id_2 = ?',
                (value, id_1, id_2))

def insert_into(column, matrix, clear=False, article_ids=None):
    """Write the lower triangle of a pairwise score matrix into ``similarities``.

    Opens ``db_file``, optionally empties ``similarities``, runs
    ``ensure_pairs_exist_sql`` so a row exists for every article pair, then
    stores ``matrix[i, j]`` for every ``i > j`` in ``column`` of the row that
    belongs to the two corresponding articles.

    Args:
        column: name of the ``similarities`` column to fill.
        matrix: 2-D array-like of scores indexed by article position.
        clear: when true, delete all rows of ``similarities`` first.
        article_ids: optional sequence giving the ``article_id`` for each
            matrix index. When omitted, index ``i`` is mapped to
            ``article_id == i + 1``, which is only right if the ids are the
            contiguous range 1..n in the same order as the matrix rows.

    Raises:
        ValueError: if ``article_ids`` is given and its length differs from
            the number of matrix rows.
    """
    scores = np.asarray(matrix)
    if article_ids is None:
        ids = range(1, scores.shape[0] + 1)
    else:
        ids = list(article_ids)
        if len(ids) != scores.shape[0]:
            raise ValueError(
                f'article_ids has {len(ids)} entries but matrix has {scores.shape[0]} rows')

    # The sqlite3 connection context manager only commits or rolls back; it
    # never closes the connection, so manage the lifetime explicitly.
    con = sqlite3.connect(str(db_file))
    try:
        cur = con.cursor()

        if clear:
            clear_table(cur)
            con.commit()

        cur.execute(ensure_pairs_exist_sql)
        con.commit()

        # Plain loop: the updates are side effects, so there is no point in
        # collecting their (None) results into a list.
        for (row, col), value in np.ndenumerate(scores):
            if row > col:
                first, second = ids[row], ids[col]
                # Pair rows are keyed with the larger article id first.
                if first < second:
                    first, second = second, first
                update_db(cur, first, second, value, column)
        con.commit()
    finally:
        con.close()

# Persist the full-text scores, emptying `similarities` beforehand.
# `.toarray()` replaces the legacy `.A` shorthand, and `clear` is passed by
# keyword so the call reads without looking up the signature.
insert_into('sklearn_text', pairwise_similarity.toarray(), clear=True)

In [6]:
# TF-IDF over the headlines only, one row per article.
# NOTE: this rebinds `tfidf` and `pairwise_similarity`, replacing the
# full-text results computed earlier in the notebook; run cells in order.
articles_hl = [a.headline for a in articles]
tfidf = TfidfVectorizer().fit_transform(articles_hl)

# `@` is the matrix product for both legacy sparse matrices and sparse arrays,
# whereas `*` is elementwise on sparse arrays.
pairwise_similarity = tfidf @ tfidf.T

# `.toarray()` is the documented densification call; the `.A` shorthand does
# not exist on SciPy's sparse array classes.
pairwise_similarity.toarray()[0]

array([ 1.        ,  0.        ,  0.18694435,  0.02970191,  0.        ,
        0.02671833,  0.04679032,  0.0573667 ,  0.        ,  0.05016216,
        0.        ,  0.        ,  0.16342996,  0.02622172,  0.02822841,
        0.        ,  0.        ,  0.02800688,  0.05094296,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.08393197,  0.02859344,  0.        ,  0.        ,
        0.09117379,  0.0869663 ,  0.        ,  0.        ,  0.        ,
        0.28524332,  0.04164032,  0.02051558,  0.        ,  0.0275247 ,
        0.        ,  0.02814507,  0.02402709,  0.        ,  0.        ,
        0.        ,  0.        ,  0.04788964,  0.02881883,  0.        ,
        0.04044454,  0.        ,  0.        ,  0.05136635,  0.07496241,
        0.        ,  0.04610664,  0.02890247,  0.        ,  0.02668986,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.04885834,  0.        ,  0.02

In [7]:
# Spot-check a handful of pairs, including mirrored (i, j) / (j, i) entries.
# Index the sparse result directly: going through `.A` would rebuild the whole
# dense matrix once per printed value.
for row in (0, 12, 35):
    for col in (0, 12, 35, 36):
        if row != col:
            print(pairwise_similarity[row, col])

0.163429955153
0.285243324365
0.0416403215359
0.163429955153
0.228629149039
0.0
0.285243324365
0.228629149039
0.0363462112116


In [8]:
# Persist the headline scores next to the existing rows (no clearing).
# `.toarray()` replaces the legacy `.A` shorthand for densifying.
insert_into('sklearn_headline', pairwise_similarity.toarray())