In [None]:
from pathlib import Path
import sqlite3
from collections import namedtuple

import numpy as np
import sklearn

In [None]:
# Record type mirroring the columns selected from the `article` table below.
Article = namedtuple(
    'Article',
    ['article_id', 'source_id', 'headline', 'excerpt', 'full_text', 'image_url', 'article_url'],
)

path_data = Path('sql')
path_data.mkdir(exist_ok=True)
db_file = path_data / 'db.sqlite'

# `with sqlite3.connect(...)` only scopes a transaction; it does not close the
# connection. Close it explicitly so the file handle is released.
con = sqlite3.connect(str(db_file))
try:
    cur = con.cursor()
    # ORDER BY makes the list order deterministic. Later cells use the position
    # in `articles` as the row/column index of the similarity matrix.
    cur.execute(
        'select article_id, source_id, headline, excerpt, full_text, image_url, article_url '
        'from article order by article_id'
    )
    articles = [Article(*row) for row in cur.fetchall()]
finally:
    con.close()

# Show one record as a sanity check; an empty table must not raise IndexError here.
if articles:
    print(articles[0])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One document per article, in the same order as `articles`, so row i of the
# TF-IDF matrix corresponds to articles[i].
articles_contents = [article.full_text for article in articles]

# Default-configured vectorizer; fit the vocabulary and transform in one pass.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(articles_contents)

In [None]:
# Document-by-document similarity: entry (i, j) is the dot product of the
# TF-IDF rows of articles i and j.
# `@` is always a matrix product. `*` is only a matrix product for the legacy
# scipy.sparse *matrix* classes and is element-wise for sparse *arrays*.
pairwise_similarity = tfidf @ tfidf.T

In [None]:
# Similarities of article 0 against every article, as a 1-D dense array.
# Only the requested row is densified (not the whole n x n matrix), and
# `.toarray()` replaces the deprecated sparse `.A` attribute.
pairwise_similarity[0].toarray().ravel()

In [None]:
# Spot-check a handful of article pairs (matrix indices, not article ids).
# Scalar indexing on the sparse matrix avoids densifying the full matrix for
# every lookup and does not rely on the deprecated `.A` attribute.
probe_rows = (0, 12, 35)
probe_cols = (0, 12, 35, 36)
for row in probe_rows:
    for col in probe_cols:
        if col != row:  # skip the self-similarity entry on the diagonal
            print(pairwise_similarity[row, col])

In [None]:
# SQL that seeds the `similarities` table with one row per unordered article
# pair. The self-join condition `a1.article_id > a2.article_id` means each pair
# is stored once, with the larger id in `article_id_1`. New rows get `permid`
# and `sklearn` initialised to 0. The NOT EXISTS guard skips pairs that already
# have a row, so running the statement repeatedly does not create duplicates.
ensure_pairs_exist_sql = """
INSERT INTO similarities (article_id_1, article_id_2, permid, sklearn)
  SELECT
    a1.article_id,
    a2.article_id,
    0,
    0
  FROM article a1
    INNER JOIN article a2
      ON a1.article_id > a2.article_id
  WHERE NOT EXISTS(
      SELECT *
      FROM similarities s
      WHERE s.article_id_1 == a1.article_id
            AND s.article_id_2 == a2.article_id
  );
"""

def clear_table(cur):
    """Delete every row from the `similarities` table.

    Args:
        cur: An open database cursor. The statement is executed on it but not
            committed; committing is left to the caller.
    """
    wipe_statement = 'DELETE FROM similarities'
    cur.execute(wipe_statement)

def update_db(cur, id_1, id_2, value):
    """Store a similarity score in the `sklearn` column for one article pair.

    Args:
        cur: An open database cursor; the update is executed but not committed.
        id_1: Value matched against `article_id_1`.
        id_2: Value matched against `article_id_2`.
        value: Score written to the `sklearn` column of the matching row(s).
    """
    statement = 'UPDATE similarities SET sklearn = ? WHERE article_id_1 = ? AND article_id_2 = ?'
    bindings = (value, id_1, id_2)
    cur.execute(statement, bindings)

# Densify once with `.toarray()`; the sparse `.A` shortcut is deprecated.
similarity_dense = pairwise_similarity.toarray()

# Row/column i of the matrix corresponds to articles[i]. Map matrix indices to
# the real primary keys rather than assuming ids are exactly 1..N in list order.
article_ids = [article.article_id for article in articles]
assert similarity_dense.shape == (len(article_ids), len(article_ids)), \
    'similarity matrix does not match the loaded articles'

con = sqlite3.connect(str(db_file))
try:
    # `with con` scopes a single transaction: commit on success, roll back on
    # error. A failure part-way through therefore cannot leave the table
    # emptied or half-filled.
    with con:
        cur = con.cursor()

        # NOTE(review): this wipes every row, including any existing `permid`
        # values, before the pairs are re-seeded with zeros - confirm that is
        # intended, given that the seeding SQL guards against existing rows.
        clear_table(cur)
        cur.execute(ensure_pairs_exist_sql)

        # Strict lower triangle: each unordered pair exactly once, no diagonal.
        lower_rows, lower_cols = np.tril_indices(len(article_ids), k=-1)
        for row, col in zip(lower_rows, lower_cols):
            id_a, id_b = article_ids[row], article_ids[col]
            # Rows are keyed with the larger id in article_id_1 (see the
            # seeding SQL); the matrix is symmetric so either entry works.
            update_db(cur, max(id_a, id_b), min(id_a, id_b),
                      float(similarity_dense[row, col]))
finally:
    # The sqlite3 connection context manager never closes the connection.
    con.close()