In [5]:
from pathlib import Path
import sqlite3
from collections import namedtuple

import numpy as np
import sklearn
import pandas as pd
from pandas import Int64Index

In [6]:
path_data = Path('sql')
path_data.mkdir(exist_ok=True)
db_file = path_data / 'db.sqlite'
with sqlite3.connect(str(db_file)) as con:
    df = pd.read_sql(con=con, sql='select s.*, a1.source_id as source_id_1, a2.source_id as source_id_2 from similarities s inner join article a1 on s.article_id_1 = a1.article_id inner join article a2 on s.article_id_2 = a2.article_id')
df.head(5)

Unnamed: 0,article_id_1,article_id_2,permid,sklearn,sklearn1,sklearn2,sklearn3,sklearn4,source_id_1,source_id_2
0,2,1,0.018519,0.237151,0.056474,0.0,0.0,0,1,1
1,3,1,0.018182,0.427156,0.12682,0.186944,0.166858,0,1,1
2,3,2,0.020619,0.264378,0.085694,0.0,0.0,0,1,1
3,4,1,0.026016,0.48694,0.139258,0.029702,0.0,0,1,1
4,4,2,0.011382,0.344696,0.116431,0.0,0.0,0,1,1


In [7]:
def avg(row):
    return (row['sklearn'] + row['sklearn1'] + row['sklearn2'] + row['sklearn3']) / 4

def sk4(row):
    return (row['permid'] * 3 + row['sklearn'] + row['sklearn2']) / 3

def prepare_df(df, sel):
    df['sklearn_avg'] = df.apply(avg, axis=1)
    df['sklearn4'] = df.apply(sk4, axis=1)
    df = df[df['sklearn' + sel] > df['sklearn' + sel].mean()]
    df = df.sort_values('sklearn' + sel, ascending=False)
    df = df[df.source_id_1 != df.source_id_2]
    df = df.reset_index(drop=True)
    return df

def get_cliques(df):
    cliques = []
    while len(set(df.article_id_1.values + df.article_id_2.values)) >= 4:# and len(cliques) < 10:
        c = [None, None, None, None]

        first_row = df.iloc[0]
        df = df.drop(df.head(1).index)

        c[int(first_row.source_id_1) - 1] = int(first_row.article_id_1)
        c[int(first_row.source_id_2) - 1] = int(first_row.article_id_2)
        c
        index = 0

        while (c[0] is None or c[1] is None or c[2] is None or c[3] is None) and len(df) > (index + 1):
            df = df.reset_index(drop=True)
#             print(len(df), index)
            new_row = df.iloc[index]
            if c[int(new_row.source_id_1) - 1] is None and c[int(new_row.source_id_2) - 1] is not None:
                c[int(new_row.source_id_1) - 1] = int(new_row.article_id_1)
                df = df.drop(Int64Index([index+1], dtype='int64'))
            elif c[int(new_row.source_id_1) - 1] is not None and c[int(new_row.source_id_2) - 1] is None:
                c[int(new_row.source_id_2) - 1] = int(new_row.article_id_2)
                df = df.drop(Int64Index([index+1], dtype='int64'))
            else:
                index += 1
        for i in range(4):
            df = df[(df.article_id_1 != c[i]) & (df.article_id_2 != c[i])]
        cliques.append(c)

    print(len(cliques))
    print(cliques)
    return cliques

def write_db(sel, cliques):
    with sqlite3.connect(str(db_file)) as con:
        cur = con.cursor()
        cur.execute('DELETE FROM matching' + sel)
        [cur.execute(f'INSERT INTO matching{sel} (article_id, group_id) VALUES (?, ?)', (a, i))
         for i, c in enumerate(cliques)
         for a in c
         if a is not None]
        con.commit()
        
def do_it(df, sel):
    df = prepare_df(df, sel)
    cliques = get_cliques(df)
    write_db(sel, cliques)
    print('.', end='')

# cols = ['1', '2', '3', '_avg']
# [do_it(df, c) for c in cols]

In [8]:
do_it(df, '4')

24
[[1, 13, 22, 36], [79, 11, 24, 31], [89, 19, 66, 37], [86, 98, 63, 107], [90, 100, 25, 73], [8, 101, 60, 105], [10, 17, 72, 38], [4, 122, 21, 32], [3, 69, 29, 76], [110, 15, 26, 74], [7, 12, 67, 78], [9, 52, 28, 39], [96, 121, 23, 34], [93, 55, 125, 33], [114, 124, 104, 106], [50, 102, 88, 35], [91, 99, 30, 75], [97, 68, 65, 108], [42, 70, 27, 109], [84, 123, 71, 40], [113, 51, 62, 77], [94, 58, 64, None], [45, 14, 103, None], [111, 120, None, None]]
.

In [10]:
def simply_permid(df):
    df = prepare_df(df, '')
    df = df[df['permid'] > df['permid'].mean()]
    cliques = get_cliques(df)
    write_db('', cliques)
    print('.', end='')

simply_permid(df)

19
[[1, 13, 22, 36], [4, 19, 72, 37], [89, 11, 24, 31], [79, 100, 63, 77], [86, 52, 60, 73], [3, 122, 67, 32], [10, 124, 21, 38], [8, 121, 26, 107], [90, 12, 23, 105], [93, 69, 65, 76], [114, 55, 29, 34], [7, 51, 125, 33], [49, 102, 61, 75], [96, 99, 64, 39], [9, 58, 103, 78], [97, 68, None, 108], [95, 101, None, 109], [92, 57, None, None], [42, 98, None, None]]
.

In [None]:
cliques = []

while len(set(df.article_id_1.values + df.article_id_2.values)) >= 4:
    c = [None, None, None, None]
    first_row = df.iloc[0]
    # print(first_row)
    df = df.drop(df.head(1).index)

    id_1 = int(first_row.article_id_1)
    id_2 = int(first_row.article_id_2)
    c[int(first_row.source_id_1) - 1] = id_1
    c[int(first_row.source_id_2) - 1] = id_2
    print(c)
    # print(id_1)
    # print(id_2)


    df = df[~((df.article_id_1 == id_1) & (df.article_id_2 == id_2))]
    df = df[~((df.article_id_1 == id_2) & (df.article_id_2 == id_1))]


    df_tmp = df[(df.article_id_1 == id_1) | (df.article_id_2 == id_1) | 
                (df.article_id_1 == id_2) | (df.article_id_2 == id_2)]
    second_row = df_tmp.iloc[0]
    # print(second_row)
    id_3 = int(second_row.article_id_1)
    id_4 = int(second_row.article_id_2)
    c[int(second_row.source_id_1) - 1] = id_3
    c[int(second_row.source_id_2) - 1] = id_4
    print(c)
    # print(id_3)
    # print(id_4)

    if id_3 == id_1 or id_3 == id_2:
        df = df[~((df.article_id_1 == id_1) & (df.article_id_2 == id_4))]
        df = df[~((df.article_id_1 == id_2) & (df.article_id_2 == id_4))]
        df = df[~((df.article_id_1 == id_4) & (df.article_id_2 == id_1))]
        df = df[~((df.article_id_1 == id_4) & (df.article_id_2 == id_2))]
    else:
        df = df[~((df.article_id_1 == id_1) & (df.article_id_2 == id_3))]
        df = df[~((df.article_id_1 == id_2) & (df.article_id_2 == id_3))]
        df = df[~((df.article_id_1 == id_3) & (df.article_id_2 == id_1))]
        df = df[~((df.article_id_1 == id_3) & (df.article_id_2 == id_2))]

    df_tmp = df[(df.article_id_1 == id_1) | (df.article_id_2 == id_1) | 
                (df.article_id_1 == id_2) | (df.article_id_2 == id_2) |
                (df.article_id_1 == id_3) | (df.article_id_2 == id_3) | 
                (df.article_id_1 == id_4) | (df.article_id_2 == id_4)]    
    third_row = df_tmp.iloc[0]
    print(third_row)
    id_5 = int(third_row.article_id_1)
    id_6 = int(third_row.article_id_2)
    c[int(third_row.source_id_1) - 1] = id_5
    c[int(third_row.source_id_2) - 1] = id_6
    print(c)
    # print(id_5)
    # print(id_6)


    if id_5 == id_1 or id_5 == id_2 or id_5 == id_3 or id_5 == id_4:
        df = df[~((df.article_id_1 == id_1) & (df.article_id_2 == id_6))]
        df = df[~((df.article_id_1 == id_2) & (df.article_id_2 == id_6))]
        df = df[~((df.article_id_1 == id_3) & (df.article_id_2 == id_6))]
        df = df[~((df.article_id_1 == id_4) & (df.article_id_2 == id_6))]
        df = df[~((df.article_id_1 == id_6) & (df.article_id_2 == id_1))]
        df = df[~((df.article_id_1 == id_6) & (df.article_id_2 == id_2))]
        df = df[~((df.article_id_1 == id_6) & (df.article_id_2 == id_3))]
        df = df[~((df.article_id_1 == id_6) & (df.article_id_2 == id_4))]
    else:
        df = df[~((df.article_id_1 == id_1) & (df.article_id_2 == id_5))]
        df = df[~((df.article_id_1 == id_2) & (df.article_id_2 == id_5))]
        df = df[~((df.article_id_1 == id_3) & (df.article_id_2 == id_5))]
        df = df[~((df.article_id_1 == id_4) & (df.article_id_2 == id_5))]
        df = df[~((df.article_id_1 == id_5) & (df.article_id_2 == id_1))]
        df = df[~((df.article_id_1 == id_5) & (df.article_id_2 == id_2))]
        df = df[~((df.article_id_1 == id_5) & (df.article_id_2 == id_3))]
        df = df[~((df.article_id_1 == id_5) & (df.article_id_2 == id_4))]
    print(len(df))
    print(c)
    cliques.append(c)
    break

print(cliques)
    

In [None]:
cliques = []

while len(set(df.article_id_1.values + df.article_id_2.values)) >= 4 and len(cliques) < 10:
    c = [None, None, None, None]
    first_row = df.iloc[0]
#     print(first_row)
    df = df.drop(df.head(1).index)

    a_id_1 = int(first_row.article_id_1)
    a_id_2 = int(first_row.article_id_2)
    s_id_1 = int(first_row.source_id_1)
    s_id_2 = int(first_row.source_id_2)
    c[s_id_1 - 1] = a_id_1
    c[s_id_2 - 1] = a_id_2
#     print(c)
    
    df = df[~((df.source_id_1 == s_id_1) & (df.source_id_2 == s_id_2))]
    df = df[~((df.source_id_1 == s_id_2) & (df.source_id_2 == s_id_1))]
    
    df_tmp = df[(df.article_id_1 == a_id_1) | (df.article_id_1 == a_id_2) |
                (df.article_id_2 == a_id_1) | (df.article_id_2 == a_id_2)]
    
    second_row = df_tmp.iloc[0]
#     print(second_row)
    a_id_3 = int(second_row.article_id_1)
    a_id_4 = int(second_row.article_id_2)
    s_id_3 = int(second_row.source_id_1)
    s_id_4 = int(second_row.source_id_2)
    c[s_id_3 - 1] = a_id_3
    c[s_id_4 - 1] = a_id_4
    
    s_id_new = s_id_4 if (s_id_3 == s_id_1 or s_id_3 == s_id_2) else s_id_3
    a_id_new = a_id_4 if (s_id_3 == s_id_1 or s_id_3 == s_id_2) else a_id_3
    df = df[~((df.source_id_1 == s_id_1) & (df.source_id_2 == s_id_new))]
    df = df[~((df.source_id_1 == s_id_new) & (df.source_id_2 == s_id_1))]
    df = df[~((df.source_id_1 == s_id_2) & (df.source_id_2 == s_id_new))]
    df = df[~((df.source_id_1 == s_id_new) & (df.source_id_2 == s_id_2))]
    
    df_tmp = df[(df.article_id_1 == a_id_1) | (df.article_id_1 == a_id_2) | (df.article_id_1 == a_id_new) |
                (df.article_id_2 == a_id_1) | (df.article_id_2 == a_id_2) | (df.article_id_2 == a_id_new)]
    third_row = df_tmp.iloc[0]
#     print(third_row)
    a_id_5 = int(third_row.article_id_1)
    a_id_6 = int(third_row.article_id_2)
    s_id_5 = int(third_row.source_id_1)
    s_id_6 = int(third_row.source_id_2)
    c[s_id_5 - 1] = a_id_5
    c[s_id_6 - 1] = a_id_6
    
    print(len(df))
#     print(c)
    cliques.append(c)
#     break

print(cliques)
    