In [2]:
import logging
import os
import time
from collections import Counter

import pandas as pd
import psycopg2


# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Logger named after this module. NOTE(review): the functions visible in this
# notebook log through the `logging.*` module-level helpers (root logger), not
# through this object.
logger = logging.getLogger(__name__)
 

def connect_postgres(retries=5, delay=3, port=5432):
    """Open a psycopg2 connection to PostgreSQL, retrying on failure.

    Connection settings are read from the environment variables
    ``POSTGRESQL_DB``, ``POSTGRESQL_USER``, ``POSTGRESQL_PASSWORD`` and
    ``POSTGRESQL_SERVER``.

    Parameters
    ----------
    retries : int, default 5
        Maximum number of connection attempts (must be >= 1).
    delay : int or float, default 3
        Seconds to wait between two attempts. No wait happens after the
        last attempt.
    port : int, default 5432
        TCP port of the PostgreSQL server.

    Returns
    -------
    The connection object returned by ``psycopg2.connect``.

    Raises
    ------
    KeyError
        If one of the required environment variables is missing.
    ValueError
        If ``retries`` is smaller than 1.
    Exception
        The error raised by the last failed attempt, once every attempt
        has failed.
    """
    # Imported locally on purpose: a later cell runs `from time import time`,
    # which rebinds the notebook-level name `time` to a function and would
    # make `time.sleep(...)` fail here.
    from time import sleep

    if retries < 1:
        raise ValueError("retries must be >= 1")

    database = os.environ['POSTGRESQL_DB']
    user = os.environ['POSTGRESQL_USER']
    password = os.environ['POSTGRESQL_PASSWORD']
    host = os.environ['POSTGRESQL_SERVER']

    last_error = None

    for attempt in range(1, retries + 1):
        try:
            conn = psycopg2.connect(
                database=database, user=user, password=password, host=host, port=port)
        except Exception as e:
            last_error = e
            if attempt < retries:
                logging.warning(
                    "Error connecting to postgres, will retry in %s sec: %s", delay, e)
                sleep(delay)
            else:
                # Final attempt: nothing left to retry, so do not sleep.
                logging.warning("Error connecting to postgres: %s", e)
        else:
            logging.info("Connected...")
            logging.info("Everything goes well from Postgres, you're a fu*** pro...")
            return conn

    logging.error("Unable to connect to  %s DB", database)
    raise last_error


In [3]:
# NOTE: despite its name, `engine` holds the connection object returned by
# connect_postgres() (the result of psycopg2.connect), not a SQLAlchemy engine.
engine = connect_postgres()

INFO:root:Connected...
INFO:root:Everything goes well from Postgres, you're a fu*** pro...


In [17]:
# Load up to 100000 (code, trans_date_id, copy id) rows into `df`.
# The query joins copytransaction, copy, student, title and transactiontype
# through the equality conditions in the WHERE clause, keeps only rows with
# trans_location_code_id = 5 and trans_type_code = 'ISS', and groups by all
# three selected columns, which collapses duplicate rows.
# NOTE(review): presumably 'ISS' marks issue/loan transactions and `code`
# identifies the borrower — confirm against the schema.
# NOTE(review): LIMIT is applied without ORDER BY, so which rows are returned
# is not guaranteed to be stable between runs.
df = pd.read_sql("""
    SELECT code, trans_date_id, copy.id as items
    FROM public.copytransaction, public.copy, public.student, public.title, public.transactiontype
    WHERE 
        trans_borrower_code = student.id and
        trans_copy_code_id = copy.id and 
        trans_tittle_code_id = title.id and 
        trans_type_id = transactiontype.id and 
        trans_location_code_id = 5 and 
        trans_type_code = 'ISS'
    GROUP BY code, trans_date_id, copy.id LIMIT 100000
""", con=engine)

In [24]:
# Downcast every int64 column to int32, leaving the other columns untouched.
df = df.astype(
    dict.fromkeys(df.select_dtypes('int64').columns, 'int32')
)
# Work on an independent copy so the loaded frame stays available as-is.
df_apriori = df.copy()

In [25]:
# Build one comma-separated basket string per (code, trans_date_id) pair.
# Item ids are stringified first, every row of a group then receives the
# joined basket, and finally the now-identical rows are collapsed.
df_apriori['items'] = df_apriori['items'].map(str)
df_apriori['items'] = (
    df_apriori[['code', 'trans_date_id', 'items']]
    .groupby(['code', 'trans_date_id'])['items']
    .transform(','.join)
)
df_apriori = df_apriori[['code', 'trans_date_id', 'items']].drop_duplicates()
df_apriori

Unnamed: 0,code,trans_date_id,items
0,198224610,20000928,"547476,547477"
2,198224610,20001014,547477
3,198224610,20001117,17499
4,198224610,20001120,"172235,175765,232177,343588,348365,447446,534944"
11,198224610,20001207,"16909,494604,516699"
...,...,...,...
99992,199412636,19991108,"38424,176272"
99994,199412636,19991122,"38424,176272"
99996,199412662,19970407,217293
99997,199412662,19970429,"390826,471499"


In [26]:
# Keep only the basket strings, as a one-column frame that does not share
# data with df_apriori.
df2 = df_apriori[['items']].copy()
print(df2)

                                                  items
0                                         547476,547477
2                                                547477
3                                                 17499
4      172235,175765,232177,343588,348365,447446,534944
11                                  16909,494604,516699
...                                                 ...
99992                                      38424,176272
99994                                      38424,176272
99996                                            217293
99997                                     390826,471499
99999                                            155641

[55735 rows x 1 columns]


In [27]:
from itertools import combinations
from operator import itemgetter
import pandas as pd
from time import time


def _parse_basket(row):
    """Turn one 'items' cell into a set of integer item ids."""
    if isinstance(row, str):
        # Comma-separated ids; ignore empty tokens such as a trailing comma.
        row = (token for token in row.split(",") if token.strip())
    return {int(token) for token in row}


def perform_apriori(data, support_count):
    """Find all frequent itemsets with the level-wise Apriori algorithm.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have an ``items`` column. Each cell is one transaction, given
        as a comma-separated string of integer item ids (an iterable of ids
        is accepted too). Missing cells are skipped. ``data`` is NOT
        modified.
    support_count : int
        Minimum number of transactions an itemset must occur in to be
        reported. The same ``>=`` threshold is applied to every itemset
        size.

    Returns
    -------
    pandas.DataFrame
        Columns ``items``, ``support_count`` and ``set_size``. ``items`` is
        a plain integer for single items and an ascending tuple of integers
        for larger itemsets. Rows are ordered by ``set_size``, then by
        ``items``.
    """
    # Parse the transactions once, into fresh objects, so the caller's frame
    # is left untouched and the function can be called repeatedly.
    baskets = [_parse_basket(row) for row in data['items'].dropna()]

    # Level 1: count how many baskets contain each item.
    item_counts = Counter(item for basket in baskets for item in basket)
    frequent = {
        (item,): count for item, count in item_counts.items() if count >= support_count
    }

    # Collect plain records and build the result frame once at the end
    # (DataFrame.append was removed in pandas 2.0).
    records = []
    size = 1
    while frequent:
        records.extend(
            {
                'items': itemset if size > 1 else itemset[0],
                'support_count': count,
                'set_size': size,
            }
            for itemset, count in sorted(frequent.items())
        )

        size += 1
        # Only items that are part of a frequent (size-1)-itemset can be part
        # of a frequent size-itemset, so shrink every basket to those items
        # and drop baskets that became too small.
        alive = {item for itemset in frequent for item in itemset}
        baskets = [
            kept for kept in (basket & alive for basket in baskets) if len(kept) >= size
        ]

        counts = Counter()
        for basket in baskets:
            for candidate in combinations(sorted(basket), size):
                # Apriori pruning: every (size-1)-subset must be frequent.
                if all(subset in frequent for subset in combinations(candidate, size - 1)):
                    counts[candidate] += 1

        frequent = {
            itemset: count for itemset, count in counts.items() if count >= support_count
        }

    return pd.DataFrame(records, columns=['items', 'support_count', 'set_size'])


In [None]:
# Time one full mining run (wall-clock seconds are printed last).
# A copy of df2 is passed so the cell stays re-runnable even if the callee
# modifies the frame it receives.
start = time()
print(perform_apriori(data=df2.copy(), support_count=10))
print(time() - start)