In [1]:
import os
import logging
import time
import psycopg2

import pandas as pd
import numpy as np 
from sqlalchemy import create_engine

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
 

def connect_postgres():
    database = os.environ['POSTGRES_DB']
    user = os.environ['POSTGRES_USER']
    password = os.environ['POSTGRES_PASSWORD']
    host = os.environ['POSTGRES_SERVER']
    port = 5432

    exc, conn, engine = None, None, None

    for _ in range(5):
        try:
            conn = psycopg2.connect(
                database=database, user=user, password=password, host=host, port=port)
        except Exception as e:
            logging.warning("Error connecting to postgres, will retry in 3 sec: %s", e)
            time.sleep(3)
            exc = e
        else:
            logging.info("Connected...")
            logging.info("Everything goes well from Postgres, you're a fu*** pro...")
            
            engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
                user, password, host, port, database
            ))
            break
    else:
        logging.error("Unable to connect to  %s DB", database)
        raise exc
    
    return [conn, engine]


In [2]:
[conn, engine] = connect_postgres()

INFO:root:Connected...
INFO:root:Everything goes well from Postgres, you're a fu*** pro...


In [3]:
df = pd.read_sql("""
    SELECT code, trans_date_id as date, copy.id as books
    FROM public.copytransaction, public.copy, public.student, public.title, public.transactiontype
    WHERE 
        trans_borrower_code = student.id and
        trans_copy_code_id = copy.id and 
        trans_tittle_code_id = title.id and 
        trans_type_id = transactiontype.id and 
        trans_type_code IN ('ISS', 'REN', 'NON', 'PLOAN')
    GROUP BY code, trans_date_id, copy.id
""", con=conn)



In [4]:
df

Unnamed: 0,code,date,books
0,198224610,20000928,547476
1,198224610,20000928,547477
2,198224610,20001014,547477
3,198224610,20001117,17499
4,198224610,20001120,172235
...,...,...,...
4360858,201880026,20180305,832497
4360859,201880026,20180307,823259
4360860,201880026,20180313,834960
4360861,201880026,20180320,834962


In [5]:
df = df.astype({col: 'int32' for col in df.select_dtypes('int64').columns})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4360863 entries, 0 to 4360862
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   code    int32
 1   date    int32
 2   books   int32
dtypes: int32(3)
memory usage: 49.9 MB


In [6]:
#exploring the data

# checking null values
df.isnull().sum()

code     0
date     0
books    0
dtype: int64

In [7]:
import numpy as np
import math
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

chunk_max_size = 40000
chunks = int(math.ceil(len(df) / chunk_max_size))

rules = []

In [12]:
for df_chunk in np.array_split(df, chunks):
    # min support is 3 items at least
    min_support = 3/len(df_chunk)
    
    df_chunk.books = df_chunk.books.transform(lambda x: [x])    
    library = df_chunk.groupby(['code','date']).sum()['books'].reset_index(drop=True)
    
    # Hot encode
    encoder = TransactionEncoder()
    transform_library = encoder.fit(library).transform(library)
    transactions = pd.DataFrame(transform_library, columns=encoder.columns_)
    
    # TODO: understand this very well
    frequent_itemsets = fpgrowth(transactions, min_support=min_support, use_colnames=True, max_len=2)
    if not frequent_itemsets.empty:
        rule = association_rules(frequent_itemsets, metric='confidence', min_threshold=0)
        rule['support_transactions_antecedent'] = rule['antecedent support'] * len(df_chunk) # calculate the # of transactions
        rule['support_transactions_consecuente'] = rule['consequent support'] * len(df_chunk) # calculate the # of transactions
        rule['support_transactions'] = rule['support'] * len(df_chunk) # calculate the # of transactions
        
        if not rule.empty:
            rules.append(rule)


In [14]:
result = pd.concat(rules).reset_index(drop=True)
result["antecedents"] = result["antecedents"].apply(lambda x: list(x)[0]).astype("unicode")
result["consequents"] = result["consequents"].apply(lambda x: list(x)[0]).astype("unicode")
result

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,support_transactions_antecedent,support_transactions_consecuente,support_transactions
0,159201,500919,0.000091,0.000136,0.000091,1.000000,7336.333333,0.000091,inf,3.602617,5.403926,3.602617
1,500919,159201,0.000136,0.000091,0.000091,0.666667,7336.333333,0.000091,2.999727,5.403926,3.602617,3.602617
2,42097,178775,0.000136,0.000091,0.000091,0.666667,7336.333333,0.000091,2.999727,5.403926,3.602617,3.602617
3,178775,42097,0.000091,0.000136,0.000091,1.000000,7336.333333,0.000091,inf,3.602617,5.403926,3.602617
4,189961,422662,0.000363,0.000182,0.000091,0.250000,1375.562500,0.000091,1.333091,14.410468,7.205234,3.602617
...,...,...,...,...,...,...,...,...,...,...,...,...
475419,798470,757896,0.000114,0.000189,0.000076,0.666667,3521.600000,0.000076,2.999432,4.502953,7.504922,3.001969
475420,725788,833606,0.000076,0.000454,0.000076,1.000000,2201.000000,0.000076,inf,3.001969,18.011813,3.001969
475421,833606,725788,0.000454,0.000076,0.000076,0.166667,2201.000000,0.000076,1.199909,18.011813,3.001969,3.001969
475422,700173,570318,0.000076,0.000076,0.000076,1.000000,13206.000000,0.000076,inf,3.001969,3.001969,3.001969


In [18]:
result.sort_values('leverage')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,support_transactions_antecedent,support_transactions_consecuente,support_transactions
429216,789093,829837,0.001629,0.007670,0.000079,0.048780,6.360167,0.000067,1.043219,64.592434,304.057066,3.150850
429217,829837,789093,0.007670,0.001629,0.000079,0.010363,6.360167,0.000067,1.008825,304.057066,64.592434,3.150850
429231,789095,829861,0.002027,0.005444,0.000079,0.039216,7.203091,0.000068,1.035150,80.346686,215.833254,3.150850
429230,829861,789095,0.005444,0.002027,0.000079,0.014599,7.203091,0.000068,1.012758,215.833254,80.346686,3.150850
429214,789093,829861,0.001629,0.005444,0.000079,0.048780,8.959943,0.000071,1.045559,64.592434,215.833254,3.150850
...,...,...,...,...,...,...,...,...,...,...,...,...
211212,578656,57794,0.002839,0.002885,0.002793,0.983871,341.059652,0.002785,61.821146,112.547644,114.362929,110.732360
211234,578387,578375,0.002885,0.002976,0.002839,0.984127,330.651526,0.002830,62.812491,114.362929,117.993498,112.547644
211235,578375,578387,0.002976,0.002885,0.002839,0.953846,330.651526,0.002830,21.604164,117.993498,114.362929,112.547644
135093,143117,200692,0.003235,0.003147,0.003058,0.945205,300.388964,0.003048,18.192574,128.257933,124.744017,121.230101


In [16]:
# TODO: save association rules to a database
result.to_csv('association_rules.csv', index=True, sep='*')