In [1]:
import os
import logging
import time
import psycopg2

import pandas as pd
import numpy as np 
from sqlalchemy import create_engine

# Configure the root logger to emit INFO-and-above records, then create a
# logger named after this module.
logging.basicConfig(level="INFO")
logger = logging.getLogger(name=__name__)
 

def connect_postgres(retries=5, delay=3):
    """Open a psycopg2 connection and a SQLAlchemy engine for the same database.

    Connection settings are read from the environment variables
    ``POSTGRES_DB``, ``POSTGRES_USER``, ``POSTGRES_PASSWORD`` and
    ``POSTGRES_SERVER``; the port is fixed at 5432.

    Args:
        retries: Maximum number of connection attempts (default 5).
        delay: Seconds to wait between two attempts (default 3).

    Returns:
        A two-element list ``[conn, engine]``: the psycopg2 connection and the
        SQLAlchemy engine.

    Raises:
        KeyError: If one of the required environment variables is not set.
        ValueError: If ``retries`` is smaller than 1.
        Exception: Whatever ``psycopg2.connect`` raised on the final attempt.
    """
    # Local import keeps this function self-contained (stdlib only).
    from urllib.parse import quote_plus

    if retries < 1:
        raise ValueError("retries must be at least 1")

    database = os.environ['POSTGRES_DB']
    user = os.environ['POSTGRES_USER']
    password = os.environ['POSTGRES_PASSWORD']
    host = os.environ['POSTGRES_SERVER']
    port = 5432

    conn = None
    for attempt in range(1, retries + 1):
        try:
            conn = psycopg2.connect(
                database=database, user=user, password=password, host=host, port=port)
        except Exception as e:
            if attempt == retries:
                # Out of attempts: report and propagate without a pointless sleep.
                logging.error("Unable to connect to  %s DB", database)
                raise
            logging.warning(
                "Error connecting to postgres, will retry in %s sec: %s", delay, e)
            time.sleep(delay)
        else:
            break

    logging.info("Connected...")
    logging.info("Everything goes well from Postgres, you're a fu*** pro...")

    # User and password must be URL-encoded, otherwise characters such as
    # '@', '/' or ':' are parsed as URL delimiters and the engine URL is wrong.
    url = 'postgresql://{}:{}@{}:{}/{}'.format(
        quote_plus(user), quote_plus(password), host, port, database
    )
    try:
        engine = create_engine(url)
    except Exception:
        # Do not leak the already-open DBAPI connection if the engine fails.
        conn.close()
        raise

    return [conn, engine]


In [2]:
# Open the database handles: a raw psycopg2 connection and a SQLAlchemy engine.
conn, engine = connect_postgres()

INFO:root:Connected...
INFO:root:Everything goes well from Postgres, you're a fu*** pro...


In [3]:
# One row per distinct (code, trans_date_id, copy.id) combination among the
# transactions at location 5 whose type code is ISS, REN, NON or PLOAN.
transactions_query = """
    SELECT code, trans_date_id as date, copy.id as books
    FROM public.copytransaction, public.copy, public.student, public.title, public.transactiontype
    WHERE 
        trans_borrower_code = student.id and
        trans_copy_code_id = copy.id and 
        trans_tittle_code_id = title.id and 
        trans_type_id = transactiontype.id and 
        trans_location_code_id = 5 and 
        trans_type_code IN ('ISS', 'REN', 'NON', 'PLOAN')
    GROUP BY code, trans_date_id, copy.id
"""
df = pd.read_sql(transactions_query, con=conn)



In [4]:
# Display the frame loaded from the database (columns: code, date, books).
df

Unnamed: 0,code,date,books
0,198224610,20000928,547476
1,198224610,20000928,547477
2,198224610,20001014,547477
3,198224610,20001117,17499
4,198224610,20001120,172235
...,...,...,...
3352753,201880023,20180305,831823
3352754,201880026,20180305,832497
3352755,201880026,20180307,823259
3352756,201880026,20180313,834960


In [5]:
# Downcast 64-bit integer columns to int32 to reduce memory use.
# An int64 -> int32 cast silently wraps values outside the int32 range, so only
# columns whose values all fit are converted; any other column stays int64.
int32_range = np.iinfo(np.int32)
int32_safe_cols = [
    col for col in df.select_dtypes('int64').columns
    if df[col].between(int32_range.min, int32_range.max).all()
]
df = df.astype({col: 'int32' for col in int32_safe_cols})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3352758 entries, 0 to 3352757
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   code    int32
 1   date    int32
 2   books   int32
dtypes: int32(3)
memory usage: 38.4 MB


In [6]:
# Data exploration

# Number of missing values in each column.
df.isna().sum()

code     0
date     0
books    0
dtype: int64

In [12]:
import numpy as np
import math
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

# Upper bound on the number of rows mined in a single pass.
chunk_max_size = 10000
# How many chunks are needed so that none exceeds chunk_max_size rows
# (math.ceil already returns an int).
chunks = math.ceil(len(df) / chunk_max_size)

# Accumulates one association-rule frame per chunk that produced rules.
rules = list()

In [13]:
# Mine pairwise association rules independently on each row chunk of `df`.
# `rules` is reset here so that re-running this cell does not append the same
# rules a second time.
rules = []

# Splitting row *positions* gives exactly the chunk boundaries that
# np.array_split(df, chunks) would produce, without handing the DataFrame
# itself to NumPy (that path relies on the deprecated DataFrame.swapaxes).
for positions in np.array_split(np.arange(len(df)), max(chunks, 1)):
    if positions.size == 0:
        continue
    df_chunk = df.iloc[positions[0]:positions[-1] + 1]

    # One basket per (code, date): the list of books in original row order.
    # Aggregating with `list` avoids writing into a slice of `df` and avoids
    # concatenating one-element lists with sum().
    library = df_chunk.groupby(['code', 'date'])['books'].agg(list).tolist()

    # One-hot encode the baskets: one boolean column per distinct book.
    encoder = TransactionEncoder()
    transform_library = encoder.fit(library).transform(library)
    transactions = pd.DataFrame(transform_library, columns=encoder.columns_)

    # TODO: understand this very well
    # NOTE(review): min_support is computed from the number of ROWS in the
    # chunk, not the number of baskets in `transactions`, so the effective
    # threshold is fewer than 10 baskets. Confirm this is intended.
    frequent_itemsets = fpgrowth(
        transactions, min_support=10 / len(df_chunk), use_colnames=True, max_len=2)
    if frequent_itemsets.empty:
        continue

    rule = association_rules(frequent_itemsets)
    if not rule.empty:
        rules.append(rule)

In [20]:
# Stack the per-chunk rule frames into one frame with a fresh 0..n-1 index.
result = pd.concat(rules).reset_index(drop=True)

# Replace each itemset by its first element in iteration order, stored as text.
for itemset_column in ("antecedents", "consequents"):
    first_items = result[itemset_column].map(lambda items: next(iter(items)))
    result[itemset_column] = first_items.astype("unicode")

result

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,434760,484902,0.004586,0.004985,0.003988,0.869565,174.434783,0.003965,7.628448
1,537195,434760,0.001595,0.004586,0.001595,1.000000,218.043478,0.001588,inf
2,537195,484902,0.001595,0.004985,0.001595,1.000000,200.600000,0.001587,inf
3,343961,418195,0.002991,0.003589,0.002792,0.933333,260.037037,0.002781,14.946162
4,468301,183,0.002193,0.001994,0.001994,0.909091,455.909091,0.001990,10.978066
...,...,...,...,...,...,...,...,...,...
11796,630106,56242,0.001618,0.001618,0.001618,1.000000,617.900000,0.001616,inf
11797,592403,637924,0.001133,0.001780,0.001133,1.000000,561.727273,0.001131,inf
11798,604817,562758,0.001188,0.002546,0.001018,0.857143,336.685714,0.001015,6.982179
11799,817166,813410,0.001188,0.002037,0.001018,0.857143,420.857143,0.001016,6.985743


In [21]:
# TODO: save association rules to a database

# For now, write the rules (row index included) to a '*'-delimited text file.
result.to_csv(path_or_buf='association_rules.csv', sep='*', index=True)