In [1]:
# (C) Kris - Simple Cognos Analytics Audit Log transformation in Python3
# for detecting which tables in databases were read by whom and when.

In [2]:
# Funkcja parsująca treści wykonanych zapytań w celu wyciągnięcia nazw czytanych tabel
import re
def tables_in_query(sql_str):
    # make sure we have string
    sql_str = str(sql_str)
    # remove the /* */ comments
    q = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", sql_str)
    # remove whole line -- and # comments
    lines = [line for line in q.splitlines() if not re.match("^\s*(--|#)", line)]
    # remove trailing -- and # comments
    q = " ".join([re.split("--|#", line)[0] for line in lines])
    # split on blanks, parens and semicolons
    tokens = re.split(r"[\s)(;]+", q)
    # scan the tokens. if we see a FROM or JOIN, we set the get_next
    # flag, and grab the next one (unless it's SELECT).
    result = set()
    get_next = False
    for tok in tokens:
        if get_next:
            if tok.lower() not in ["", "select"]:
                result.add(tok)
            get_next = False
        get_next = tok.lower() in ["from", "join"]
    return result

In [13]:
# Podłączenie do bazy danych z logami auditowymi LIVE:
import pymssql  
conn = pymssql.connect(server='192.168.1.22', user='sa', password='PASSWORD', database='cog11audit')

In [4]:
# Zapytanie wybierające dane ze szczegółami zapytań:
query = "SELECT COGIPF_USERLOGON.COGIPF_USERNAME AS UNAME, COGIPF_NATIVEQUERY.COGIPF_LOCALTIMESTAMP AS QTIME, \
                         COGIPF_RUNREPORT.COGIPF_REPORTNAME AS RNAME, COGIPF_NATIVEQUERY.COGIPF_SESSIONID AS SESID, \
                         COGIPF_NATIVEQUERY.COGIPF_REQUESTSTRING AS QRSTR \
FROM            COGIPF_RUNREPORT RIGHT OUTER JOIN \
                         COGIPF_NATIVEQUERY ON COGIPF_RUNREPORT.COGIPF_REQUESTID = COGIPF_NATIVEQUERY.COGIPF_REQUESTID LEFT OUTER JOIN \
                         COGIPF_USERLOGON ON COGIPF_NATIVEQUERY.COGIPF_SESSIONID = COGIPF_USERLOGON.COGIPF_SESSIONID \
WHERE        (NOT (COGIPF_NATIVEQUERY.COGIPF_SESSIONID IS NULL)) AND (NOT (COGIPF_USERLOGON.COGIPF_USERNAME IS NULL)) AND (NOT (COGIPF_NATIVEQUERY.COGIPF_REQUESTSTRING IS NULL)) \
ORDER BY QTIME DESC;"

In [5]:
## Wszystkie dane z SQLa do DataFrame Pandas - uwaga na lokalną pamięć w Pythonie...
import pandas as pd
df = pd.read_sql(query, conn)
df.count()

UNAME    332
QTIME    332
RNAME    332
SESID    332
QRSTR    332
dtype: int64

In [6]:
df.head(3)

Unnamed: 0,UNAME,QTIME,RNAME,SESID,QRSTR
0,cognos,2020-03-08 16:35:48.827,report[@name='lista'],6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,"<thirdparty><![CDATA[select ""coguda03"".""COMPAN..."
1,cognos,2020-03-08 16:35:48.827,report[@name='lista'],6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,"<thirdparty><![CDATA[select ""coguda03"".""COMPAN..."
2,cognos,2020-03-08 16:35:48.827,report[@name='lista'],6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,"<thirdparty><![CDATA[select ""coguda03"".""COMPAN..."


In [7]:
# Zostawiamy tylko unikalne rekordy w DataFrame:
df.drop_duplicates(inplace = True)
# Czyszczenie kolumny z nazwą raportu z niepotrzebnych znaczków:
df['RNAME'] = df['RNAME'].map(lambda RNAME: str(RNAME).replace('report[@name=', '').replace(']', '').replace('\'', ''))
# Wyłuskanie nazw tabel poprzez zastosowanie funkcji na kolumnie z treściami SQLi = QRSTR:
df['QRSTR'] = df['QRSTR'].map(lambda QRSTR: str(tables_in_query(str(QRSTR))))
# Czyszczenie powyższej kolumny z nazwami table z pozostałych znaczków:
df['QRSTR'] = df['QRSTR'].map(lambda QRSTR: str(QRSTR).replace('{', '').replace('\'', '').replace('}', ''))
df.count()

UNAME    97
QTIME    97
RNAME    97
SESID    97
QRSTR    97
dtype: int64

In [8]:
df.head(7)

Unnamed: 0,UNAME,QTIME,RNAME,SESID,QRSTR
0,cognos,2020-03-08 16:35:48.827,lista,6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,"""GOSALES"".""SALES_REGION"""
4,cognos,2020-03-08 16:34:51.557,Market Size - modified,6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,[C:/Cognos/c11_64\\.\\bin\\../temp\\dmbTemp_72...
8,cognos,2020-03-08 16:34:51.323,Market Size - modified,6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,"""XTR"".""dbo"".""DIM_PRODUCT"""
12,cognos,2020-03-08 16:34:51.277,Market Size - modified,6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,"""XTR"".""dbo"".""DIM_CUSTOMER"""
16,bank,2020-03-08 16:34:04.290,Accounts Receivables,610CDBE79BC74FADB8E0C1FC86D94C073B3CAB65C3D151...,set()
18,bank,2020-03-08 16:34:03.120,Accounts Receivables,610CDBE79BC74FADB8E0C1FC86D94C073B3CAB65C3D151...,"""dbo"".""Customer"", ""dbo"".""Fiscal_Period"", ""dbo""..."
20,bank,2020-03-08 16:34:02.870,Accounts Receivables,610CDBE79BC74FADB8E0C1FC86D94C073B3CAB65C3D151...,"""dbo"".""Customer"""


In [9]:
# Zapis wyniku do nowej tabeli bazy danych z auditem za pomocą gotowca z sqlalchemy - trzeba dokładnie przetestować:
import sqlalchemy
# engine = sqlalchemy.create_engine(conn)
engine = sqlalchemy.create_engine('mssql+pymssql://sa:PASSWORD@192.168.1.22:1433/cog11audit')
df.to_sql(name = 'AudiTab1', con = engine, if_exists = 'replace', index = False)
conn.commit()

In [10]:
# Utworzenie nowej struktury do drugiej tabelki pod raportowanie w innym układzie:
ndf = pd.DataFrame(df.QRSTR.str.split(',').tolist(), index=[df.UNAME, df.QTIME, df.RNAME, df.SESID]).stack()
ndf = ndf.reset_index()
ndf.columns = ['UNAME', 'QTIME', 'RNAME', 'SESID', 'LVL', 'TAB']
ndf.drop(['LVL'], axis=1)
ndf['TAB'] = ndf['TAB'].map(lambda TAB: str(TAB).replace('[', '').replace(']', '').replace('\"', ''))
ndf['TAB'] = ndf['TAB'].map(lambda TAB: str(TAB).strip())
ndf.head(7)

Unnamed: 0,UNAME,QTIME,RNAME,SESID,LVL,TAB
0,cognos,2020-03-08 16:35:48.827,lista,6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,0,GOSALES.SALES_REGION
1,cognos,2020-03-08 16:34:51.557,Market Size - modified,6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,0,C:/Cognos/c11_64\\.\\bin\\../temp\\dmbTemp_727...
2,cognos,2020-03-08 16:34:51.323,Market Size - modified,6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,0,XTR.dbo.DIM_PRODUCT
3,cognos,2020-03-08 16:34:51.277,Market Size - modified,6C7D7695D0736A3B93F955671A2920624A55FCA34175BB...,0,XTR.dbo.DIM_CUSTOMER
4,bank,2020-03-08 16:34:04.290,Accounts Receivables,610CDBE79BC74FADB8E0C1FC86D94C073B3CAB65C3D151...,0,set()
5,bank,2020-03-08 16:34:03.120,Accounts Receivables,610CDBE79BC74FADB8E0C1FC86D94C073B3CAB65C3D151...,0,dbo.Customer
6,bank,2020-03-08 16:34:03.120,Accounts Receivables,610CDBE79BC74FADB8E0C1FC86D94C073B3CAB65C3D151...,1,dbo.Fiscal_Period


In [11]:
ndf.to_sql(name = 'AudiTab2', con = engine, if_exists = 'replace', index = False)
conn.commit()
ndf.count()

UNAME    296
QTIME    296
RNAME    296
SESID    296
LVL      296
TAB      296
dtype: int64

In [12]:
# All Done:
conn.close()
# END #