__The input file contains the following fields:__

| Name    | Description |
| -------- | ------- |
| UID  | Unique ID of the owning application    |
| DBType | Type of the Databse (MSSQL, MySQL, Oracle, DB2)     |
| Instance    | The instance of the Ddtabase |
| DBName | The name of the database |
| Schema | The schema where the table os located in the database |
| Table | The name of the table |
| Column | The name of a single column |
| ColumnType | The datatyp of the column |

| DB | UID |    DBType | Instance |    DBName |      Schema |      Table |           Column |         ColumnType | 
| ----- | ----- |    ----- | ----- |    ----- |      ----- |      ----- |           ----- |         ----- | 
| __MSSQL__ | UID |    DBType | Instance |    DBName |      Schema |      Table |           Column |         ColumnType | 
| __Oracle__ | uid |   dbtype |   |           dbname |      owner |       table_name |      column_name |     data_type |
| __MySQL__ | uid |    dbtype |    |          dbname |      owner |       table_name |      column_name |     data_type | 

In [64]:
!pip install pyLDAvis

  pid, fd = os.forkpty()




# Prepare the data

we have metadata about MSSQL, Oracle and MySQL Instances in CSV compressed in ZIP files - these CSVs must be merged into the same structure

In [65]:
# remove the existing data files
!rm -f data/*.csv  

# unzip the data files avialable
!unzip -o data/mssql_allprod_collumns.zip -d data/
!unzip -o data/oracle_metadata.zip -d data/
!unzip -o data/mysql_metadata.zip -d data/

zsh:1: no matches found: data/*.csv


  pid, fd = os.forkpty()


Archive:  data/mssql_allprod_collumns.zip
  inflating: data/mssql_allprod_collumns.csv  

  pid, fd = os.forkpty()



Archive:  data/oracle_metadata.zip
  inflating: data/oracle_metadata.csv  

  pid, fd = os.forkpty()



Archive:  data/mysql_metadata.zip
  inflating: data/mysql_metadata.csv  


  pid, fd = os.forkpty()


In [66]:
%%bash
# Prepare MSSQL data

INPUT_FILE="data/mssql_allprod_collumns.csv"
OUTPUT_FILE="data/mssql_allprod_collumns_no_third_column.csv.tmp"

head -3 $INPUT_FILE

# because the column separator is ',', we need to make sure e.g. `numeric(17,10)` is replaced with `numeric(17;10)` in the csv file
sed -E 's/\(([^)]*),([^)]*)\)/(\1;\2)/g' ${INPUT_FILE} > ${OUTPUT_FILE}
echo "The file ${INPUT_FILE} has been processed and saved as ${OUTPUT_FILE}"

# dropo the third column ('Instance') from the csv file (it does not exist in the other files)
# Use awk to drop the third column
awk -F, 'BEGIN {OFS=","} { $3=""; sub(",,", ","); print }' ${OUTPUT_FILE} > ${INPUT_FILE}

rm ${OUTPUT_FILE}

echo "The file ${INPUT_FILE} has been processed, the third column has been removed:"
head -3 $INPUT_FILE

UID,DBType,Instance,DBName,Schema,Table,Column,ColumnType
30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InRedStateMilliseconds,int
30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InYellowStateMilliseconds,int
The file data/mssql_allprod_collumns.csv has been processed and saved as data/mssql_allprod_collumns_no_third_column.csv
The file data/mssql_allprod_collumns.csv has been processed, the third column has been removed:
﻿UID,DBType,DBName,Schema,Table,Column,ColumnType
30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InRedStateMilliseconds,int
30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InYellowStateMilliseconds,int


In [70]:
%%bash
# create a small dataset for testing
# number of records in a small dataset 
NR_OF_RECORDS_EACH=500
echo "Creating a small dataset ${NR_OF_RECORDS_EACH} records of each files"

head -${NR_OF_RECORDS_EACH} data/mssql_allprod_collumns.csv  >  data/small.csv
head -${NR_OF_RECORDS_EACH} data/oracle_metadata.csv         >> data/small.csv
head -${NR_OF_RECORDS_EACH} data/mysql_metadata.csv         >> data/small.csv

NR_OF_LINES=$(wc -l data/small.csv)
echo "new dataset contains ${NR_OF_LINES}"

Creating a small dataset 500 records of each files
new dataset contains     1500 data/small.csv


In [72]:
# Importing the required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import LatentDirichletAllocation
import wordninja

In [73]:
# Load the data
# df = pd.read_csv('data/rai_allprod_collumns_clean.csv')
df = pd.read_csv('data/small.csv')
display(df.head())

Unnamed: 0,UID,DBType,DBName,Schema,Table,Column,ColumnType
0,30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InRedStateMilliseconds,int
1,30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InYellowStateMilliseconds,int
2,30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InDisabledStateMilliseconds,int
3,30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InPlannedMaintenanceMilliseconds,int
4,30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InUnplannedMaintenanceMilliseconds,int


### Prepare the data

In [74]:
# split the column names into words
df['document_content'] = df['Column'].apply(
    lambda x: ' '.join(wordninja.split(x))
)

# Concatenate 'Column' values for rows with the same 'DBName'/'Table' combination
df_grouped = df.groupby(['DBName', 'Table'])['document_content'].agg(
    lambda x: ' '.join(x)
).reset_index()


### Calculate TF-IDF

In [75]:
vectorizer = TfidfVectorizer(stop_words='english', min_df=3, max_df=0.75, max_features=5000)
doc_term_matrix = vectorizer.fit_transform(df_grouped['document_content'])

print(doc_term_matrix.shape)

(166, 171)


In [76]:
num_clusters = 5

lda = LatentDirichletAllocation(n_components=num_clusters, learning_method='online', random_state=42)
doc_cluster_matrix = lda.fit_transform(doc_term_matrix)

col_names = ['Cluster' + str(i) for i in range(num_clusters)]

doc_cluster_df = pd.DataFrame(doc_cluster_matrix, columns=col_names)
display(doc_cluster_df.head()) 


Unnamed: 0,Cluster0,Cluster1,Cluster2,Cluster3,Cluster4
0,0.79647,0.050714,0.051286,0.050785,0.050746
1,0.804535,0.048729,0.049145,0.048757,0.048835
2,0.724749,0.068389,0.069674,0.06868,0.068509
3,0.784835,0.053653,0.054063,0.053757,0.053692
4,0.039555,0.038482,0.060669,0.039879,0.821415


In [77]:
# Display the top words for each cluster
num_words = 10

for cluster, words in enumerate(lda.components_):
    word_total = words.sum()
    sorted_words = words.argsort()[::-1]
    print()
    print(f'Cluster {cluster}:')
    for i in range(0, num_words):
        word = vectorizer.get_feature_names_out()[sorted_words[i]]
        word_wight = words[sorted_words[i]]
        print(f'{word} {word_wight}')


Cluster 0:
data 4.373261581563654
time 4.32926849158901
type 3.716002560949461
succeeded 3.287584665371378
row 2.9848338610709866
jobs 2.801548001142801
event 2.553780992065522
total 2.533578081878062
modified 2.5184782791357985
delete 2.428792609436719

Cluster 1:
fk 6.4726655849085795
ich 4.26023530585384
zu 4.256642947415969
itz 3.0917644103234903
hoe 3.002772219166134
iv 2.40449201419941
code 2.1916238595668496
nun 2.18617858966092
kt 2.151880502052191
ch 2.1424925104366315

Cluster 2:
modified 16.79903254540457
created 16.329484336333245
date 11.0445368581387
active 8.945216308658537
version 8.3975943832049
gui 4.794360100594631
status 4.592994812184231
rahm 3.8264100148628097
trag 3.8174242225616766
enver 3.812699992656919

Cluster 3:
milliseconds 7.263341550176597
state 6.03666533888095
maintenance 2.622784492898246
health 2.483902488324434
lifecycle 2.2798400067722557
row 1.8545351197254898
01 1.729496376457224
ich 1.5730823600268196
act 1.5001234201232843
disabled 1.440811122

In [80]:
#Visualize and analyze reuslts
import pyLDAvis
import pyLDAvis.lda_model


panel = pyLDAvis.lda_model.prepare(lda, doc_term_matrix, vectorizer, mds='tsne', sort_topics=False, n_jobs = -1)
word_info = panel.topic_info


#Print top 30 keywords
for topic in word_info.loc[word_info.Category != 'Default'].Category.unique():
    print(topic)
    print(word_info.loc[word_info.Category.isin([topic])].sort_values('logprob', ascending = False).Term.values[:30])
    print()

#To save panel in html (using https://panel.holoviz.org/)
pyLDAvis.save_html(panel, 'panel.html')
print("Panel saved to panel.html")

Topic1
['data' 'time' 'type' 'succeeded' 'row' 'jobs' 'event' 'total' 'modified'
 'delete' 'source' 'date' 'process' 'memory' 'client' 'avg' 'events'
 'aggregation' 'num' 'currency' 'partner' 'valid' 'deleted' 'app' 'xml'
 'dt' 'version' 'cpu' 'level' 'size']

Topic2
['fk' 'ich' 'zu' 'itz' 'hoe' 'iv' 'code' 'nun' 'kt' 'ch' 'ze' 'en' 'ver'
 'rei' 'bung' 'sch' 'le' 'sprache' 'red' 'kon' 'status' 'vor' 'jah'
 'created' 'nach' 'von' 'parent' 'material' 'act' 'fs']

Topic3
['modified' 'created' 'date' 'active' 'version' 'gui' 'status' 'rahm'
 'trag' 'enver' 'nun' 'datum' 'ze' 'ty' 'content' 'fr' 'ich' 'rungs'
 'iche' 'vers' 'media' 'xml' 'filename' 'mer' 'partner' 'file' 'data'
 'art' 'nr' 'size']

Topic4
['milliseconds' 'state' 'maintenance' 'health' 'lifecycle' 'row' '01'
 'ich' 'act' 'disabled' 'planned' 'yellow' 'unplanned' 'red' 'sp'
 'service' 'end' 'unavailable' 'monitor' 'interval' 'entity' 'managed'
 '03' 'hourly' 'user' 'date' 'time' 'corre' 'object' '02']

Topic5
['installed' 'jo

In [46]:
# prepare data
df['Column'] = df['Column'].str.lower()
df = df[df['Column'].notna() & (df['Column'].str.strip() != '')]
df['Table'] = df['Table'].str.lower()

# Concatenate Schema and DBName columns
df['DBName_Schema'] = df['Schema'] + '_' + df['DBName']

In [47]:
# display(dFrame.head(5))
# print(dFrame.describe())
print(df.info())
print(df.isnull().sum())
# print(df.sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1499 entries, 0 to 1498
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   UID               1499 non-null   object
 1   DBType            1499 non-null   object
 2   Instance          1499 non-null   object
 3   DBName            1499 non-null   object
 4   Schema            1499 non-null   object
 5   Table             1499 non-null   object
 6   Column            1499 non-null   object
 7   ColumnType        499 non-null    object
 8   document_content  1499 non-null   object
 9   DBName_Schema     1499 non-null   object
dtypes: object(10)
memory usage: 117.2+ KB
None
UID                    0
DBType                 0
Instance               0
DBName                 0
Schema                 0
Table                  0
Column                 0
ColumnType          1000
document_content       0
DBName_Schema          0
dtype: int64


In [48]:
unique_counts = df['DBName_Schema'].value_counts()
print(unique_counts)
print(df['DBName_Schema'].sum())

DBName_Schema
State_OperationsManagerDW                175
dbo_OperationsManagerDW                   46
ProcessMonitoring_OperationsManagerDW     45
Alert_OperationsManagerDW                 42
Vista_OperationsManagerDW                 35
                                        ... 
hardware_bestandteile_pmyrb_web            2
hardware_optionen_pmyrb_web                2
table_name_owner                           2
headset_optionen_pmyrb_web                 2
ABA03_BASEFILE_OWN_NMAT_UAS                2
Name: count, Length: 120, dtype: int64
State_OperationsManagerDWState_OperationsManagerDWState_OperationsManagerDWState_OperationsManagerDWState_OperationsManagerDWState_OperationsManagerDWState_OperationsManagerDWWin7_OperationsManagerDWWin7_OperationsManagerDWWin7_OperationsManagerDWWin7_OperationsManagerDWWin7_OperationsManagerDWWin7_OperationsManagerDWState_OperationsManagerDWState_OperationsManagerDWState_OperationsManagerDWState_OperationsManagerDWState_OperationsManagerDWState_Op

In [49]:
df = df.drop(columns=['UID', 'Schema', 'DBName'], inplace=False)

In [27]:
# Define features (X) and target (y)

In [50]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
X = vectorizer.fit_transform(df['Column'])

In [51]:
# Clustering using KMeans
kmeans = KMeans(n_clusters=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)

In [52]:
# Identify columns likely to contain sensitive information
sensitive_keywords = ['ssn', 'social', 'security', 'dob', 'date_of_birth', 'credit', 'card', 'ccn', 'account', 'bank', 'email', 'phone', 'address']
df['Sensitive'] = df['Column'].apply(lambda x: any(keyword in x for keyword in sensitive_keywords))


In [53]:
# Flag potential compliance issues
def flag_compliance_issues(column_name):
    patterns = {
        'PII': ['ssn', 'social', 'security', 'dob', 'date_of_birth', 'email', 'phone', 'address'],
        'Financial': ['credit', 'card', 'ccn', 'account', 'bank']
    }
    for issue, keywords in patterns.items():
        if any(keyword in column_name for keyword in keywords):
            return issue
    return 'None'

df['Compliance_Issue'] = df['Column'].apply(flag_compliance_issues)


In [54]:
# Save the results
df.to_csv('rai_allprod_collumns_with_clusters_and_flags.csv', index=False)

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,DBType,Instance,Table,Column,ColumnType,document_content,DBName_Schema,Cluster,Sensitive,Compliance_Issue
0,MSSQL,R0015702\RCHSCOMP02,statehourly_d461f6afa87b4259b908b29da01ede5c,inredstatemilliseconds,int,In Red State Milliseconds,State_OperationsManagerDW,0,False,
1,MSSQL,R0015702\RCHSCOMP02,statehourly_d461f6afa87b4259b908b29da01ede5c,inyellowstatemilliseconds,int,In Yellow State Milliseconds,State_OperationsManagerDW,0,False,
2,MSSQL,R0015702\RCHSCOMP02,statehourly_d461f6afa87b4259b908b29da01ede5c,indisabledstatemilliseconds,int,In Disabled State Milliseconds,State_OperationsManagerDW,5,False,
3,MSSQL,R0015702\RCHSCOMP02,statehourly_d461f6afa87b4259b908b29da01ede5c,inplannedmaintenancemilliseconds,int,In Planned Maintenance Milliseconds,State_OperationsManagerDW,0,False,
4,MSSQL,R0015702\RCHSCOMP02,statehourly_d461f6afa87b4259b908b29da01ede5c,inunplannedmaintenancemilliseconds,int,In Unplanned Maintenance Milliseconds,State_OperationsManagerDW,0,False,
