__The input file contains the following fields:__

| Name    | Description |
| -------- | ------- |
| UID  | Unique ID of the owning application    |
| DBType | Type of the database (MSSQL, MySQL, Oracle, DB2)     |
| Instance    | The instance of the database |
| DBName | The name of the database |
| Schema | The schema where the table is located in the database |
| Table | The name of the table |
| Column | The name of a single column |
| ColumnType | The data type of the column |

| DB | UID |    DBType | Instance |    DBName |      Schema |      Table |           Column |         ColumnType | 
| ----- | ----- |    ----- | ----- |    ----- |      ----- |      ----- |           ----- |         ----- | 
| __MSSQL__ | UID |    DBType | Instance |    DBName |      Schema |      Table |           Column |         ColumnType | 
| __Oracle__ | uid |   dbtype |   |           dbname |      owner |       table_name |      column_name |     data_type |
| __MySQL__ | uid |    dbtype |    |          dbname |      owner |       table_name |      column_name |     data_type | 

In [None]:
# Install the third-party packages used below.
# %pip (unlike !pip) always installs into the environment of the running kernel.
%pip install pyLDAvis
%pip install wordninja

# Prepare the data

We have metadata about MSSQL, Oracle and MySQL instances as CSV files compressed in ZIP archives. These CSVs must be merged into a single file with a common structure.

In [9]:
# Remove the CSV files left over in data/ so the cell always starts from the archives
!rm -f data/*.csv  

# Unzip the available data archives into data/ (-o overwrites existing files without prompting)
!unzip -o data/mssql_allprod_collumns.zip -d data/
!unzip -o data/oracle_metadata.zip -d data/
!unzip -o data/mysql_metadata.zip -d data/

Archive:  data/mssql_allprod_collumns.zip
  inflating: data/mssql_allprod_collumns.csv  
Archive:  data/oracle_metadata.zip
  inflating: data/oracle_metadata.csv  
Archive:  data/mysql_metadata.zip
  inflating: data/mysql_metadata.csv  


In [10]:
%%bash
# Prepare MSSQL data: protect commas inside type definitions and drop the
# 'Instance' column so the file has the same columns as the Oracle/MySQL files.
# Abort on the first failing command so a failed step never overwrites the input.
set -eu

INPUT_FILE="data/mssql_allprod_collumns.csv"
OUTPUT_FILE="data/mssql_allprod_collumns_no_third_column.csv.tmp"

head -n 3 "${INPUT_FILE}"

# The input file is rewritten in place, so running this cell a second time would
# drop one more column (DBName). Only continue while the third header field is
# still 'Instance'.
THIRD_COLUMN=$(head -n 1 "${INPUT_FILE}" | cut -d, -f3 | tr -d '\r')
if [ "${THIRD_COLUMN}" != "Instance" ]; then
    echo "The file ${INPUT_FILE} has already been processed (third column is '${THIRD_COLUMN}'), nothing to do"
    exit 0
fi

# because the column separator is ',', we need to make sure e.g. `numeric(17,10)` is replaced with `numeric(17;10)` in the csv file
# (only one comma per pair of parentheses is replaced)
sed -E 's/\(([^)]*),([^)]*)\)/(\1;\2)/g' "${INPUT_FILE}" > "${OUTPUT_FILE}"
echo "The file ${INPUT_FILE} has been processed and saved as ${OUTPUT_FILE}"

# drop the third column ('Instance') from the csv file (it does not exist in the other files):
# keep fields 1, 2 and everything from field 4 onwards
cut -d, -f1,2,4- "${OUTPUT_FILE}" > "${INPUT_FILE}"

rm "${OUTPUT_FILE}"

echo "The file ${INPUT_FILE} has been processed, the third column has been removed:"
head -n 3 "${INPUT_FILE}"

UID,DBType,Instance,DBName,Schema,Table,Column,ColumnType
30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InRedStateMilliseconds,int
30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InYellowStateMilliseconds,int
The file data/mssql_allprod_collumns.csv has been processed and saved as data/mssql_allprod_collumns_no_third_column.csv.tmp
The file data/mssql_allprod_collumns.csv has been processed, the third column has been removed:
﻿UID,DBType,DBName,Schema,Table,Column,ColumnType
30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InRedStateMilliseconds,int
30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InYellowStateMilliseconds,int


In [11]:
%%bash
# Prepare Oracle data: remove every line that contains 'RFA772203'.
# NOTE(review): presumably these lines belong to a record that breaks CSV parsing
# (an earlier version of this cell matched the text 'das den Zins verursacht hat'
# instead) - confirm and document the reason.
# Abort on the first failing command so a failed sed never replaces the input file.
set -eu

INPUT_FILE="data/oracle_metadata.csv"
OUTPUT_FILE="data/oracle_metadata.csv.tmp"

# Remove the temporary file on exit, also when a step fails (a no-op after the mv)
trap 'rm -f "${OUTPUT_FILE}"' EXIT

sed '/RFA772203/d' "${INPUT_FILE}" > "${OUTPUT_FILE}"
mv "${OUTPUT_FILE}" "${INPUT_FILE}"

In [None]:
%%bash
# create the dataset to use for the analysis
#
# The source files are only read, never modified, so this cell can be re-run
# safely (stripping the header lines in place would remove one more data row
# from the Oracle/MySQL files on every run).
set -eu

# the number of records to use for each DB type, set to '-1' to use all records
NR_OF_RECORDS_EACH=-1

TARGET_FILE="data/db_metadata.csv"
MSSQL_FILE="data/mssql_allprod_collumns.csv"
ORACLE_FILE="data/oracle_metadata.csv"
MYSQL_FILE="data/mysql_metadata.csv"
echo "Creating the dataset..."

if [ "${NR_OF_RECORDS_EACH}" -gt 0 ]; then
    echo "Creating a small dataset with ${NR_OF_RECORDS_EACH} records of each files"
else
    echo "Creating a dataset with all records of each files"
fi

# the column names of the MSSQL file become the header of the merged file
head -n 1 "${MSSQL_FILE}" > "${TARGET_FILE}"

# append the data rows of every file: skip the header line (line 1) and, when a
# limit is set, stop after that many data rows; awk also terminates the last
# line with a newline so rows of consecutive files are never glued together
for SOURCE_FILE in "${MSSQL_FILE}" "${ORACLE_FILE}" "${MYSQL_FILE}"; do
    awk -v limit="${NR_OF_RECORDS_EACH}" '
        NR == 1 { next }
        limit > 0 && NR > limit + 1 { exit }
        { print }
    ' "${SOURCE_FILE}" >> "${TARGET_FILE}"
done

# read from stdin so that wc prints only the number, not the file name
NR_OF_LINES=$(wc -l < "${TARGET_FILE}" | tr -d ' ')
echo "new dataset contains ${NR_OF_LINES} lines (including the header line)"

In [12]:
# Importing the required libraries
import nltk
import pandas as pd
import wordninja
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the merged metadata file into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer='data/db_metadata.csv')
display(df.head(n=5))

Unnamed: 0,UID,DBType,DBName,Schema,Table,Column,ColumnType
0,30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InRedStateMilliseconds,int
1,30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InYellowStateMilliseconds,int
2,30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InDisabledStateMilliseconds,int
3,30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InPlannedMaintenanceMilliseconds,int
4,30230,MSSQL,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InUnplannedMaintenanceMilliseconds,int


### Prepare the data

In [None]:

# Drop the rows (one per database column) that belong to known special tables.
# The table name is stored in the 'Table' column of the merged file.
# na=False keeps rows whose table name is missing, regex=False matches the text
# literally, and .copy() lets later cells add columns to df without pandas'
# SettingWithCopyWarning.
df = df[~df['Table'].str.contains('flyway_schema_history', na=False, regex=False)].copy()

In [None]:
# Split the column names into words and drop short words (two characters or fewer).
# Missing column names (NaN after read_csv) become an empty document instead of
# raising inside wordninja.split.
df['document_content'] = df['Column'].fillna('').astype(str).apply(
    lambda name: ' '.join(word for word in wordninja.split(name) if len(word) > 2)
)

print(df['document_content'].value_counts())

# Build one document per table: concatenate the words of all rows that share the
# same 'UID'/'DBType'/'DBName'/'Table' combination.
# NOTE(review): 'Schema' is not part of the key, so equally named tables in
# different schemas of one database end up in the same document - confirm this is intended.
df_grouped = (
    df.groupby(['UID', 'DBType', 'DBName', 'Table'])['document_content']
    .agg(' '.join)
    .reset_index()
)

print(df_grouped['document_content'].value_counts())


In [None]:
# Preview the first documents (one row per table)
display(df_grouped.head(n=5))

### Calculate TF-IDF

In [None]:
# Make sure the NLTK stop-word corpus is available: on a fresh environment
# stopwords.words() raises LookupError until the corpus has been downloaded.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# English and German stop words are both removed from the documents
final_stopwords_list = stopwords.words('english') + stopwords.words('german')

# Ignore terms found in fewer than 3 documents or in more than 75% of them and
# keep at most 5000 terms
vectorizer = TfidfVectorizer(stop_words=final_stopwords_list, min_df=3, max_df=0.75, max_features=5000)
doc_term_matrix = vectorizer.fit_transform(df_grouped['document_content'])

print(doc_term_matrix.shape)

In [None]:
# Fit an LDA model with one component per cluster and keep the transformed
# document matrix (one row per document, one column per cluster)
num_clusters = 5

lda = LatentDirichletAllocation(
    n_components=num_clusters,
    learning_method='online',
    random_state=42,
)
doc_cluster_matrix = lda.fit_transform(doc_term_matrix)

col_names = [f'Cluster{idx}' for idx in range(num_clusters)]

doc_cluster_df = pd.DataFrame(data=doc_cluster_matrix, columns=col_names)
display(doc_cluster_df.head())


In [None]:
# Display the top words for each cluster
num_words = 10

# Look the vocabulary up once instead of once per printed word
feature_names = vectorizer.get_feature_names_out()

for cluster, words in enumerate(lda.components_):
    # Indices of the highest-weighted words first; slicing (instead of indexing
    # 0..num_words-1) also works when the vocabulary has fewer than num_words terms
    top_indices = words.argsort()[::-1][:num_words]
    print()
    print(f'Cluster {cluster}:')
    for word_index in top_indices:
        print(f'{feature_names[word_index]} {words[word_index]}')

In [None]:
# Visualize and analyze the results
import pyLDAvis
import pyLDAvis.lda_model


prepared_data = pyLDAvis.lda_model.prepare(
    lda,
    doc_term_matrix,
    vectorizer,
    mds='tsne',
    sort_topics=False,
    n_jobs=-1,
)
word_info = prepared_data.topic_info

# Print the top 30 terms (highest 'logprob' first) of every category except 'Default'
topic_categories = word_info.loc[word_info.Category != 'Default'].Category.unique()
for topic in topic_categories:
    topic_terms = word_info.loc[word_info.Category.isin([topic])]
    ranked_terms = topic_terms.sort_values('logprob', ascending=False)
    print(topic)
    print(ranked_terms.Term.values[:30])
    print()

# Save prepared_data as an HTML file (e.g. for use in a panel, https://panel.holoviz.org/)
pyLDAvis.save_html(prepared_data, 'panel.html')
print("Panel saved to panel.html")

# Backup

In [46]:
# prepare data: keep only rows with a non-empty column name, then lower-case the
# column and table names. Filtering first and working on an explicit copy avoids
# pandas' SettingWithCopyWarning when the columns are assigned afterwards.
has_column_name = df['Column'].notna() & (df['Column'].str.strip() != '')
df = df[has_column_name].copy()
df['Column'] = df['Column'].str.lower()
df['Table'] = df['Table'].str.lower()

# Concatenate Schema and DBName columns (the value is '<Schema>_<DBName>')
df['DBName_Schema'] = df['Schema'] + '_' + df['DBName']

In [None]:
# Summarize the frame: dtypes and non-null counts, then missing values per column.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray 'None' line to the output.
df.info()
print(df.isnull().sum())

In [None]:
# Number of rows per schema/database combination
unique_counts = df['DBName_Schema'].value_counts()
print(unique_counts)
# Total number of rows covered by the counts above. Calling .sum() on the string
# column itself would concatenate every value into one huge string.
# NOTE(review): assumes the row total was the intended figure - confirm.
print(unique_counts.sum())

In [49]:
# Remove the identifier columns; drop() returns a new frame that is bound to df again
df = df.drop(['UID', 'Schema', 'DBName'], axis=1)

In [27]:
# Define features (X) and target (y)

In [50]:
# Feature extraction: TF-IDF over the words of each column name, English stop words removed
tfidf_options = {'analyzer': 'word', 'stop_words': 'english'}
vectorizer = TfidfVectorizer(**tfidf_options)
X = vectorizer.fit_transform(df['Column'])

In [51]:
# Cluster the TF-IDF vectors into 10 groups with k-means and store each row's label
kmeans = KMeans(random_state=42, n_clusters=10)
cluster_labels = kmeans.fit_predict(X)
df['Cluster'] = cluster_labels

In [52]:
# Mark a column as sensitive when its name contains at least one of the keywords
# below as a substring
sensitive_keywords = [
    'ssn', 'social', 'security', 'dob', 'date_of_birth',
    'credit', 'card', 'ccn', 'account', 'bank',
    'email', 'phone', 'address',
]
df['Sensitive'] = df['Column'].map(
    lambda column_name: any(keyword in column_name for keyword in sensitive_keywords)
)


In [53]:
# Flag potential compliance issues
def flag_compliance_issues(column_name):
    """Return the first compliance category whose keywords occur in ``column_name``.

    The categories are checked in a fixed order: 'PII' first, then 'Financial'.
    A keyword matches when it is a substring of ``column_name``; the comparison
    is case-sensitive.

    Returns:
        'PII' or 'Financial' for the first matching category, or the string
        'None' when no keyword matches.
    """
    categories = (
        ('PII', ('ssn', 'social', 'security', 'dob', 'date_of_birth', 'email', 'phone', 'address')),
        ('Financial', ('credit', 'card', 'ccn', 'account', 'bank')),
    )
    for label, keywords in categories:
        for keyword in keywords:
            if keyword in column_name:
                return label
    return 'None'

# Classify every column name with flag_compliance_issues
df['Compliance_Issue'] = df['Column'].map(flag_compliance_issues)


In [None]:
# Write the clustered and flagged rows to a CSV file (without the index column)
df.to_csv(path_or_buf='rai_allprod_collumns_with_clusters_and_flags.csv', index=False)

# Show the first rows of the dataframe as the cell output
df.head(n=5)