In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import re

# Read the column inventory into a DataFrame; path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='rai_allprod_collumns_clean.csv')


In [3]:
# prepare data
# Normalise column names to lower case (missing values stay missing under .str).
df['Column'] = df['Column'].str.lower()
# Keep only rows whose column name is present and not blank. The explicit
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[df['Column'].notna() & (df['Column'].str.strip() != '')].copy()
df['Table'] = df['Table'].str.lower()

# Concatenate Schema and DBName columns.
# NOTE: despite the column name, the value is built as "<Schema>_<DBName>".
df['DBName_Schema'] = df['Schema'] + '_' + df['DBName']

In [4]:
# Summarise the frame: dtypes and memory usage first, then null counts per column.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" line.
df.info()
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Index: 11346885 entries, 0 to 11346896
Data columns (total 9 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   UID            int64 
 1   DBType         object
 2   Instance       object
 3   DBName         object
 4   Schema         object
 5   Table          object
 6   Column         object
 7   ColumnType     object
 8   DBName_Schema  object
dtypes: int64(1), object(8)
memory usage: 865.7+ MB
None
UID              0
DBType           0
Instance         0
DBName           0
Schema           0
Table            0
Column           0
ColumnType       0
DBName_Schema    0
dtype: int64


In [5]:
# Number of rows per distinct DBName_Schema value, most frequent first.
unique_counts = df['DBName_Schema'].value_counts()
print(unique_counts)
# Total number of rows across all values. Calling .sum() on the string column
# itself would concatenate every value into one enormous string.
print(unique_counts.sum())

DBName_Schema
dbo_WSI_HPCMDBu_P                                                                      587258
dbo_CM_PP5                                                                             506870
storage_SLA-Data                                                                       211344
dbo_CM_3P2                                                                              69608
dbo_OperationsManager                                                                   56395
                                                                                        ...  
dbo__META                                                                                   1
deleted_schema_60269b6d_2162_4456_b204_aeefb788c399_BPM_UASProcessMiningWarehouse_P         1
Bloomberg_MWH_Warehouse                                                                     1
identity_CxDB                                                                               1
schema_60269b6d_2162_4456_b204_aeefb788c399_BP

In [26]:
# Remove the identifier and the two source columns already merged into
# DBName_Schema; drop() returns a new frame, which is rebound to df.
df = df.drop(columns=['UID', 'Schema', 'DBName'])

In [27]:
# Define features (X). The clustering below is unsupervised, so no target (y) is defined.

In [29]:
# Build a sparse TF-IDF matrix from the column names: word-level tokens,
# with scikit-learn's built-in English stop-word list removed.
vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
X = vectorizer.fit_transform(raw_documents=df['Column'])

In [30]:
# Partition the TF-IDF vectors into 10 clusters (seeded for repeatability)
# and store each row's cluster label on the frame.
kmeans = KMeans(random_state=42, n_clusters=10)
kmeans.fit(X)
df['Cluster'] = kmeans.labels_

In [31]:
# Identify columns likely to contain sensitive information
sensitive_keywords = ['ssn', 'social', 'security', 'dob', 'date_of_birth', 'credit', 'card', 'ccn', 'account', 'bank', 'email', 'phone', 'address']
# Vectorised substring search: a single regex alternation over all keywords
# replaces a Python-level lambda evaluated once per row. Keywords are escaped
# so they match literally, and na=False marks a missing name as not sensitive
# instead of raising.
sensitive_pattern = '|'.join(re.escape(keyword) for keyword in sensitive_keywords)
df['Sensitive'] = df['Column'].str.contains(sensitive_pattern, regex=True, na=False)


In [32]:
# Flag potential compliance issues
def flag_compliance_issues(column_name):
    """Return the first compliance category whose keyword occurs in a column name.

    Matching is a case-insensitive substring test, so a keyword embedded in a
    longer word also matches (e.g. 'classname' contains 'ssn'). Categories are
    checked in declaration order: 'PII' wins over 'Financial' when both match.

    Parameters
    ----------
    column_name : str
        Column name to classify. Non-string values (e.g. None or NaN) are
        treated as matching nothing.

    Returns
    -------
    str
        'PII', 'Financial', or the literal string 'None' when nothing matches.
    """
    patterns = {
        'PII': ['ssn', 'social', 'security', 'dob', 'date_of_birth', 'email', 'phone', 'address'],
        'Financial': ['credit', 'card', 'ccn', 'account', 'bank']
    }
    # A missing or non-text name cannot contain a keyword; avoid a TypeError.
    if not isinstance(column_name, str):
        return 'None'
    # Keywords are all lower case, so normalise the input before comparing.
    normalized_name = column_name.lower()
    for issue, keywords in patterns.items():
        if any(keyword in normalized_name for keyword in keywords):
            return issue
    return 'None'

# Classify every column name element-wise and store the category label.
df['Compliance_Issue'] = df['Column'].map(flag_compliance_issues)


In [33]:
# Persist the annotated frame (cluster label and flags) without the index column.
df.to_csv(path_or_buf='rai_allprod_collumns_with_clusters_and_flags.csv', index=False)

# Preview the first five rows as the cell's output.
df.head(5)

Unnamed: 0,DBType,Instance,DBName,Table,Column,ColumnType,Cluster,Sensitive,Compliance_Issue
0,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,statehourly_d461f6afa87b4259b908b29da01ede5c,inredstatemilliseconds,int,0,False,
1,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,statehourly_d461f6afa87b4259b908b29da01ede5c,inyellowstatemilliseconds,int,0,False,
2,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,statehourly_d461f6afa87b4259b908b29da01ede5c,indisabledstatemilliseconds,int,0,False,
3,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,statehourly_d461f6afa87b4259b908b29da01ede5c,inplannedmaintenancemilliseconds,int,0,False,
4,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,statehourly_d461f6afa87b4259b908b29da01ede5c,inunplannedmaintenancemilliseconds,int,0,False,
