# Help notebook to annotate the columns with the language

__This is only needed if the original data files have changed! The ZIP `metadata_with_lang.zip` already contains the annotated rows!__

The script reads the full file and uses the column 'TABLE_NAME' to detect the language and adds it in a new column ('LANG') at the end.



In [1]:
# install packages
!pip install lingua-language-detector
!pip install wordninja
!pip install panda

Collecting panda
  Downloading panda-0.3.1.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: panda
  Building wheel for panda (setup.py) ... [?25ldone
[?25h  Created wheel for panda: filename=panda-0.3.1-py3-none-any.whl size=7239 sha256=b5d9526cf59fe6ea02aee0a3529dcff4cc8e83dfd7f9f21574accf3afc97b363
  Stored in directory: /Users/domi/Library/Caches/pip/wheels/98/41/5b/6ca54e0b6a35e1b7248c12f56fcb753dfb7717fefaa0fb45f5
Successfully built panda
Installing collected packages: panda
Successfully installed panda-0.3.1


In [6]:
%%bash
# Prepare MSSQL data
#
# Rewrites INPUT_FILE in place so that commas inside parentheses no longer
# collide with the CSV column separator.

# Abort on the first failing command so a failed `sed` can never lead to the
# input file being overwritten by an empty or partial temp file.
set -euo pipefail

INPUT_FILE="data/mssql_allprod_collumns.csv"
OUTPUT_FILE="data/metadata.tmp"

head -3 "${INPUT_FILE}"

# because the column separator is ',', we need to make sure e.g. `numeric(17,10)` is replaced with `numeric(17;10)` in the csv file
# Each substitution replaces one comma inside a parenthesised group; the
# `:a` ... `ta` loop repeats it until the group contains no more commas, so
# `(a,b,c)` becomes `(a;b;c)` as well. Separate `-e` expressions are used so
# the label syntax is accepted by both GNU and BSD sed.
sed -E -e ':a' -e 's/(\([^()]*),([^()]*\))/\1;\2/' -e 'ta' "${INPUT_FILE}" > "${OUTPUT_FILE}"
echo "The file ${INPUT_FILE} has been processed and saved as ${OUTPUT_FILE}"

# Replace the original file with the cleaned version.
mv "${OUTPUT_FILE}" "${INPUT_FILE}"

UID,DBType,Instance,DBName,Schema,Table,Column,ColumnType
30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InRedStateMilliseconds,int
30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InYellowStateMilliseconds,int
The file data/mssql_allprod_collumns.csv has been processed and saved as data/metadata.tmp


In [None]:
import pandas as pd
import wordninja
from lingua import Language, LanguageDetectorBuilder

# CSV file to annotate. The commented-out assignments are alternative inputs;
# enable exactly one of them.
# fileName = 'data/oracle_metadata.csv'
# fileName = 'data/mysql_metadata.csv'
fileName = 'data/mssql_allprod_collumns.csv'

# Language detector limited to the two candidate languages, English and German.
langdetector = LanguageDetectorBuilder.from_languages(Language.ENGLISH, Language.GERMAN).build()

# Helper that tells whether the detector classifies a text as English.
def isEnglish(word):
    """Return True if `langdetector` detects `word` as English, otherwise False.

    `word` is passed unchanged to `langdetector.detect_language_of`; every
    result other than `Language.ENGLISH` yields False.
    """
    return langdetector.detect_language_of(word) == Language.ENGLISH
    
# read csv, but drop first line (the original header) and use our own column names instead
df = pd.read_csv(fileName, skiprows=1, names=['UID', 'DB_TYPE', 'INSTANCE', 'DB_NAME', 'SCHEMA', 'TABLE_NAME', 'COLUMN_NAME', 'DATA_TYPE'])

display(df.head())

# split each table name into separate words once (e.g. a concatenated identifier
# becomes a space-separated string) so the detector gets natural-looking text
df['SPLIT_TABLE_NAME'] = df['TABLE_NAME'].apply(lambda x: ' '.join(wordninja.split(x)))

# classify the already-split text instead of splitting every table name a second time;
# everything the detector does not report as English is labelled 'DE'
df['LANG'] = df['SPLIT_TABLE_NAME'].apply(lambda text: 'EN' if isEnglish(text) else 'DE')

display(df.head())

# drop the helper column; reassigning (rather than inplace=True) keeps the step explicit
df = df.drop(columns=['SPLIT_TABLE_NAME'])

display(df.head())

# write the resulting DF to a new csv file next to the input (suffix '.lang.csv')
df.to_csv(f"{fileName}.lang.csv", index=False)

Unnamed: 0,UID,DB_TYPE,INSTANCE,DB_NAME,SCHEMA,TABLE_NAME,COLUMN_NAME,DATA_TYPE
0,30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InRedStateMilliseconds,int
1,30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InYellowStateMilliseconds,int
2,30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InDisabledStateMilliseconds,int
3,30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InPlannedMaintenanceMilliseconds,int
4,30230,MSSQL,R0015702\RCHSCOMP02,OperationsManagerDW,State,StateHourly_D461F6AFA87B4259B908B29DA01EDE5C,InUnplannedMaintenanceMilliseconds,int
