In [1]:
import time
import glob
import re
import json
from gensim.parsing.preprocessing import strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_short, stem_text
from collections import defaultdict
import string
import numpy as np
import pandas as pd
from langdetect import detect
from nltk.stem.snowball import SnowballStemmer
import nltk

In [2]:
# start time to save start point of the preprocessing
start_time = time.time()

# define the document path of the json files to process
document_paths = glob.glob("*.json")



In [3]:
items_from_all_tables = {}
for document_path in document_paths: # iterate through to path to access all json files
    document_name = re.split("/",document_path)[-1]
    with open(document_path, "rb") as handle:
        tables = json.load(handle)
    for table_id in tables: # iterate through all tables in a json file
        items = [cell["text"] for cell in tables[table_id]["cells"].values() if cell["column"] == 0] #save text of the table in items
        items_from_all_tables[str(document_name+"_"+table_id)] = items #save the text by using the document_name and table_id as an identifier

In [4]:
# define remove punctuation function
def remove_all_punctuation(word: str) -> str:
    for p in string.punctuation:
        word = str(word).replace(p, "")
    return word

In [5]:
# remove_all_but_alpha function
def remove_all_but_alpha(word: str) -> str:
    word = re.sub("[^aA-zZäöü ]", "", word)
    return word

In [6]:
# define function to clean text
def clean_text(text):
    """ Remove punctuation, numbers, whitespace, small words"""
    text = remove_all_punctuation(text)
    text = remove_all_but_alpha(text)
    word_list = text.split()
    word_list = [word.lower() for word in word_list if len(word)>2]
    text_cleaned = " ".join(word_list)
    return text_cleaned

In [7]:
# Clean all items
clean_items_from_tables = defaultdict(list)
for table_id, item_list in items_from_all_tables.items():
    clean_items_from_tables[table_id] = []
    for item in item_list:
        item = clean_text(item)
        if item:
            clean_items_from_tables[table_id].append(item)

In [8]:
items_per_table = [" ".join(text) for text in clean_items_from_tables.values()]
key_per_table = ["".join(key) for key in clean_items_from_tables.keys()]
items_per_table_array = np.array(items_per_table)

In [9]:
# initalize a dataframe to store the entries of each table in a row
df = pd.DataFrame(index=np.arange(len(items_per_table_array)))
df['content']=None
df['content_stemmed']=" "
df['id']=None
df['table_id']=None
df['json']=None
df['type']=None

In [10]:
# filling the dataframe with the data
for i in range(len(df)):
    df['content'][i]=items_per_table_array[i]
    df['id'][i]=key_per_table[i].split(".")   
    df['json'][i]=df['id'][i][0]
    df['table_id'][i]=df['id'][i][1]
df=df.drop(columns=['id'])

In [11]:
# remove empty entries
print("entries before:"+ str(len(df)))
df['content1']=df['content']
df['content1'].replace('', np.nan, inplace=True)
df.dropna(subset=['content1'], inplace=True)
print("entries after :"+str(len(df)))
df=df.drop(columns=['content1'])
df=df.reset_index()

entries before:4135
entries after :4091


In [12]:
df.head()

Unnamed: 0,index,content,content_stemmed,table_id,json,type
0,0,name gmbhweling,,json_table_0,0,
1,1,anlagevermögen sonstige vermögensgegenstände k...,,json_table_1,0,
2,2,eigenkapital gezeichnetes kapital ergebnistvor...,,json_table_2,0,
3,3,name world gmbh vormals bhk zwischenhandelsges...,,json_table_0,1,
4,4,anlagevermögen sachanlagen umlaufvermögen ford...,,json_table_1,1,


In [13]:
# remove english tables
print("entries before:"+ str(len(df)))
j=0
for i in range(len(df)):
    if detect(df.content[i])=='en':
        j=j+1       
        df=df.drop(index=i)
print("entries after:"+ str(len(df)))
df=df.reset_index()

entries before:4091
entries after:3898


In [14]:
# stemm the content and save them in content_stemmed
stemmer = SnowballStemmer("german")
for i in range(len(df)):
    for item in nltk.word_tokenize(df.content[i]):
        df.content_stemmed[i]=str(df.content_stemmed[i])+' '+ stemmer.stem(item)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,level_0,index,content,content_stemmed,table_id,json,type
0,0,0,name gmbhweling,nam gmbhweling,json_table_0,0,
1,1,1,anlagevermögen sonstige vermögensgegenstände k...,anlagevermog sonstig vermogensgegenstand kas...,json_table_1,0,
2,2,2,eigenkapital gezeichnetes kapital ergebnistvor...,eigenkapital gezeichnet kapital ergebnistvor...,json_table_2,0,
3,3,3,name world gmbh vormals bhk zwischenhandelsges...,nam world gmbh vormal bhk zwischenhandelsges...,json_table_0,1,
4,4,4,anlagevermögen sachanlagen umlaufvermögen ford...,anlagevermog sachanlag umlaufvermog forder u...,json_table_1,1,


In [15]:
# save dataframe in csv with the name tables_preprocessed
df.to_csv('tables_preprocessed')

In [16]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 40.7199969291687 seconds ---
