# Notebook that prepares the data for Andmetalgud

In [86]:
# Import tools
import os, sys
import pandas as pd
import numpy as np

# Musealia

In [6]:
# List the raw musealia files.
# os.listdir() returns names in arbitrary order, while the loading step keeps
# only the first occurrence of every column name, so the names are sorted to
# make the merged table independent of the listing order. Hidden entries
# (e.g. ".ipynb_checkpoints") and sub-directories are skipped because they
# cannot be read as data files.
mus_dir = "./raw_data/musealia"
mus_files = sorted(
    name
    for name in os.listdir(mus_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(mus_dir, name))
)

In [27]:
# Read all files
# Each file becomes a frame indexed by MUSEAAL_ID. A column whose name was
# already provided by an earlier file is dropped, so every column name ends up
# in exactly one frame of `file_frames`.
file_frames = []
existing_cols = []
for filename in mus_files:
    filepath = os.path.join(mus_dir, filename)
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk.
    # NOTE(review): the warnings this cell printed point at this read_csv call,
    # presumably DtypeWarning about mixed types -- confirm.
    df = pd.read_csv(filepath, low_memory=False)
    df = df.set_index("MUSEAAL_ID")

    # Remove duplicated rows: keep only the first row of every MUSEAAL_ID
    df = df[~df.index.duplicated(keep='first')]

    # Keep only the columns that no earlier table has provided and remember
    # them for the following tables
    new_cols = [col for col in df.columns if col not in existing_cols]
    existing_cols.extend(new_cols)
    df = df.loc[:, new_cols]
    file_frames.append(df)

  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)


In [28]:
# Combine the per-file frames side by side into one table
# (the frames are joined column-wise on their index)
musealia_df = pd.concat(file_frames, axis="columns")

## Check validity

In [38]:
# Count the index entries that repeat an earlier entry
dup_museals = musealia_df.index.duplicated().sum()
print("Number of duplicates in data:", dup_museals)

# The column counts of the individual frames should add up to the column
# count of the merged table
n_cols_in_frames = sum(frame.shape[1] for frame in file_frames)
col_diff = n_cols_in_frames - musealia_df.shape[1]
print("Difference between the number of columns:", col_diff)

Number of duplicates in data: 0
Difference between the number of columns: 0


In [46]:
# Count the missing values in every column
na_per_column = musealia_df.isna().sum(axis=0)
na_per_column

TAIS_NR            261583
NIMETUS            261577
KS                 275326
MATERJAL           261577
KOMMENTAAR         580444
SYNDMUSE_LIIK      259356
ASUKOHT            555086
ALGUS              384529
LOPP               548191
ENNE_KR            402723
RIIK_ADMIN_KOND    259356
OSALEJA_ROLL       376469
OSALEJA            376469
KIHELKOND          597136
TEKST              589552
LIIK               589552
TEHNIKA            392788
PARAMEETER         341236
YHIK               341236
VAARTUS            341241
ACR                  5350
TRT                150409
TRS                164300
TRJ                455572
TRL                578009
KT                   5380
KJ                 200301
KL                 298037
ELEMENTIDE_ARV       5256
TULMELEGEND        440444
ON_ORIGINAAL       105968
ESMANE_YLDINFO     482448
KAHJUSTUSED        549771
SEISUND              5256
OLEMUS             103723
VARV               547155
LISATEKST          458531
dtype: int64

## Select data for competition

In [96]:
N_SELECTED = 10000   # number of rows sampled for the competition
SELECTION_SEED = 42  # fixed seed so a re-run of the notebook picks the same rows

# Keep only the rows where OLEMUS is present, then draw the sample from them.
# Without random_state every run would select a different set of rows.
selected_df = musealia_df[musealia_df.OLEMUS.notna()]
selected_df = selected_df.sample(n=N_SELECTED, random_state=SELECTION_SEED)

# Put OLEMUS as the last column of the df
olemus = selected_df.pop('OLEMUS')
selected_df['OLEMUS'] = olemus

# Reindex for better saving: the current index becomes a regular column and
# the rows get a fresh 0..n-1 index
selected_df = selected_df.reset_index()

## Train-test split

In [106]:
TRAIN_FRACTION = 0.7  # share of the selected rows used for training
SPLIT_SEED = 42       # fixed seed so the split and the sample submission are reproducible

# Select 70% of data for training
train_df = selected_df.sample(
    n=int(selected_df.shape[0] * TRAIN_FRACTION),
    random_state=SPLIT_SEED,
)

# Test data & solution: every row whose index label was not drawn for training
held_out_df = selected_df.loc[~selected_df.index.isin(train_df.index)]

# The solution keeps the id together with the true OLEMUS value; the test set
# is the same rows without OLEMUS. Both are independent copies, so neither
# modifies `selected_df` or the other frame.
solution_df = held_out_df[['MUSEAAL_ID', 'OLEMUS']].copy()
test_df = held_out_df.drop(columns='OLEMUS')

# Every selected row must land in exactly one of the two parts
assert len(train_df) + len(test_df) == len(selected_df)

# Sample submission: same layout as the solution, but OLEMUS is filled with
# values drawn at random from the classes seen in the training data
result_vals = train_df.OLEMUS.unique()
sample_submission_df = solution_df.copy(deep=True)
sample_submission_df['OLEMUS'] = np.random.RandomState(SPLIT_SEED).choice(
    result_vals, solution_df.shape[0]
)



In [110]:
# https://www.kaggle.com/community-competitions-setup-guide
# Write data
result_path = "./prepped_data/musealia"

# to_csv does not create missing directories, so make sure the target exists
os.makedirs(result_path, exist_ok=True)

# File name -> frame to store under that name; all files are written without
# the index column
outputs = {
    'train.csv': train_df,
    'test.csv': test_df,
    'solution.csv': solution_df,
    'sample_submission.csv': sample_submission_df,
}
for out_name, out_frame in outputs.items():
    out_frame.to_csv(os.path.join(result_path, out_name), index=False)