# Notebook that prepares the musealia data for Andmetalgud

## Summary
The dataset describes objects held in Estonian museums (musealia). Each row represents a single object that is uniquely identified by its 'id' (the first column). Each column represents one variable of the object (e.g. name, material, location). 
Most of the variables are categorical (e.g. material, color, damages), but there are also textual variables (e.g. initial_info, additional_text, ...). In addition, there is one numerical variable (element_count) and one binary variable (is_original). All values are in Estonian.

Currently, many museum objects are missing their "type" (whether the object is a photo, document, book, letter, etc.). Your challenge is to predict the "type" of an object from its other variables. 

In [1]:
# Import tools
import os, sys
import pandas as pd
import numpy as np

# Musealia

In [2]:
# List the raw musealia files.
# os.listdir returns entries in arbitrary order; sorting keeps the column order
# (and the choice of which file's copy of a shared column is kept when the
# files are read) the same on every machine and every run.
mus_dir = "./raw_data/musealia"
mus_files = sorted(os.listdir(mus_dir))

In [3]:
# Read every raw file into a frame indexed by MUSEAAL_ID, keeping only the
# columns that no previously read file has already contributed.
file_frames = []    # one cleaned frame per input file
existing_cols = []  # column names already contributed by earlier files
for filename in mus_files:
    filepath = os.path.join(mus_dir, filename)
    df = pd.read_csv(filepath).set_index("MUSEAAL_ID")

    # Keep only the first row for every repeated MUSEAAL_ID
    df = df[~df.index.duplicated(keep='first')]

    # Columns of this file that have not been seen in an earlier file
    new_cols = [col for col in df.columns if col not in existing_cols]
    existing_cols.extend(new_cols)

    file_frames.append(df.loc[:, new_cols])


In [4]:
# Put the per-file frames side by side (aligned on their shared index), then
# move that index back into an ordinary column so it is saved with the data.
musealia_df = pd.concat(file_frames, axis=1).reset_index()

## Check validity

In [5]:
# Count repeated object ids. The frame's index is the fresh RangeIndex created
# by reset_index(), which can never contain duplicates, so the check has to be
# made on the MUSEAAL_ID column itself.
dup_museals = int(musealia_df["MUSEAAL_ID"].duplicated().sum())
print("Number of duplicates in data:", dup_museals)

# Does the number of columns match?
# The per-file frames keep MUSEAAL_ID as their index while the merged frame
# holds it as a regular column, so a difference of -1 is the expected result.
col_diff = sum(frame.shape[1] for frame in file_frames) - musealia_df.shape[1]
print("Difference between the number of columns:", col_diff)

Number of duplicates in data: 0
Difference between the number of columns: -1


In [6]:
# Missing-value count for every column
musealia_df.isnull().sum()

MUSEAAL_ID              0
TAIS_NR            261583
NIMETUS            261577
KS                 275326
MATERJAL           261577
KOMMENTAAR         580444
SYNDMUSE_LIIK      259356
ASUKOHT            555086
ALGUS              384529
LOPP               548191
ENNE_KR            402723
RIIK_ADMIN_KOND    259356
OSALEJA_ROLL       376469
OSALEJA            376469
KIHELKOND          597136
TEKST              589552
LIIK               589552
TEHNIKA            392788
PARAMEETER         341236
YHIK               341236
VAARTUS            341241
ACR                  5350
TRT                150409
TRS                164300
TRJ                455572
TRL                578009
KT                   5380
KJ                 200301
KL                 298037
ELEMENTIDE_ARV       5256
TULMELEGEND        440444
ON_ORIGINAAL       105968
ESMANE_YLDINFO     482448
KAHJUSTUSED        549771
SEISUND              5256
OLEMUS             103723
VARV               547155
LISATEKST          458531
dtype: int64

### Rename columns

In [7]:
# Source column name -> English column name used in the published data
col_map = {
    'MUSEAAL_ID': 'id',
    'TAIS_NR': 'full_nr',
    'NIMETUS': 'name',
    'KS': 'ks',
    'MATERJAL': 'material',
    'KOMMENTAAR': 'commentary',
    'SYNDMUSE_LIIK': 'event_type',
    'ASUKOHT': 'location',
    'ALGUS': 'start',
    'LOPP': 'end',
    'ENNE_KR': 'before_Christ',
    'RIIK_ADMIN_KOND': 'country_and_unit',
    'OSALEJA_ROLL': 'participants_role',
    'OSALEJA': 'participant',
    'KIHELKOND': 'parish',
    'TEKST': 'text',
    'LIIK': 'class',
    'TEHNIKA': 'technique',
    'PARAMEETER': 'parameter',
    'YHIK': 'unit',
    'VAARTUS': 'value',
    'ACR': 'museum_abbr',
    'TRT': 'musealia_mark',
    'TRS': 'musealia_seria_nr',
    'TRJ': 'musealia_queue_nr',
    'TRL': 'musealia_additional_nr',
    'KT': 'collection_mark',
    'KJ': 'collection_queue_nr',
    'KL': 'collection_additional_nr',
    'ELEMENTIDE_ARV': 'element_count',
    'TULMELEGEND': 'legend',
    'ON_ORIGINAAL': 'is_original',
    'ESMANE_YLDINFO': 'initial_info',
    'KAHJUSTUSED': 'damages',
    'SEISUND': 'state',
    'OLEMUS': 'type',
    'VARV': 'color',
    'LISATEKST': 'additional_text',
}

# Relabel the frame in place; a column without an entry in col_map keeps its name.
musealia_df.columns = [col_map.get(label, label) for label in musealia_df.columns]
musealia_df.columns

Index(['id', 'full_nr', 'name', 'ks', 'material', 'commentary', 'event_type',
       'location', 'start', 'end', 'before_Christ', 'country_and_unit',
       'participants_role', 'participant', 'parish', 'text', 'class',
       'technique', 'parameter', 'unit', 'value', 'museum_abbr',
       'musealia_mark', 'musealia_seria_nr', 'musealia_queue_nr',
       'musealia_additional_nr', 'collection_mark', 'collection_queue_nr',
       'collection_additional_nr', 'element_count', 'legend', 'is_original',
       'initial_info', 'damages', 'state', 'type', 'color', 'additional_text'],
      dtype='object')

## Reduce the number of types
The 60 most frequent types are taken as the starting point, and near-duplicates are removed manually to produce the list below.

In [8]:
# Number of objects per type, most frequent first
all_types_ordered = musealia_df.groupby(by='type').size().sort_values(ascending=False)
# The 60 most frequent types; inspect them with top_types.keys() when revising
# the hand-curated list below.
top_types = all_types_ordered[:60]

# Manually curated list of types that the competition asks to predict
selected_types = ['foto', 'fotonegatiiv', 'kavand/joonis/eskiis', 'plakat', 'kava',
       'arheoloogiline leid', 'kiri', 'raamat', 'käsikiri', 'dokument',
       'noodid', 'graafika', 'pitser/templijäljend', 'münt',
       'käsikiri, laul/ vokaalmuusika', 'digitaalne kujutis', 'postkaart',
       'väiketrükis', 'ajakiri', 'ajaleht', 'medal', 'kutse', 'telegramm',
       'helisalvestis', 'käsikiri, muusikateos', 'diapositiiv', 'silt/märk', 'joonistus',
       'foto, postkaart', 'kalender', 'karikatuur',
       'fotomaterjal', 'aukiri/auaadress', 'nõu/anum', 'pitsat', 'trükinoot',
       'muusikainstrument', 'album', 'paber', 'maal',
       'fotonegatiiv, fotonegatiiv', 'kleit', 'skulptuur', 'kott',
       'kaustik/vihik', 'lint/pael', 'märkmed', 'nukk',
       'kiri, postkaart', 'lina/linik', 'ehe', 'laegas/karp',
       'käsikiri, noodid', 'pakend', 'tunnistus']

# Keep only the rows whose type is in the curated list. Series.isin does the
# membership test in one vectorised pass instead of a Python-level loop over
# every row; rows with a missing type are not selected.
selected_types_df = musealia_df.loc[musealia_df['type'].isin(selected_types)]

## Select data for competition

In [9]:
# Keep rows that have the type existing
selected_df = selected_types_df[~selected_types_df.type.isna()]

# Draw the competition subset. The fixed random_state makes the published data
# reproducible on a re-run, and capping n at the number of available rows keeps
# sample() from raising when fewer than 20000 rows remain.
selected_df = selected_df.sample(n=min(20000, len(selected_df)), random_state=42)

# Put type as the last column of the df.
# The popped column is held in `type_column` rather than `type`, so the builtin
# type() is not shadowed for the rest of the notebook.
type_column = selected_df.pop('type')
selected_df['type'] = type_column

In [10]:
# Number of sampled objects per type, most frequent first
(
    selected_df
    .groupby('type')
    .size()
    .sort_values(ascending=False)
)

type
foto                             5352
fotonegatiiv                     3140
kavand/joonis/eskiis             1216
arheoloogiline leid              1118
plakat                           1095
kava                             1065
kiri                              908
raamat                            721
käsikiri                          682
dokument                          648
graafika                          338
noodid                            306
pitser/templijäljend              286
münt                              285
digitaalne kujutis                271
käsikiri, laul/ vokaalmuusika     268
postkaart                         211
väiketrükis                       161
ajakiri                           154
ajaleht                           143
kutse                             140
medal                             137
telegramm                         104
helisalvestis                      91
diapositiiv                        86
käsikiri, muusikateos              78
foto, p

## Train-test split

In [11]:
# Select 70% of data for training (seeded so the split is reproducible)
train_df = selected_df.sample(n=int(selected_df.shape[0] * 0.7), random_state=42)

# Every row that was not drawn for training, still carrying its label
labelled_test_df = selected_df.loc[~selected_df.index.isin(train_df.index)]

# Solution: id plus the true type. Test data: the same rows without the label.
# drop() returns a new frame, so the slice of selected_df is never mutated.
solution_df = labelled_test_df[['id', 'type']].copy()
test_df = labelled_test_df.drop(columns='type')

# Sample submission: a random guess among the types seen in training
result_vals = train_df['type'].unique()
sample_submission_df = solution_df.copy(deep=True)
rng = np.random.default_rng(42)  # seeded so the file is identical on every run
sample_submission_df['type'] = rng.choice(result_vals, size=solution_df.shape[0])



In [18]:
# https://www.kaggle.com/community-competitions-setup-guide 
# Write data
result_path = "./prepped_data/musealia"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(result_path, exist_ok=True)

# Competition files; index=False leaves the row index out of the CSVs
train_df.to_csv(os.path.join(result_path, 'train.csv'), index=False)
test_df.to_csv(os.path.join(result_path, 'test.csv'), index=False)
solution_df.to_csv(os.path.join(result_path, 'solution.csv'), index=False)
sample_submission_df.to_csv(os.path.join(result_path, 'sample_submission.csv'), index=False)

# Unique types to predict
unique_types = pd.DataFrame({'type': selected_types})
unique_types.to_csv(os.path.join(result_path, 'unique_types.csv'), index=False)