# Skillset preparation

In [2]:
# Imports
from PIL import Image
import pandas as pd
import numpy as np
import os,sys

In [34]:
# Read raw files
raw_skillfile_path = "./raw_data/kutsekoda/skillfiles"

# os.listdir returns entries in arbitrary order, and the text IDs generated
# below depend on row position, so the names are sorted to keep IDs stable
# between runs. Only regular files are kept so that directories (for example
# a .ipynb_checkpoints folder) never reach pd.read_csv.
skillfiles = sorted(
    name for name in os.listdir(raw_skillfile_path)
    if os.path.isfile(os.path.join(raw_skillfile_path, name))
)

# Turn every file into long format with only existing values
long_raw_dfs = []
for f in skillfiles:
    raw_df = pd.read_csv(os.path.join(raw_skillfile_path, f))

    # The text becomes the index so that every remaining column is melted
    raw_df = raw_df.set_index('text')

    file_long_df = (
        raw_df
        .melt(
            value_vars=raw_df.columns.to_list(),
            var_name='skill_nr',
            value_name='skill',
            ignore_index=False,
        )
        # The name of the source column is not needed, only its value
        .drop(columns='skill_nr')
        # Drop all missing values
        .dropna(axis=0)
        # Text back into a regular column for easier concat later
        .reset_index()
    )

    # Add df to list for later concatenation
    long_raw_dfs.append(file_long_df)

long_df = pd.concat(long_raw_dfs, ignore_index=True)

# Generate identifiers: one per row of the long table.
# NOTE(review): a text that carries several skills occupies several rows here
# and therefore receives several different IDs - confirm this is intended.
long_df['text_id'] = [f'text{i}' for i in range(long_df.shape[0])]
long_df = long_df.set_index('text_id')

# Remove original texts from the long table and save them into their own file
unique_texts = long_df.pop("text")
unique_texts.name = "text"
text_df = unique_texts.to_frame()
# Make sure the target directory exists before writing
os.makedirs('./prepped_data/kutsekoda', exist_ok=True)
text_df.to_csv('./prepped_data/kutsekoda/texts_about_skills.csv')

long_df



Unnamed: 0_level_0,skill
text_id,Unnamed: 1_level_1
text0,2.7.9
text1,2.7.9
text2,2.11.5
text3,2.11.5
text4,2.7.2
...,...
text636,1.1.4
text637,1.1.9
text638,1.1.9
text639,1.2.7


In [35]:
# Number of rows per skill code, most frequent skill first
rows_per_skill = long_df.groupby('skill').size()
skill_count = rows_per_skill.sort_values(ascending=False)
skill_count.to_frame()

Unnamed: 0_level_0,0
skill,Unnamed: 1_level_1
1.3.9,70
1.2.1,35
2.11.5,27
1.1.8,26
1.3.2,23
...,...
2.3.13,1
2.2.7,1
2.1.1,1
1.1.7,1


In [51]:
# Keep only skills with at least MIN_SKILL_INSTANCES rows
MIN_SKILL_INSTANCES = 4
skills_to_keep = skill_count[skill_count >= MIN_SKILL_INSTANCES]

# Give every kept skill a short identifier; numbering follows the order of
# skill_count, so s0 is its first entry
skill_dict = {f"s{i}": obj for i, obj in enumerate(skills_to_keep.keys())}
skill_df = pd.DataFrame({
    'skill_id': list(skill_dict.keys()),
    'skill': list(skill_dict.values()),
})

# Read skill descriptions, rename the columns and keep only the renamed ones
col_name_map = {"Kood": "skill", "Oskus": "skill_name", "Lühikirjeldus": "skill_description"}
skill_descriptions_df = (
    pd.read_csv('./raw_data/kutsekoda/oskused.csv')
    .rename(columns=col_name_map)
    .loc[:, list(col_name_map.values())]
)

# The merge below is an inner join: a kept skill whose code is absent from the
# description file disappears from skill_df. Report such skills so the loss is
# visible instead of silent.
has_description = skill_df['skill'].isin(skill_descriptions_df['skill'])
if not has_description.all():
    undescribed = skill_df.loc[~has_description]
    print(
        "Skills without a description (dropped by the merge):",
        dict(zip(undescribed['skill_id'], undescribed['skill'])),
    )

# Merge skillset to keep with skill descriptions
skill_df = skill_df.merge(skill_descriptions_df, on='skill')
skill_df

Unnamed: 0,skill_id,skill,skill_name,skill_description
0,s0,1.3.9,Digitaalne kirjaoskus,Oskus mõista ja kasutada digitaalseid süsteeme...
1,s1,1.2.1,Analüüsioskus,Oskus dekonstrueerida infot väiksemateks üksus...
2,s2,2.11.5,Seadmete kasutamine ja käitamine,Oskus töövahendeid kasutada. Võime rakendada t...
3,s3,1.1.8,Juhistest ja nõuetest lähtumine,Oskus töö tegemiseks vajalikke juhiseid ja ree...
4,s4,1.3.2,Meeskonnatöö- ja koostööoskus,Oskus kollektiivi vajadustega ning ühise eesmä...
5,s5,2.3.4,Meeskonna juhtimine,Võime planeerida ja korraldada meeskonna ülesa...
6,s6,2.11.4,"Seadistamine, kasutuseks valmis seadmine ja ho...","Võime seadistada masinaid või seadmeid, asenda..."
7,s7,2.3.2,"Kavade, strateegiate ja plaanide väljatöötamine",Tulevikuolukorra ettekujutamine ning selle saa...
8,s8,1.3.3,Suhtlemisoskus,Oskus luua teise inimesega füüsiline ja psühho...
9,s9,1.3.8,Märgisüsteemide kasutamine ja/või koostamine,Oskus edastada ja mõista ideid ja infot märgil...


In [52]:
# Long dataframe that includes only skills that have several instances.
# skills_to_keep is indexed by skill code, so membership is tested against
# its index.
rows_to_keep = long_df['skill'].isin(skills_to_keep.index)

# Move text_id from the index into a regular column. reset_index() returns a
# new frame, which avoids mutating a filtered slice of long_df in place.
keep_df = long_df[rows_to_keep].reset_index()

# Add label IDs, names and descriptions. This is an inner join, so rows whose
# skill is not present in skill_df are dropped.
keep_df = keep_df.merge(skill_df, on='skill')

keep_df

Unnamed: 0,text_id,skill,skill_id,skill_name,skill_description
0,text0,2.7.9,s13,Detailide kokkupanemine,Erinevatest materjalidest detailide liitmine m...
1,text1,2.7.9,s13,Detailide kokkupanemine,Erinevatest materjalidest detailide liitmine m...
2,text438,2.7.9,s13,Detailide kokkupanemine,Erinevatest materjalidest detailide liitmine m...
3,text439,2.7.9,s13,Detailide kokkupanemine,Erinevatest materjalidest detailide liitmine m...
4,text440,2.7.9,s13,Detailide kokkupanemine,Erinevatest materjalidest detailide liitmine m...
...,...,...,...,...,...
560,text468,2.8.11,s36,Juhendmaterjali väljatöötamine,"Dokumentide, detailsete juhiste, jooniste või ..."
561,text469,2.8.11,s36,Juhendmaterjali väljatöötamine,"Dokumentide, detailsete juhiste, jooniste või ..."
562,text470,2.8.11,s36,Juhendmaterjali väljatöötamine,"Dokumentide, detailsete juhiste, jooniste või ..."
563,text471,2.8.11,s36,Juhendmaterjali väljatöötamine,"Dokumentide, detailsete juhiste, jooniste või ..."


# Format data for competition

In [53]:
# Build a wide indicator table first; it is easier to collapse into a
# space separated list afterwards.
# Every row gets a constant 1 that the pivot can spread into columns.
keep_df = keep_df.assign(value=1)

# One row per text_id, one column per skill_id; missing combinations become 0
keep_df_wide = keep_df.pivot_table(
    index='text_id',
    columns='skill_id',
    values='value',
    fill_value=0,
)
keep_df_wide

skill_id,s0,s1,s10,s11,s12,s13,s15,s16,s17,s18,...,s46,s47,s48,s49,s5,s50,s6,s7,s8,s9
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
text0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
text1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
text10,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
text100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
text101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
text95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
text96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
text97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
text98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# Gather skills into a space separated list per text

# Column labels as strings, computed once instead of once per row
skill_labels = [str(col) for col in keep_df_wide.columns]

# Gather into a dictionary: text_id -> labels of the columns whose value is
# truthy in that row, joined by single spaces (empty string if none)
obj_dict = {
    text_id: " ".join(label for label, flag in zip(skill_labels, flags) if flag)
    for text_id, flags in zip(keep_df_wide.index, keep_df_wide.to_numpy())
}

# Into dataframe
gathered_df = pd.DataFrame({
    'text_id': list(obj_dict.keys()),
    'skills': list(obj_dict.values()),
})

gathered_df

Unnamed: 0,text_id,skills
0,text0,s13
1,text1,s13
2,text10,s0
3,text100,s26
4,text101,s42
...,...,...
560,text95,s5
561,text96,s26
562,text97,s26
563,text98,s26


# Split data into train and test sets

In [55]:
# Fixed seed so that the split and the sample submission are identical on
# every run
SEED = 42
TRAIN_FRACTION = 0.7

# Select 70% of data for training
train_df = gathered_df.sample(
    n=int(gathered_df.shape[0] * TRAIN_FRACTION),
    random_state=SEED,
)

# Solution: every row that was not sampled for training, labels included
solution_df = gathered_df.loc[~gathered_df.index.isin(train_df.index)].copy(deep=True)

# Test data: the solution rows without their labels. drop() returns a new
# frame, so no filtered slice is mutated in place.
test_df = solution_df.drop(columns='skills')

# Sample submission: labels drawn at random (with replacement) from the
# training labels
result_vals = train_df['skills'].to_list()
sample_submission_df = solution_df.copy(deep=True)
rng = np.random.default_rng(SEED)
sample_submission_df['skills'] = rng.choice(result_vals, solution_df.shape[0])

In [56]:
# Compare the random sample submission with the solution row by row;
# True marks a row where the random label equals the real one
submitted_skills = sample_submission_df['skills']
expected_skills = solution_df['skills']
submitted_skills == expected_skills

1      False
2      False
6      False
12     False
21     False
       ...  
547    False
549    False
550    False
562    False
563    False
Name: skills, Length: 170, dtype: bool

In [45]:
# Write data
result_path = "./prepped_data/kutsekoda"
# Make sure the target directory exists before writing
os.makedirs(result_path, exist_ok=True)

train_df.to_csv(os.path.join(result_path, 'train.csv'), index=False)
test_df.to_csv(os.path.join(result_path, 'test.csv'), index=False)
solution_df.to_csv(os.path.join(result_path, 'solution.csv'), index=False)
sample_submission_df.to_csv(os.path.join(result_path, 'sample_submission.csv'), index=False)

# Labels: written without the 'skill' code column. drop() returns a new frame,
# so skill_df keeps its 'skill' column and this cell, as well as the cells
# that merge on 'skill', can be re-run safely.
skill_df.drop(columns='skill').to_csv(os.path.join(result_path, 'skills.csv'), index=False)