# Notebook that prepares the multi-intent detection dataset for the competition

In [31]:
# Imports
from PIL import Image
import pandas as pd
import numpy as np
import os,sys

# Data wrangling

In [57]:
# Read the raw multi-intent data; the 'Query' column holds the customer text
# and the remaining columns hold that query's intent labels
raw_df = pd.read_csv('./raw_data/multi-intent/mitmikkavatsus_andmed.csv')

# Generate query identifiers and use them as the index for long-formatting
raw_df['query_id'] = [f'query{i}' for i in range(raw_df.shape[0])]
raw_df = raw_df.set_index('query_id')

# Remove original queries and save into file
unique_queries = raw_df.pop("Query")
unique_queries.name = "query"
query_df = unique_queries.to_frame()
# to_csv does not create missing folders, so make sure the target exists first
os.makedirs('./prepped_data/multi-intent', exist_ok=True)
query_df.to_csv('./prepped_data/multi-intent/customer_queries.csv')

# Some data structuring and cleaning of the intent labels
for c in raw_df.columns:
    raw_df[c] = (
        raw_df[c]
        .str.lower()                         # lowercase all values
        .str.strip()                         # remove leading and trailing whitespace
        .str.replace("_", " ", regex=False)  # replace underscores with spaces
    )

# Highlight missing values as 'missing'
raw_df = raw_df.fillna('missing')

raw_df

Unnamed: 0_level_0,Intent_1,Intent_2,Intent_3,Intent_4
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
query0,külastasin asutust,arve tasumine,probleemist informeerimine,missing
query1,hommikusöögi tellimine,probleemist informeerimine,kaebuse esitamine,missing
query2,müra,liikluseksami tulemuse kestvus,missing,missing
query3,avalduse esitamine,müra,probleemist informeerimine,missing
query4,probleemist informeerimine,avalduse esitamine,müra,missing
...,...,...,...,...
query245,id-kaardi uuendamine,id-kaardi väljastamine,missing,missing
query246,id-kaardi uuendamine,id-kaardi väljastamine,missing,missing
query247,id-kaardi uuendamine,id-kaardi väljastamine,missing,missing
query248,tervitus,id-kaardi uuendamine,id-kaardi väljastamine,missing


## Keep intents with multiple instances

In [40]:
# Reshape to long format: one row per (query, intent column) pair,
# with query_id kept as the index
long_df = (
    raw_df
    .melt(
        value_vars=list(raw_df.columns),
        var_name='intent_col',
        value_name='intent',
        ignore_index=False,
    )
    # the name of the originating column is not needed afterwards
    .drop(columns='intent_col')
)
long_df

Unnamed: 0_level_0,intent
query_id,Unnamed: 1_level_1
query0,külastasin asutust
query1,hommikusöögi tellimine
query2,müra
query3,avalduse esitamine
query4,probleemist informeerimine
...,...
query245,missing
query246,missing
query247,missing
query248,missing


In [41]:
# Number of occurrences of every label (the 'missing' placeholder included),
# most frequent label first
intent_count = (
    long_df
    .groupby('intent')
    .size()
    .sort_values(ascending=False)
)
intent_count.to_frame()

Unnamed: 0_level_0,0
intent,Unnamed: 1_level_1
missing,376
tervitus,152
hüvasti jätmine,52
teaviku kaotamine rikkumine,44
covid küsimused,27
...,...
kelle loodud,1
andmejälgija,1
liikluseksami tulemuse kestvus,1
külastasin asutust,1


In [48]:
# Keep only labels with at least MIN_INSTANCES instances
MIN_INSTANCES = 3
intents_to_keep = intent_count[intent_count >= MIN_INSTANCES]

# Remove the placeholder label 'missing' (tolerate it not being present)
intents_to_keep = intents_to_keep.drop('missing', errors='ignore')

# Give every remaining label a short identifier; intent_count is sorted by
# descending frequency, so i0 is the most frequent label
intent_dict = {f"i{i}": obj for i, obj in enumerate(intents_to_keep.index)}
intent_df = pd.DataFrame({
    'intent_id': list(intent_dict.keys()),
    'intent': list(intent_dict.values())
})
intent_df

Unnamed: 0,intent_id,intent
0,i0,tervitus
1,i1,hüvasti jätmine
2,i2,teaviku kaotamine rikkumine
3,i3,covid küsimused
4,i4,kiirpass
5,i5,raamatulaenutus
6,i6,id-kaardi uuendamine
7,i7,infonõustamine
8,i8,teavikute laenutähtaeg pikendamine
9,i9,dokumendi taotlemine


In [49]:
# Long dataframe that includes only labels that have several instances.
# intents_to_keep is indexed by label, so membership is tested against its index.
rows_to_keep = long_df['intent'].isin(intents_to_keep.index).to_numpy()
keep_df = long_df[rows_to_keep]

In [50]:
# Rebuild from long_df so that re-running this cell always gives the same
# result (resetting keep_df's index in place and merging onto keep_df itself
# would add extra columns on every further run).
# reset_index turns query_id into a regular column; the merge adds the label IDs.
keep_df = (
    long_df[rows_to_keep]
    .reset_index()
    .merge(intent_df, on='intent')
)
keep_df

Unnamed: 0,query_id,intent,intent_id
0,query2,müra,i22
1,query6,müra,i22
2,query8,müra,i22
3,query3,müra,i22
4,query72,müra,i22
...,...,...,...
585,query212,luhiajalise tootamise viisa loppemine,i15
586,query213,luhiajalise tootamise viisa loppemine,i15
587,query214,luhiajalise tootamise viisa loppemine,i15
588,query215,luhiajalise tootamise viisa loppemine,i15


# Format the data for the competition

In [51]:
# Go wide first: one row per query and one indicator column per intent ID,
# which makes it easy to gather the intents into a space separated list.
# Every (query, intent) pair that is present gets the value 1 ...
keep_df = keep_df.assign(value=1)

# ... and combinations that never occur are filled with 0
keep_df_wide = keep_df.pivot_table(
    values='value',
    index='query_id',
    columns='intent_id',
    fill_value=0,
)
keep_df_wide

intent_id,i0,i1,i10,i11,i12,i13,i14,i15,i16,i17,...,i27,i28,i29,i3,i4,i5,i6,i7,i8,i9
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
query0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
query1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
query10,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
query100,0,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
query101,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
query95,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
query96,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
query97,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
query98,1,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Gather intents into a space separated list per query
# Column labels of the wide frame are the intent IDs (looked up once, not per row)
intents = keep_df_wide.columns

# Gather into a dictionary: query_id -> IDs of the non-zero columns of its row
obj_dict = {
    query_id: " ".join(str(lab) for lab, flag in zip(intents, row) if flag)
    for query_id, row in keep_df_wide.iterrows()
}

# Into dataframe
gathered_df = pd.DataFrame({
    'query_id': list(obj_dict.keys()),
    'intents': list(obj_dict.values())
})

gathered_df

Unnamed: 0,query_id,intents
0,query0,i13
1,query1,i13
2,query10,i0 i3
3,query100,i1 i16 i17
4,query101,i0 i1 i17
...,...,...
245,query95,i27 i3
246,query96,i0 i22
247,query97,i0 i26
248,query98,i0 i16 i17


# Split the data into train and test sets

In [54]:
# Fixed seed so the split and the sample submission are reproducible
SEED = 42

# Select 70% of data for training
train_df = gathered_df.sample(n=int(gathered_df.shape[0] * 0.7), random_state=SEED)

# Test data & solution: every row that was not drawn for training.
# The solution keeps the intents; the test set is the same rows without them.
solution_df = gathered_df.loc[~gathered_df.index.isin(train_df.index)].copy(deep=True)
test_df = solution_df.drop(columns='intents')

# Sample submission: random intent strings drawn from the training labels
result_vals = train_df['intents'].to_list()
sample_submission_df = solution_df.copy(deep=True)
sample_submission_df['intents'] = np.random.default_rng(SEED).choice(result_vals, solution_df.shape[0])

In [56]:
# Sanity check: row by row, does the random sample submission coincide with the true solution?
solution_df['intents'] == sample_submission_df['intents']

1      False
6      False
7      False
10     False
13     False
       ...  
234    False
238    False
239    False
243    False
244    False
Name: intents, Length: 75, dtype: bool

In [58]:
# Write data
result_path = "./prepped_data/multi-intent"

# File name -> frame, listed in the order the files are written;
# intents.csv holds the labels with their IDs
outputs = {
    'train.csv': train_df,
    'test.csv': test_df,
    'solution.csv': solution_df,
    'sample_submission.csv': sample_submission_df,
    'intents.csv': intent_df,
}
for file_name, frame in outputs.items():
    frame.to_csv(os.path.join(result_path, file_name), index=False)