In [15]:
from collections import Counter

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt


## Dialog Acts

In [3]:
# Load every line of the file as one raw string column.
df = pd.read_csv('data/dialog_acts.dat', names=['sentence'])

# Cut each line ONCE, at the first run of whitespace: the first token is taken as the
# dialog-act label, the remainder as the utterance. Using a single split keeps label and
# utterance consistent on tabs / repeated spaces and does not raise on a label-only line.
# reindex guarantees both columns exist even if no line carries an utterance.
parts = df['sentence'].str.split(n=1, expand=True).reindex(columns=[0, 1])

df['target'] = parts[0].str.lower()
# A line without an utterance yields a missing value here; store it as an empty string.
df['sentence'] = parts[1].fillna('').str.lower()

# Keep only the first row for each distinct utterance text (labels are not compared).
df_deduplicated = df.drop_duplicates(subset=['sentence'], keep='first')

df


Unnamed: 0,sentence,target
0,im looking for a moderately priced restaurant ...,inform
1,any part of town,inform
2,bistro food,inform
3,is there a moderately priced restaurant that s...,confirm
4,yes,affirm
...,...,...
25496,what is the price range,request
25497,thank you good bye,thankyou
25498,im looking for african food,inform
25499,whats the address and phone number,request


### Distributions


In [4]:
# Absolute and relative (in %) frequency of every dialog act in the full data set ...
all_targets = df['target']
target_counts = all_targets.value_counts()
target_counts_percentage = (all_targets.value_counts(normalize=True) * 100).round(2)

# ... and in the de-duplicated data set.
unique_targets = df_deduplicated['target']
deduplicated_target_counts = unique_targets.value_counts()
deduplicated_target_counts_percentage = (unique_targets.value_counts(normalize=True) * 100).round(2)

# Rows follow the frequency order of the full data set. The de-duplicated figures are
# re-aligned to that order, with 0 for any act that is absent after de-duplication.
act_order = target_counts.index

distribution_df = pd.DataFrame({
    'targets': act_order,
    'distribution of targets in number': target_counts.values,
    'distribution of targets in %': target_counts_percentage.values,
    'deduplicated distribution in number': deduplicated_target_counts.reindex(act_order, fill_value=0).values,
    'deduplicated distribution in %': deduplicated_target_counts_percentage.reindex(act_order, fill_value=0).values,
})

distribution_df


Unnamed: 0,targets,distribution of targets in number,distribution of targets in %,deduplicated distribution in number,deduplicated distribution in %
0,inform,10160,39.84,3062,57.14
1,request,6494,25.47,961,17.93
2,thankyou,3259,12.78,81,1.51
3,reqalts,1747,6.85,437,8.15
4,,1612,6.32,249,4.65
5,affirm,1156,4.53,166,3.1
6,negate,435,1.71,133,2.48
7,bye,266,1.04,41,0.77
8,confirm,172,0.67,115,2.15
9,hello,93,0.36,60,1.12


### Sentence lengths


In [8]:
# Work on an independent copy so that adding columns does not write into a slice of `df`.
df_deduplicated = df_deduplicated.copy()

# Add the same two length features to both frames.
# Words are counted with a whitespace split (no separator argument), so repeated or
# trailing spaces do not count as empty "words" and an empty utterance counts as 0.
# This is the same tokenisation the notebook uses for its `words_list` column.
for frame in (df, df_deduplicated):
    frame['words'] = frame['sentence'].str.split().str.len()
    frame['length'] = frame['sentence'].str.len()

# Mean utterance length per dialog act, in words and in characters, rounded to 2 decimals.
# The four Series are aligned on the `target` index when the frame is built.
length_df = pd.DataFrame({
    'number of words': df.groupby('target')['words'].mean().round(2),
    'number of characters': df.groupby('target')['length'].mean().round(2),
    'deduplicated number of words': df_deduplicated.groupby('target')['words'].mean().round(2),
    'deduplicated number of characters': df_deduplicated.groupby('target')['length'].mean().round(2),
})

length_df


Unnamed: 0_level_0,number of words,number of characters,deduplicated number of words,deduplicated number of characters
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ack,3.29,14.46,3.9,17.85
affirm,2.0,8.52,7.46,37.33
bye,2.68,11.89,4.41,21.02
confirm,5.5,26.82,5.69,28.32
deny,2.89,14.63,4.77,24.31
hello,6.15,31.7,8.55,43.77
inform,4.11,22.26,7.19,39.13
negate,2.26,9.15,4.52,21.86
,1.26,7.29,2.29,10.4
repeat,2.42,11.97,2.92,14.62


### Most common words

In [14]:
# Tokenise every utterance on whitespace; both frames get the same `words_list` column.
for frame in (df, df_deduplicated):
    frame['words_list'] = frame['sentence'].map(str.split)

def find_most_common_word(group):
    """Return the word that appears in the most sentences of `group`, with its coverage.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to analyse (the notebook passes one `groupby('target')` group at a time).
        Must have a ``words_list`` column holding one list of tokens per row.

    Returns
    -------
    tuple
        ``(word, percentage)``, where ``percentage`` is the share (0-100) of rows in
        `group` whose token list contains ``word``. ``(None, 0.0)`` is returned when
        no row contains any token.

    Notes
    -----
    A word is counted at most once per sentence. When several words are tied, the one
    encountered first (row order, then token order) wins.
    """
    # dict.fromkeys removes repeats within a sentence while keeping token order.
    # Iterating a set here would make the winner of a tie depend on string-hash order,
    # which changes between interpreter runs.
    word_counts = Counter(
        word for sentence in group['words_list'] for word in dict.fromkeys(sentence)
    )
    if not word_counts:
        # No tokens at all: nothing to rank, and avoids indexing an empty result.
        return None, 0.0
    most_common_word, count = word_counts.most_common(1)[0]
    return most_common_word, count / len(group) * 100

def _most_common_word_table(frame, percentage_label):
    """Build a per-dialog-act table of the most common word and its percentage."""
    per_target = frame.groupby('target').apply(find_most_common_word)
    # Each value is a (word, percentage) tuple; expand it into two columns.
    table = per_target.apply(pd.Series)
    table.columns = ['most_common_word', percentage_label]
    return table


popular_words_duplicated = _most_common_word_table(df, 'percentage_duplicated')
popular_words_deduplicated = _most_common_word_table(df_deduplicated, 'percentage_deduplicated')

# Side-by-side comparison of the full and the de-duplicated data, aligned on `target`.
popular_words_df = pd.DataFrame({
    'most_common_word_duplicated': popular_words_duplicated['most_common_word'],
    'percentage_duplicated': popular_words_duplicated['percentage_duplicated'].round(2),
    'most_common_word_deduplicated': popular_words_deduplicated['most_common_word'],
    'percentage_deduplicated': popular_words_deduplicated['percentage_deduplicated'].round(2),
})

popular_words_df


Unnamed: 0_level_0,most_common_word_duplicated,percentage_duplicated,most_common_word_deduplicated,percentage_deduplicated
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ack,okay,57.14,okay,50.0
affirm,yes,81.83,yes,75.9
bye,bye,86.47,bye,75.61
confirm,it,73.84,it,63.48
deny,wrong,51.85,want,53.85
hello,hi,52.69,hi,73.33
inform,food,29.39,restaurant,52.25
negate,no,99.77,no,99.25
,noise,25.5,the,12.85
repeat,repeat,60.61,repeat,53.85
