In [7]:
# https://www.nltk.org/book/ch06.html

import nltk
from nltk.corpus import nps_chat

In [11]:
# download nltk data

nltk.download('punkt')
nltk.download('nps_chat')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package nps_chat to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/nps_chat.zip.


True

In [8]:
# define a feature extractor (post words) 
def dialogue_act_features(post):
    features = {}
    
    for word in nltk.word_tokenize(post):
        features[f'contains({word})'] = True

    return features

In [32]:
# import nps_chat posts and create a dev-test dataset

nps_posts = nps_chat.xml_posts()[:10000]

feature_dataset = [(dialogue_act_features(post.text), post.get('class')) for post in nps_posts]

# 80% train, 20% test
test_size = int(len(feature_dataset) * 0.2)

train_data, test_data = feature_dataset[test_size:], feature_dataset[:test_size]

In [33]:
# create a classifier and test it

classifier = nltk.NaiveBayesClassifier.train(train_data)

print(nltk.classify.accuracy(classifier, test_data))

0.6445


In [34]:
import pandas as pd

In [67]:
# import tob-stt v1 logs
v1logs = pd.read_csv("./tob_v1_dataset")

In [66]:
# classifier function for tob logs

def classify_question(question):
    return classifier.classify(dialogue_act_features(question))

In [98]:
# classify questions

classified_logs = v1logs.copy()

classified_logs['question_class'] = classified_logs['question'].apply(classify_question)

In [71]:
classified_logs

Unnamed: 0,user,question,answer,class
0,100,hi,"Hello, how are you?",Greet
1,100,Simone,"Sorry, I still can't answer that.. Could you a...",Statement
2,100,I am Simone,"Sorry, I still can't answer that.. Could you a...",Statement
3,100,what is software testing,Software testing is an activity of software en...,whQuestion
4,100,what is structural testing,Structural testing (or White-Box testing) is a...,whQuestion
...,...,...,...,...
2984,98,what is the definition of state transition tes...,"It is a criteria within functional testing, bu...",whQuestion
2985,98,what is graph testing,Software testing is an activity of software en...,whQuestion
2986,98,what is graph matrix testing,"It is a criteria within functional testing, bu...",whQuestion
2987,99,Hi,"Hello, ask me a question!",Greet


In [99]:
classified_logs['question_class'].unique()

array(['Greet', 'Statement', 'whQuestion', 'Clarify', 'nAnswer', 'Bye',
       'ynQuestion', 'Emphasis', 'yAnswer', 'Reject', 'System', 'Emotion',
       'Continuer', 'Other', 'Accept'], dtype=object)

In [100]:
questions = classified_logs[classified_logs['question_class'] == 'whQuestion']

questions

Unnamed: 0,user,question,answer,question_class
3,100,what is software testing,Software testing is an activity of software en...,whQuestion
4,100,what is structural testing,Structural testing (or White-Box testing) is a...,whQuestion
5,100,what is a software technique,"Sorry, I still can't answer that.. Could you a...",whQuestion
6,100,what is system testing,System testing is a level of software testing ...,whQuestion
7,100,what is unit testing,Unit testing verifies the functioning in isola...,whQuestion
...,...,...,...,...
2982,98,what is state transition,"It is a criteria within functional testing, bu...",whQuestion
2983,98,what is state transition testing,"It is a criteria within functional testing, bu...",whQuestion
2984,98,what is the definition of state transition tes...,"It is a criteria within functional testing, bu...",whQuestion
2985,98,what is graph testing,Software testing is an activity of software en...,whQuestion


In [101]:
statements = classified_logs[classified_logs['question_class'] == 'Statement']



In [102]:
classified_logs[classified_logs['question_class'] == 'yAnswer']

Unnamed: 0,user,question,answer,question_class
99,105,yes,"Sorry, I still can't answer that.. Could you a...",yAnswer
206,110,Yes,"Sorry, I still can't answer that.. Could you a...",yAnswer
550,130,Yes,"Sorry, I still can't answer that.. Could you a...",yAnswer
695,143,Wkato is stranger information,"Sorry, I still can't answer that.. Could you a...",yAnswer
1110,154,ou can give the example of information error,An example of an error could be the access of ...,yAnswer
1122,154,"What is the difference between failure, error,...",A software defect or bug is a condition in a s...,yAnswer
1202,15,Yes,I hear you.,yAnswer
1301,164,I am feeling afraid,"Sorry, I still can't answer that.. Could you a...",yAnswer
1544,18,yes,Tell me more.,yAnswer
1724,215,yes,"Sorry, I still can't answer that.. Could you a...",yAnswer


In [103]:
classified_logs.to_csv('tb_v1_dataset_classified.csv')