Import required packages

In [None]:
import unittest, config, MySQLdb, pandas, warnings, numpy, nltk
from childes import CHILDESCorpusReader # import from nltk.corpus.reader for original

Connect to DB

In [None]:
# Credentials come from the project-local `config` module, not from this notebook.
chompsky_authenticator = config.Authenticator('Chompsky')

# Open one connection to the `childes` schema; it is reused by every query below.
chompsky_con = MySQLdb.connect(
    host=chompsky_authenticator.host,
    port=chompsky_authenticator.port,
    user=chompsky_authenticator.user,
    passwd=chompsky_authenticator.passwd,
    db='childes',
)

Configure corpus & files to test

In [None]:
# Collection directory under the XML corpus root ('Eng-NA' or 'Eng-UK').
region = 'Eng-NA'
# Corpus directory inside the selected region.
corpus_name = 'Sachs'
# Transcript to compare, given without a file extension.
fileid_name = 'n02'

Set up corpus information

In [None]:
# Root directory of the XML transcripts for the configured region.
corpus_root = '/home/alsan/corpora/childes-xml/%s' % region
# Reader over every XML file of the configured corpus.
corpus = CHILDESCorpusReader(corpus_root, '%s/.*.xml' % corpus_name)
# File id (relative to corpus_root) handed to the reader's words()/sents() calls.
fileid = ['%s/%s.xml' % (corpus_name, fileid_name)]
# Value used to select rows by the `filename` column of the DB `words` table.
# Derived from fileid_name so the DB query and the NLTK reader always target
# the same transcript (previously the transcript name was hard-coded here).
filename = '/shared_hd0/corpora/childes_new/%s/%s.cha' % (corpus_name, fileid_name)

Test whether the words, sentences, and morphology tags (mors) stored in CHILDES-DB match those produced by the NLTK reader

In [None]:
class NLTKTest(unittest.TestCase):
    
    def setUp(self):
        sql_words = " SELECT gloss, mor FROM words \
                      WHERE filename = %(filename)s "
        sql_sents = " SELECT DISTINCT sentgloss, sentmor FROM words \
                      WHERE filename = %(filename)s "
        
        self.words_df = pandas.read_sql(sql_words, chompsky_con, params={'filename':filename})
        self.sents_df = pandas.read_sql(sql_sents, chompsky_con, params={'filename':filename})
        
    def test_word_difference(self):
        words_db = self.words_df['gloss'].values.T.tolist()
        
        words_nltk = corpus.words(fileid)
        
        words_in_db_not_nltk = set(words_db) - set(words_nltk)
        words_in_nltk_not_db = set(words_nltk) - set(words_db)
                
        self.assertEqual(words_in_db_not_nltk, words_in_nltk_not_db)
            
    def test_word_mor_difference(self):
        tagged_words_nltk = corpus.tagged_words(fileid, stem=True)
        word_mors_nltk = [p[1] + "|" + p[0] for p in tagged_words_nltk]
        
        word_mors_db = self.words_df['mor'].values.T.tolist()
        
        mors_in_db_not_nltk = set(word_mors_db) - set(word_mors_nltk)
        mors_in_nltk_not_db = set(word_mors_nltk) - set(word_mors_db)
        
        self.assertEqual(mors_in_db_not_nltk, mors_in_nltk_not_db)
        
    def test_sent_difference(self):
        sents_db = self.sents_df['sentgloss'].values.T.tolist()
        
        sents_nltk = [" ".join(sent_arr) for sent_arr in corpus.sents(fileid)]
        
        sents_in_db_not_nltk = set(sents_db) - set(sents_nltk)
        sents_in_nltk_not_db = set(sents_nltk) - set(sents_db)
        
        self.assertEqual(sents_in_db_not_nltk, sents_in_nltk_not_db)
        
    def test_sent_mor_difference(self):
        tagged_sents_nltk = corpus.tagged_sents(fileid, stem=True)
        sent_mors_nltk = [" ".join( [p[1] + "|" + p[0] for p in arr] ) for arr in tagged_sents_nltk]
        
        sent_mors_db = self.sents_df['sentmor'].values.T.tolist()
        
        mors_in_db_not_nltk = set(sent_mors_db) - set(sent_mors_nltk)
        mors_in_nltk_not_db = set(sent_mors_nltk) - set(sent_mors_db)
        
        self.assertEqual(mors_in_db_not_nltk, mors_in_nltk_not_db)
        
def suite():
    """Build and return a ``unittest.TestSuite`` with every test of ``NLTKTest``."""
    test_suite = unittest.TestSuite()
    # unittest.makeSuite is deprecated since Python 3.11 and removed in 3.13;
    # the loader method collects the same ``test*`` methods on all versions.
    test_suite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(NLTKTest))
    return test_suite

Run test suite

In [None]:
# Collect the tests defined above.
test_suite = suite()

# Text runner with default settings; it writes its report to stderr.
runner = unittest.TextTestRunner()

# Left as the cell's final expression so the returned result object is displayed.
runner.run(test_suite)