Import required packages

In [6]:
import unittest, config, MySQLdb, pandas, warnings, numpy, nltk
from childes import CHILDESCorpusReader # import from nltk.corpus.reader for original

Connect to DB

In [7]:
# Open a MySQLdb connection to the 'childes' database. Host, port, user and
# password all come from the 'Chompsky' authenticator supplied by `config`.
chompsky_authenticator = config.Authenticator('Chompsky')
chompsky_con = MySQLdb.connect(
    host=chompsky_authenticator.host,
    port=chompsky_authenticator.port,
    user=chompsky_authenticator.user,
    passwd=chompsky_authenticator.passwd,
    db='childes'
)

Configure corpus & files to test

In [8]:
# Region sub-directory of the CHILDES XML tree to read from.
region = 'Eng-NA' # or Eng-UK
# Name of the corpus directory to test.
corpus_name = 'Sachs'
# Base name (without extension) of the transcript file to test.
fileid_name = 'n02'

Set up corpus information

In [9]:
# Root directory of the CHILDES XML files for the selected region.
corpus_root = '/home/alsan/corpora/childes-xml/%s' % region
# NLTK-style reader restricted to the XML files under the chosen corpus directory.
corpus = CHILDESCorpusReader(corpus_root, '%s/.*.xml' % corpus_name)
# Single-element list naming the transcript under test, relative to corpus_root.
fileid = ['%s/%s.xml' % (corpus_name, fileid_name)]
# Value matched against the `filename` column in the DB queries. It is built
# from fileid_name (rather than a hard-coded 'n02') so that the DB side and
# the NLTK side always refer to the same transcript when the config changes.
filename = '/shared_hd0/corpora/childes_new/%s/%s.cha' % (corpus_name, fileid_name)

Test whether the words, sentences, and morphological annotations (mors) in CHILDES-DB match those produced by the NLTK reader

In [13]:
class NLTKTest(unittest.TestCase):
    
    def setUp(self):
        sql_words = " SELECT gloss, mor FROM words \
                      WHERE filename = %(filename)s "
        sql_sents = " SELECT DISTINCT sentgloss, sentmor FROM words \
                      WHERE filename = %(filename)s "
        
        self.words_df = pandas.read_sql(sql_words, chompsky_con, params={'filename':filename})
        self.sents_df = pandas.read_sql(sql_sents, chompsky_con, params={'filename':filename})
        
    def test_word_difference(self):
        words_db = self.words_df['gloss'].values.T.tolist()
        
        words_nltk = corpus.words(fileid)
                
        self.assertEqual(set(words_db), set(words_nltk))
            
    def atest_word_mor_difference(self):
        word_mors_db = self.words_df['mor'].values.T.tolist()
        
        tagged_words_nltk = corpus.tagged_words(fileid, stem=True)
        word_mors_nltk = [p[1] + "|" + p[0] for p in tagged_words_nltk]
        
        self.assertEqual(set(word_mors_db), set(word_mors_nltk))
        
    def atest_sent_difference(self):
        sents_db = self.sents_df['sentgloss'].values.T.tolist()
        
        sents_nltk = [" ".join(sent_arr) for sent_arr in corpus.sents(fileid)]
        
        self.assertEqual(set(sents_db), set(sents_nltk))
        
    def atest_sent_mor_difference(self):
        sent_mors_db = self.sents_df['sentmor'].values.T.tolist()
        
        tagged_sents_nltk = corpus.tagged_sents(fileid, stem=True)
        sent_mors_nltk = [" ".join( [p[1] + "|" + p[0] for p in arr] ) for arr in tagged_sents_nltk]
        
        self.assertEqual(set(sent_mors_db), set(sent_mors_nltk))
        
def suite():
    """Build a test suite containing every ``test*`` method of NLTKTest.

    Returns:
        unittest.TestSuite: a suite wrapping the tests loaded from NLTKTest.
    """
    test_suite = unittest.TestSuite()
    # TestLoader.loadTestsFromTestCase is the supported replacement for
    # unittest.makeSuite, which is deprecated and removed in newer Python
    # releases; it collects the same ``test``-prefixed methods.
    test_suite.addTest(unittest.TestLoader().loadTestsFromTestCase(NLTKTest))
    return test_suite

Run test suite

In [14]:
# Build the suite and run it with the plain-text runner. The result object
# returned by run() is the cell's last expression, so the notebook shows it.
runner = unittest.TextTestRunner()
test_suite = suite()
runner.run(test_suite)

F
FAIL: test_word_difference (__main__.NLTKTest)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-13-b956e803621a>", line 18, in test_word_difference
    self.assertEqual(set(words_db), set(words_nltk))
AssertionError: Items in the first set but not the second:
'&hmm'
'&aw'
'&eh'
'0'
'&ah'
'&mm'
'&um'
'&uh'
Items in the second set but not the first:
'um'
'eh'
u'mm'
u'ah'
u'sipper'
u'boydie'
u'dat'
u'hm'
u'blankin'
'uh'
'aw'
u'whatcha'

----------------------------------------------------------------------
Ran 1 test in 0.180s

FAILED (failures=1)


<unittest.runner.TextTestResult run=1 errors=0 failures=1>