In [1]:
from nltk import data
from nltk.corpus.reader import CHILDESCorpusReader
from nltk.probability import FreqDist
import entropies7.entropies7 as Ent
from numpy import *
import scipy as sp
import numpy as np
import re

%matplotlib inline
from pandas import *
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Destination of the space-separated statistics table; the statistics cell
# further down opens this path for writing.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
nmmfile = "/Users/jeremyirvin/Desktop/SeniorThesis/Childes/nltk_childes/NLTKCHILDES/morph-eng.csv"
# Companion path for a syntax table; not referenced by the cells shown here.
nmsfile = "/Users/jeremyirvin/Desktop/SeniorThesis/Childes/nltk_childes/NLTKCHILDES/syntax-eng.csv"



In [3]:
def get_files(corpus):
    """Open the CHILDES corpus reader and list the transcripts it exposes.

    Parameters
    ----------
    corpus : str
        Name of the corpus to load. Only ``'childes'`` is supported.

    Returns
    -------
    tuple
        ``(eng, files)`` where ``eng`` is a ``CHILDESCorpusReader`` rooted at
        ``corpora/childes/Eng-USA-MOR`` and restricted to the pattern
        ``'Manchester/.*.xml'``, and ``files`` is ``eng.fileids()``.

    Raises
    ------
    ValueError
        If ``corpus`` is not ``'childes'``. Previously this fell through to
        the ``return`` with both locals unbound and died with an
        ``UnboundLocalError``.
    """
    if corpus != 'childes':
        raise ValueError("unsupported corpus: %r (expected 'childes')" % (corpus,))

    # Alternative root used during development: 'corpora/childes/English-UK-MOR'
    corpus_root = data.find('corpora/childes/Eng-USA-MOR')
    eng = CHILDESCorpusReader(corpus_root, 'Manchester/.*.xml')
    files = eng.fileids()
    return eng, files
# To time the corpus load, run manually: %timeit files = get_files('childes')
# `eng` is a module-level global that the helper functions in later cells read
# directly; `files` is passed to group_files_by_child_age below.
eng, files = get_files('childes')
# Author's note: childes has 804 files

In [4]:
def convert_age_to_days(age):
    """Convert an age string such as ``'P1Y10M21D'`` to an approximate day count.

    The string is ``P`` followed by up to three components in this order:
    ``<n>Y`` (years), ``<n>M`` (months) and ``<n>D`` (days). Any component may
    be absent and then counts as zero, so ``'P2Y'`` and ``'P1Y21D'`` are
    accepted. The earlier positional split raised ``IndexError`` on the
    former and read the days of the latter as months.

    Parameters
    ----------
    age : str
        Age string in the format described above.

    Returns
    -------
    int
        ``years * 365 + months * 30 + days`` (a year is taken as 365 days and
        a month as 30 days).

    Raises
    ------
    ValueError
        If ``age`` does not match the format or has no component at all.
    """
    match = re.match(r'\s*P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?\s*$', age)
    if match is None or match.groups() == (None, None, None):
        raise ValueError("unrecognised age string: %r" % (age,))
    # Missing components come back as None and contribute nothing.
    years, months, days = [int(part) if part else 0 for part in match.groups()]
    return years * 365 + months * 30 + days

In [6]:
def compare(f1, f2):
    """cmp-style comparator ordering two transcript files by age.

    For each file the first entry of ``eng.age(...)`` is converted to days
    with ``convert_age_to_days``. Returns ``-1`` when ``f1`` is the younger
    recording, ``1`` when ``f2`` is, and ``0`` when both ages are equal.
    """
    days_first = convert_age_to_days(eng.age(f1)[0])
    days_second = convert_age_to_days(eng.age(f2)[0])
    if days_first == days_second:
        return 0
    return -1 if days_first < days_second else 1
   
def group_files_by_child_age(files, age_key=None):
    """Group transcript paths by child and order each group by age.

    Parameters
    ----------
    files : iterable of str
        ``/``-separated paths; the second component is used as the child
        name (``'Manchester/anne/anne01a.xml'`` -> ``'anne'``).
    age_key : callable, optional
        Maps a path to the value its group is sorted by. When omitted, the
        first entry of ``eng.age(path)`` is converted with
        ``convert_age_to_days``.

    Returns
    -------
    dict
        Child name -> list of that child's paths in ascending age order.
        Paths with equal ages keep their input order (the sort is stable).
    """
    if age_key is None:
        def age_key(path):
            return convert_age_to_days(eng.age(path)[0])

    resDict = {}
    for f in files:
        child = f.split("/")[1]
        resDict.setdefault(child, []).append(f)

    # A key sort looks each file's age up once; the previous cmp-based sort
    # looked both ages up again on every comparison, and `cmp=` / `has_key`
    # do not exist in Python 3.
    for child in resDict:
        resDict[child].sort(key=age_key)

    return resDict

# To time the grouping, run manually: %timeit resDict = group_files_by_child_age(files)
# `resDict` (child name -> age-ordered transcript paths) is a module-level
# global read by make_windows and by the statistics loop in later cells.
resDict = group_files_by_child_age(files)

In [7]:
resDict

{u'anne': [u'Manchester/anne/anne01a.xml',
  u'Manchester/anne/anne01b.xml',
  u'Manchester/anne/anne02a.xml',
  u'Manchester/anne/anne02b.xml',
  u'Manchester/anne/anne03a.xml',
  u'Manchester/anne/anne03b.xml',
  u'Manchester/anne/anne04a.xml',
  u'Manchester/anne/anne04b.xml',
  u'Manchester/anne/anne05a.xml',
  u'Manchester/anne/anne05b.xml',
  u'Manchester/anne/anne06a.xml',
  u'Manchester/anne/anne06b.xml',
  u'Manchester/anne/anne07a.xml',
  u'Manchester/anne/anne07b.xml',
  u'Manchester/anne/anne08a.xml',
  u'Manchester/anne/anne08b.xml',
  u'Manchester/anne/anne09a.xml',
  u'Manchester/anne/anne09b.xml',
  u'Manchester/anne/anne10a.xml',
  u'Manchester/anne/anne10b.xml',
  u'Manchester/anne/anne11a.xml',
  u'Manchester/anne/anne11b.xml',
  u'Manchester/anne/anne12a.xml',
  u'Manchester/anne/anne12b.xml',
  u'Manchester/anne/anne13a.xml',
  u'Manchester/anne/anne13b.xml',
  u'Manchester/anne/anne14a.xml',
  u'Manchester/anne/anne14b.xml',
  u'Manchester/anne/anne15a.xml',
  u'M

In [8]:
def get_mean_length_utterances(file_name):
    """Mean utterance length, in tokens, for the child and the mother.

    Used as the syntactic-diversity measure. Utterances come from
    ``eng.sents(file_name, speaker=..., replace=True)`` for the speakers
    ``"CHI"`` and ``"MOT"``.

    Parameters
    ----------
    file_name : str
        Transcript identifier passed straight to ``eng.sents``.

    Returns
    -------
    tuple
        ``(Schild, Smother)``. A speaker with no utterances yields ``nan``.
    """
    def mean_length(speaker):
        # Collect the lengths in a new list instead of overwriting the
        # reader's result element by element: that required a mutable list,
        # which the reader presumably does not guarantee (NLTK may hand back
        # a lazy, read-only sequence -- TODO confirm for the installed version).
        lengths = [len(sentence)
                   for sentence in eng.sents(file_name, speaker=speaker, replace=True)]
        if not lengths:
            # np.mean([]) is also nan but emits a RuntimeWarning.
            return float('nan')
        return np.mean(lengths)

    Schild = mean_length("CHI")
    Smother = mean_length("MOT")
    return Schild, Smother

In [23]:
def make_freq_dist_files(window_files):
    """Build word frequency distributions over a window of transcripts.

    ``window_files`` is a list in which every item is one file. The words
    spoken by the child ("CHI") and by the mother ("MOT") are pooled across
    all files, and the pair ``(child FreqDist, mother FreqDist)`` is returned.
    """
    pooled = {"CHI": [], "MOT": []}
    for path in window_files:
        for speaker in ("CHI", "MOT"):
            pooled[speaker].extend(eng.words(path, speaker=speaker, replace=True))
    return FreqDist(pooled["CHI"]), FreqDist(pooled["MOT"])

def make_freq_dist_files_stem(window_files):
    """Build stem frequency distributions over a window of transcripts.

    ``window_files`` is a list in which every item is one file. Tokens are
    requested with ``stem=True`` and cut at the first ``"-"``, keeping only
    the part before it. Returns ``(child FreqDist, mother FreqDist)`` for
    the speakers "CHI" and "MOT".
    """
    child_stems, mother_stems = [], []
    for path in window_files:
        for speaker, bucket in (("CHI", child_stems), ("MOT", mother_stems)):
            tokens = eng.words(path, speaker=speaker, replace=True, stem=True)
            bucket.extend(token.split("-")[0] for token in tokens)
    return FreqDist(child_stems), FreqDist(mother_stems)

def make_windows(child, window_size):
    """Return every run of ``window_size`` consecutive transcripts of a child.

    Parameters
    ----------
    child : str
        Key into the global ``resDict``.
    window_size : int
        Number of files per window; must be at least 1.

    Returns
    -------
    list of list
        Windows in order, each a slice of ``resDict[child]``. With ``n``
        files there are ``n - window_size + 1`` windows, including the one
        ending at the last file, which the earlier ``<`` loop bound skipped.
        When the child has fewer than ``window_size`` files the result is an
        empty list. It is falsy like the ``False`` returned before, but can
        be iterated safely.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1, got %r" % (window_size,))

    child_files = resDict[child]
    window_count = len(child_files) - window_size + 1  # <= 0 when too few files
    return [child_files[start:start + window_size] for start in range(window_count)]

def window_to_weighted_age(window_files):
    """Age in days representing a window, weighted by how much the child says.

    Each file contributes its age (first entry of ``eng.age``, converted with
    ``convert_age_to_days``) weighted by the number of "CHI" tokens in it.

    Parameters
    ----------
    window_files : iterable of str
        Transcript identifiers forming one window.

    Returns
    -------
    int or float
        The weighted mean age, floored to whole days. The floor division is
        explicit so the result does not depend on the interpreter's ``/``
        semantics. If the child produced no tokens in the window, the
        unweighted floored mean of the file ages is returned instead of
        dividing by zero. An empty window yields ``nan``.
    """
    weighted_days = 0
    total_words = 0
    total_days = 0
    file_count = 0
    for window_file in window_files:
        # The token count is all that is needed; building a FreqDist just to
        # sum its values gave the same number.
        n = len(eng.words(window_file, speaker="CHI", replace=True))
        days = convert_age_to_days(eng.age(window_file)[0])
        weighted_days += days * n
        total_words += n
        total_days += days
        file_count += 1

    if file_count == 0:
        return float('nan')
    if total_words == 0:
        return total_days // file_count
    return weighted_days // total_words

In [24]:
# Write one row of diversity statistics per (child, sliding window) to
# `nmmfile`, echoing each row to the notebook as it is produced.
#
# Fixes relative to the earlier version of this cell:
#   * the row written to the file now contains `age`, so it matches the
#     10-column header (before, only the echoed row had it);
#   * the file is managed by `with`, so it is closed even when the long
#     loop is interrupted;
#   * a child with too few files for a window is skipped instead of failing
#     on an un-iterable `False`.
header = "Child Age N.child H.child H.child.S H.child.I N.mother H.mother H.mother.S H.mother.I"

with open(nmmfile, "w") as fout:
    fout.write(header + "\n")
    for child in resDict.keys():
        for window in (make_windows(child, 5) or []):
            age = window_to_weighted_age(window)

            fchild, fmother = make_freq_dist_files(window)
            fchildS, fmotherS = make_freq_dist_files_stem(window)

            # Statistics: total number of tokens counted in each distribution
            nchild = fchild.N()
            nmother = fmother.N()
            # Entropies
            Hchild = Ent.Entropy(fchild, method="CWJ")
            Hmother = Ent.Entropy(fmother, method="CWJ")
            # Entropies (stemmed - Lexical Diversity)
            HchildS = Ent.Entropy(fchildS, method="CWJ")
            HmotherS = Ent.Entropy(fmotherS, method="CWJ")
            # Inflectional Diversity
            HchildI = Hchild - HchildS
            HmotherI = Hmother - HmotherS

            # Mean utterance length is not part of the table:
            # get_mean_length_utterances works on one file, not on a window.

            row = [child, age, nchild, Hchild, HchildS, HchildI,
                   nmother, Hmother, HmotherS, HmotherI]
            # "%s" formats each value the way the print statement did.
            line = " ".join(["%s" % (value,) for value in row])
            print(line)
            fout.write(line + "\n")

becky 743 2408 4.51078389653 4.43095455828 0.0798293382441 8552 5.47619986418 4.92991700787 0.546282856313
becky 748 2457 4.44638572078 4.35522278633 0.0911629344446 8963 5.46834918094 4.93587788707 0.532471293872
becky 754 2405 4.57289042237 4.47825540096 0.0946350214086 9679 5.50354239882 4.97830336798 0.525239030838
becky 760 2547 4.60882053205 4.5075133719 0.101307160149 9152 5.49456815239 4.97220287268 0.522365279712
becky 768 2623 4.6971741624 4.59254189251 0.104632269888 9627 5.51770370902 4.99762289689 0.520080812131
becky 773 2771 4.75687442263 4.652039385 0.104835037624 9425 5.52252919867 4.98469308628 0.537836112386
becky 780 2797 4.85644605598 4.74999948029 0.106446575688 9909 5.50122589072 4.96867833085 0.532547559871
becky 783 2739 4.78178094541 4.68474820833 0.0970327370766 9810 5.48886504034 4.94010380262 0.548761237711
becky 790 2673 4.85630713478 4.74287475662 0.113432378162 9922 5.50512228328 4.94219119103 0.562931092254
becky 795 2728 4.81659788118 4.69842485999 0.1

KeyboardInterrupt: 

In [None]:
%matplotlib inline
from pandas import *
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Load the statistics table and split it into one frame per child.
# `from pandas import *` is not guaranteed to bind the name `pandas`, so the
# module is imported explicitly under its usual alias for the reader call.
import pandas as pd

# Read data as a ssv (space-separated file)
df = pd.read_csv("/Users/jeremyirvin/Desktop/SeniorThesis/Childes/nltk_childes/NLTKCHILDES/morph-eng.csv", delimiter=' ')

# Sort data by children alphabetically, then age increasing.
# DataFrame.sort(columns=...) was removed from pandas; prefer sort_values and
# fall back to the old spelling only where sort_values does not exist.
if hasattr(df, 'sort_values'):
    sorted_df = df.sort_values(by=['Child', 'Age'])
else:
    sorted_df = df.sort(columns=['Child', 'Age'])

# To export the sorted table, run manually:
# sorted_df.to_csv("/Users/jeremyirvin/Desktop/SeniorThesis/Childes/nltk_childes/sorted-morph-eng.csv", sep = ' ');

# Get list of the names of the children
name_list = sorted_df['Child'].unique()

# Sort the name list
sorted_name_list = sorted(name_list)

# Get list of the names of the columns
column_list = list(sorted_df.columns.values)

# Remove the column name 'Child' because it contains strings
column_list.remove('Child')

# Convert all other columns to floats
sorted_df[column_list] = sorted_df[column_list].astype(float)

# Create a dict of data frames, one per child, keyed by child name
child_split_df = {}
for child in sorted_name_list:
    child_split_df[child] = sorted_df[sorted_df['Child'] == child]

# Colours cycled through by plot_stat, one per child
color_list = ['b','g','r','c','m','y','k']
    
# Define a function which plots a certain statistic for a certain child
def plot_stat(axarr, name, child_dfs_after_window, stat, row_num, column_num, color_num):
    """Draw one statistic of one child against age on a single subplot.

    The subplot is ``axarr[row_num, column_num]``. The data come from
    ``child_dfs_after_window[name]``: column ``'Age'`` on the x axis and
    column ``stat`` on the y axis. The line colour is picked from the global
    ``color_list`` at index ``color_num % 7``. The subplot is labelled with
    'Age', the statistic name, and a "<name>: <stat>" title.
    """
    ax = axarr[row_num, column_num]
    child_df = child_dfs_after_window[name]
    line_color = color_list[color_num % 7]
    ax.plot(child_df['Age'], child_df[stat], linestyle='-', color=line_color)
    ax.set_xlabel('Age')
    ax.set_ylabel(stat)
    ax.set_title(name + ": " + stat)

# Plot every remaining statistic for every child on one large grid of subplots.
# 'Age' is the x axis of each plot, so it is not plotted as a statistic.
column_list.remove('Age')

plots_per_row = 5
# Rows each child occupies: ceil(number of statistics / plots_per_row).
rows_per_child = (len(column_list) + plots_per_row - 1) // plots_per_row

# squeeze=False keeps axarr two-dimensional whatever the grid size.
fig, axarr = plt.subplots(rows_per_child * len(name_list), plots_per_row, squeeze=False)
fig.set_size_inches(24, 120)

# NOTE(review): this cell used to read `child_dfs_after_window`, which no
# visible cell defines; the per-child frames built above are stored in
# `child_split_df`, so that is what is plotted here -- confirm this is the
# intended data.
for child_index, name in enumerate(name_list):
    # Every child starts on its own row. Before, the row counter advanced
    # only when a row filled up, so a child's partly filled last row was
    # drawn over by the next child.
    first_row = child_index * rows_per_child
    for stat_index, stat in enumerate(column_list):
        plot_stat(axarr, name, child_split_df, stat,
                  first_row + stat_index // plots_per_row,
                  stat_index % plots_per_row,
                  child_index)
    # Blank the unused cells at the end of the child's last row.
    for spare in range(len(column_list), rows_per_child * plots_per_row):
        axarr[first_row + spare // plots_per_row, spare % plots_per_row].axis('off')

plt.show()
# `beaker` is not defined in the visible cells -- presumably the global object
# supplied by the Beaker Notebook runtime; verify before running elsewhere.
beaker.dict_child_df = child_split_df