## Import data 

In [1]:
import math
import warnings
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
import seaborn as sns
from statistics import median 
from statistics import mean
import scipy.stats
from scipy.stats import linregress
%matplotlib inline

In [2]:
# Known problems with the data:
# - Five month: Sub48 only babbled 6 times. Verified
# - Ten month: Sub43, 74, 85, 94, 97, 99, 113 cg no transcript before 600s. Wrong
# - Ten month: Sub53 cg transcript ends early. Found the correct one

# pd.read_csv already returns a DataFrame, so the frames are used as loaded
# (no pd.DataFrame(...) re-wrapping needed).
data5 = pd.read_csv("ELAN_elicit_voc_NSF5.csv")
data10 = pd.read_csv("ELAN_elicit_voc_NSF10.csv")
transcript5 = pd.read_csv("transcript5.csv")
transcript10 = pd.read_csv("transcript10.csv")

# Collapse the 'CGR Modality_21' label into 'CGR Modality', then keep only the
# rows of the five-month ELAN export whose tier is "VCV".
data5 = data5.replace('CGR Modality_21', 'CGR Modality')
data5 = data5[data5["tier"] == "VCV"]
# NOTE(review): data10 is not filtered to the "VCV" tier here -- confirm that
# this asymmetry between the two months is intended.

In [3]:
# For both transcripts: expose the utterance text under the shared "cat" column
# name, then discard every row whose text contains an opening or a closing
# parenthesis (missing text never matches, so those rows are kept).
transcript5, transcript10 = (
    frame[
        ~(
            frame['cat'].str.contains('(', regex=False, na=False)
            | frame['cat'].str.contains(')', regex=False, na=False)
        )
    ]
    for frame in (
        transcript5.rename(columns={'transcript': 'cat'}),
        transcript10.rename(columns={'transcript': 'cat'}),
    )
)

In [4]:
# Stack each month's ELAN rows on top of that month's transcript rows and
# renumber the index from 0 so row labels are unique.
data5, data10 = (
    pd.concat([elan_rows, transcript_rows], axis=0, ignore_index=True)
    for elan_rows, transcript_rows in ((data5, transcript5), (data10, transcript10))
)

In [5]:
#for sub in subjects:
   # s = transcript10[transcript10["sub"] == sub]
    #print("ID"+str(sub))
    #print(s["onset"].min())
    #print(s["offset"].max())

## Process and Analyze data 

In [6]:
# Subject IDs included in every per-subject summary below.
subjects = [
    4, 6, 10, 11, 27, 28, 43, 47, 48, 53,
    59, 64, 71, 72, 74, 75, 76, 85, 87, 91,
    94, 96, 97, 99, 101, 105, 106, 113, 115, 118,
]
print(len(subjects))
# Age in months -> combined data frame for that visit.
month = dict([(5, data5), (10, data10)])

30


### Vocal maturity: overall proportion of consonant (C) babbling at 5 and 10 months

In [7]:
# Per-subject vocal maturity at each age: number of VCV-tier rows ("total"),
# how many of them are not the bare vowel category "V" ("C"), and their ratio.
# NOTE(review): rows with cat == "Other" are counted as "C" here, while the
# interruption cells exclude "Other" -- confirm this is intended.
maturity_by_month = {}
for month_num, data in month.items():
    rows = []
    for sub in subjects:
        subject_frame = data[(data["sub"] == sub) & (data["tier"] == "VCV")]
        total = len(subject_frame)
        # Anything that is not exactly "V" (including missing values) is "C",
        # matching np.where(cat == "V", "V", "C").
        consonant_count = int((subject_frame["cat"] != "V").sum())
        rows.append({
            "sub": sub,
            "total": total,
            "C": consonant_count,
            # NaN (not the string "NA") keeps the column numeric, so the
            # difference score below works even if a subject has no rows.
            "Cproportion": consonant_count / total if total > 0 else np.nan,
            "month": month_num,
        })
    df = pd.DataFrame(rows, columns=["sub", "total", "C", "Cproportion", "month"])
    maturity_by_month[month_num] = df

# Explicit names instead of writing into globals(): later code uses df5 / df10.
df5 = maturity_by_month[5]
df10 = maturity_by_month[10]

# Long format: one row per subject per month.
maturity_long = pd.concat([df5, df10], axis=0)

# Wide format: one row per subject, month-suffixed columns side by side.
df5 = df5.rename(columns={"total": "total5", "C": "C5", "Cproportion": "Cproportion5"})
df10 = df10.rename(columns={"total": "total10", "C": "C10", "Cproportion": "Cproportion10"})
maturity = df5.merge(df10, on=['sub'], how='left')
maturity["maturity_difference_score"] = maturity["Cproportion10"] - maturity["Cproportion5"]

maturity_long.to_csv("maturity_long.csv", index=False)  # make graphs in R
maturity.to_csv("maturity.csv", index=False)  # make graphs in R

### Caregivers interrupt infant

In [8]:
def _cg_onsets_during_infant_turns(subject_frame, sub):
    """Pair each caregiver turn with every infant vocalisation it starts inside.

    Caregiver turns are the rows with tier "CG"; infant vocalisations are the
    rows with tier "VCV" whose cat is not "Other". A pair is recorded when
    inf_onset <= cg_onset < inf_offset.

    Parameters:
        subject_frame: rows of one subject, with columns tier, cat, onset, offset.
        sub: the subject ID stored in every output row.

    Returns:
        A list of rows ordered as
        [sub, cat, lag, interrupt, inf_onset, inf_offset, cgtype, cg_onset, cg_offset],
        where lag = inf_offset - cg_onset and interrupt is always 1.
    """
    cg_rows = subject_frame[subject_frame["tier"] == "CG"]
    inf_rows = subject_frame[(subject_frame["tier"] == "VCV")
                             & (subject_frame["cat"] != "Other")]
    # Materialised once because it is re-scanned for every caregiver turn.
    infant_turns = list(zip(inf_rows["cat"], inf_rows["onset"], inf_rows["offset"]))
    rows = []
    for cg, cg_onset, cg_offset in zip(cg_rows["cat"], cg_rows["onset"], cg_rows["offset"]):
        for cat, inf_onset, inf_offset in infant_turns:
            if inf_onset <= cg_onset < inf_offset:
                lag = inf_offset - cg_onset
                rows.append([sub, cat, lag, 1, inf_onset, inf_offset, cg, cg_onset, cg_offset])
    return rows


latency_columns = ['sub', 'cat', 'lag', "interrupt", 'inf_onset', 'inf_offset',
                   'cgtype', 'cg_onset', 'cg_offset']
allcg_by_month = {}
for month_num, data in month.items():
    # Calculate lag time between infant babbling --> caregiver response
    bhlist = []
    for sub in subjects:
        bhlist.extend(_cg_onsets_during_infant_turns(data[data["sub"] == sub], sub))
    df_latency = pd.DataFrame(bhlist, columns=latency_columns)

    # Keep only the earliest caregiver onset for each infant vocalisation.
    # Done in one pass on new frames: DataFrame.append was removed in pandas 2.0,
    # and sorting/dropping in place on a filtered slice is unreliable.
    df_latency_2 = (
        df_latency
        .sort_values("cg_onset", kind="stable")
        .drop_duplicates(subset=["sub", "inf_onset"], keep="first")
    )

    # Every infant vocalisation, with the interruption details attached where
    # one was found; vocalisations without a match get interrupt = 0.
    all_inf = (
        data[(data["tier"] == "VCV") & (data['cat'] != 'Other')]
        .rename(columns={"onset": "inf_onset", "offset": "inf_offset"})
        .merge(df_latency_2, on=['sub', 'cat', 'inf_onset', "inf_offset"], how='left')
        .loc[:, latency_columns]
        .assign(
            interrupt=lambda frame: frame['interrupt'].fillna(0),
            CV=lambda frame: np.where((frame["cat"] == "V"), "V", "C"),
        )
    )
    allcg_by_month[month_num] = all_inf

# Explicit names instead of writing into globals(): later cells use these two.
allcg5 = allcg_by_month[5]
allcg10 = allcg_by_month[10]

### Infants interrupt caregivers

In [9]:
def _infant_onsets_during_cg_turns(subject_frame, sub):
    """List every (caregiver turn, infant vocalisation) pair where the infant starts inside the turn.

    Caregiver turns are rows with tier "CG"; infant vocalisations are rows with
    tier "VCV" and cat other than "Other". A pair qualifies when
    cg_onset <= inf_onset < cg_offset. Each returned row is
    [sub, cat, lag, interrupt, inf_onset, inf_offset, cgtype, cg_onset, cg_offset]
    with lag = cg_offset - inf_onset and interrupt = 1.
    """
    is_caregiver = subject_frame["tier"] == "CG"
    is_infant = (subject_frame["tier"] == "VCV") & (subject_frame["cat"] != "Other")
    caregiver_rows = subject_frame[is_caregiver]
    infant_rows = subject_frame[is_infant]
    infant_turns = list(zip(infant_rows["cat"], infant_rows["onset"], infant_rows["offset"]))
    return [
        [sub, cat, cg_offset - inf_onset, 1, inf_onset, inf_offset, cg, cg_onset, cg_offset]
        for cg, cg_onset, cg_offset in zip(
            caregiver_rows["cat"], caregiver_rows["onset"], caregiver_rows["offset"]
        )
        for cat, inf_onset, inf_offset in infant_turns
        if cg_onset <= inf_onset < cg_offset
    ]


for month_num, data in month.items():
    # Calculate lag time between caregiver speech ----> infant babbling
    bhlist = [
        row
        for sub in subjects
        for row in _infant_onsets_during_cg_turns(data[data["sub"] == sub], sub)
    ]
    df_latency = pd.DataFrame(
        bhlist,
        columns=['sub', 'cat', 'lag', "interrupt", 'inf_onset', 'inf_offset',
                 'cgtype', 'cg_onset', 'cg_offset'],
    )

    # Left-join the matches onto every infant vocalisation; rows without a
    # match keep missing details and get interrupt = 0.
    infant_only = data[(data["tier"] == "VCV") & (data['cat'] != 'Other')]
    infant_only = infant_only.rename(columns={"onset": "inf_onset", "offset": "inf_offset"})
    joined = infant_only.merge(
        df_latency, on=['sub', 'cat', 'inf_onset', "inf_offset"], how='left'
    )
    all_inf = joined[["sub", "cat", "lag", "interrupt", "inf_onset", 'inf_offset',
                      "cgtype", 'cg_onset', 'cg_offset']].assign(
        interrupt=lambda frame: frame['interrupt'].fillna(0)
    )
    #all_inf["CV"] = np.where((all_inf["cat"] == "V"), "V", "C")
    globals()['allinf%s' % month_num] = all_inf

### Likelihood of interruption: proportion of babbling interrupted by caregivers at 5 and 10 months
### Magnitude of interruption: duration of caregiver interruptions at 5 and 10 months

In [10]:
# Label each month's caregiver-interruption frame, then stack both months
# into one table and save it.
for labelled_frame, month_label in ((allcg5, "Five"), (allcg10, "Ten")):
    labelled_frame["month"] = month_label
allcg = pd.concat([allcg5, allcg10], axis=0, ignore_index=True)
allcg.to_csv("allcg.csv", index=False)

In [11]:
# Label each month's infant-interruption frame, then stack both months
# into one table and save it.
for labelled_frame, month_label in ((allinf5, "Five"), (allinf10, "Ten")):
    labelled_frame["month"] = month_label
allinf = pd.concat([allinf5, allinf10], axis=0, ignore_index=True)
allinf.to_csv("allinf.csv", index=False)

In [12]:
# Overall: caregiver-interruption rows followed by infant-interruption rows,
# built per month and for both months together; only the combined table is saved.
alloverall5, alloverall10, alloverall = (
    pd.concat([caregiver_part, infant_part], axis=0, ignore_index=True)
    for caregiver_part, infant_part in (
        (allcg5, allinf5),
        (allcg10, allinf10),
        (allcg, allinf),
    )
)
alloverall.to_csv("alloverall.csv", index=False)

#### Both V and C

In [13]:
# Caregiver interrupt infant
# Per subject and month: number of rows ("total"), number of rows flagged
# interrupt == 1 ("interrupt"), their ratio ("proportion") and the mean lag of
# the flagged rows ("duration").
month2 = {5: allcg5, 10: allcg10}
babbling_by_month = {}
for month_num, data in month2.items():
    count = []
    for sub in subjects:
        subject_frame = data[data["sub"] == sub]
        total = len(subject_frame)
        interrupted = subject_frame[subject_frame["interrupt"] == 1]
        # NaN (not the string "NA") keeps the column numeric, so the
        # difference scores below work even when a subject has no rows.
        proportion = len(interrupted) / total if total > 0 else np.nan
        duration = interrupted["lag"].mean()  # NaN when nothing was interrupted
        count.append([sub, month_num, total, len(interrupted), proportion, duration])
    babbling_by_month[month_num] = pd.DataFrame(
        count, columns=['sub', 'month', 'total', 'interrupt', "proportion", "duration"]
    )

# Explicit names instead of writing into globals().
babbling5 = babbling_by_month[5]
babbling10 = babbling_by_month[10]

# Long format: one row per subject per month.
cg_interruption_long = pd.concat([babbling5, babbling10], ignore_index=True, axis=0)

# Wide format: one row per subject, month-suffixed columns side by side.
babbling5 = babbling5.rename(columns={"total": "total5", "interrupt": "interrupt5", "proportion": "proportion5", "duration": "duration5"})
babbling10 = babbling10.rename(columns={"total": "total10", "interrupt": "interrupt10", "proportion": "proportion10", "duration": "duration10"})

cg_interruption = babbling5.merge(babbling10, on=['sub'], how='left')
cg_interruption["proportion_difference_score"] = cg_interruption["proportion10"] - cg_interruption["proportion5"]
cg_interruption["duration_difference_score"] = cg_interruption["duration10"] - cg_interruption["duration5"]

cg_interruption_long.to_csv("cg_interruption_long.csv", index=False)
cg_interruption.to_csv("cg_interruption.csv", index=False)  # make graphs in R

In [14]:
# Infant interrupt caregiver
# Per subject and month: number of rows ("total"), number of rows flagged
# interrupt == 1 ("interrupt"), their ratio ("proportion") and the mean lag of
# the flagged rows ("duration").
month2 = {5: allinf5, 10: allinf10}
babbling_by_month = {}
for month_num, data in month2.items():
    count = []
    for sub in subjects:
        subject_frame = data[data["sub"] == sub]
        total = len(subject_frame)
        interrupted = subject_frame[subject_frame["interrupt"] == 1]
        # NaN (not the string "NA") keeps the column numeric, so the
        # difference scores below work even when a subject has no rows.
        proportion = len(interrupted) / total if total > 0 else np.nan
        duration = interrupted["lag"].mean()  # NaN when nothing was interrupted
        count.append([sub, month_num, total, len(interrupted), proportion, duration])
    babbling_by_month[month_num] = pd.DataFrame(
        count, columns=['sub', 'month', 'total', 'interrupt', "proportion", "duration"]
    )

# Explicit names instead of writing into globals().
babbling5 = babbling_by_month[5]
babbling10 = babbling_by_month[10]

# Long format: one row per subject per month.
inf_interruption_long = pd.concat([babbling5, babbling10], ignore_index=True, axis=0)

# Wide format: one row per subject, month-suffixed columns side by side.
babbling5 = babbling5.rename(columns={"total": "total5", "interrupt": "interrupt5", "proportion": "proportion5", "duration": "duration5"})
babbling10 = babbling10.rename(columns={"total": "total10", "interrupt": "interrupt10", "proportion": "proportion10", "duration": "duration10"})

inf_interruption = babbling5.merge(babbling10, on=['sub'], how='left')
inf_interruption["proportion_difference_score"] = inf_interruption["proportion10"] - inf_interruption["proportion5"]
inf_interruption["duration_difference_score"] = inf_interruption["duration10"] - inf_interruption["duration5"]

inf_interruption_long.to_csv("inf_interruption_long.csv", index=False)
inf_interruption.to_csv("inf_interruption.csv", index=False)  # make graphs in R

In [15]:
# Overall
# Per subject and month, over the stacked caregiver + infant interruption rows:
# number of rows ("total"), number of rows flagged interrupt == 1 ("interrupt"),
# their ratio ("proportion") and the mean lag of the flagged rows ("duration").
month2 = {5: alloverall5, 10: alloverall10}
babbling_by_month = {}
for month_num, data in month2.items():
    count = []
    for sub in subjects:
        subject_frame = data[data["sub"] == sub]
        total = len(subject_frame)
        interrupted = subject_frame[subject_frame["interrupt"] == 1]
        # NaN (not the string "NA") keeps the column numeric, so the
        # difference scores below work even when a subject has no rows.
        proportion = len(interrupted) / total if total > 0 else np.nan
        duration = interrupted["lag"].mean()  # NaN when nothing was interrupted
        count.append([sub, month_num, total, len(interrupted), proportion, duration])
    babbling_by_month[month_num] = pd.DataFrame(
        count, columns=['sub', 'month', 'total', 'interrupt', "proportion", "duration"]
    )

# Explicit names instead of writing into globals().
babbling5 = babbling_by_month[5]
babbling10 = babbling_by_month[10]

# Long format: one row per subject per month.
overall_interruption_long = pd.concat([babbling5, babbling10], ignore_index=True, axis=0)

# Wide format: one row per subject, month-suffixed columns side by side.
babbling5 = babbling5.rename(columns={"total": "total5", "interrupt": "interrupt5", "proportion": "proportion5", "duration": "duration5"})
babbling10 = babbling10.rename(columns={"total": "total10", "interrupt": "interrupt10", "proportion": "proportion10", "duration": "duration10"})

overall_interruption = babbling5.merge(babbling10, on=['sub'], how='left')
overall_interruption["proportion_difference_score"] = overall_interruption["proportion10"] - overall_interruption["proportion5"]
overall_interruption["duration_difference_score"] = overall_interruption["duration10"] - overall_interruption["duration5"]

overall_interruption_long.to_csv("overall_interruption_long.csv", index=False)
overall_interruption.to_csv("overall_interruption.csv", index=False)  # make graphs in R

#### Vowel

In [46]:
# Vowel-only rows (CV == "V") for each month, stacked and saved.
# `all5` / `all10` are not defined anywhere in this notebook, so the original
# lines fail on a fresh kernel. allcg5 / allcg10 are the frames that carry the
# "CV" column, so they are used as the source.
# NOTE(review): confirm allcg5 / allcg10 are the frames `all5` / `all10` referred to.
all5_V = allcg5[allcg5["CV"] == "V"]
all10_V = allcg10[allcg10["CV"] == "V"]
all_V = pd.concat([all5_V, all10_V], ignore_index=True, axis=0)
all_V.to_csv("all_V.csv", index=False)

In [45]:
# Vowel rows only. Per subject and month: number of rows ("total"), number of
# rows flagged interrupt == 1 ("interrupt"), their ratio ("proportion") and the
# mean lag of the flagged rows ("duration").
month2 = {5: all5_V, 10: all10_V}
babbling_by_month = {}
for month_num, data in month2.items():
    count = []
    for sub in subjects:
        subject_frame = data[data["sub"] == sub]
        total = len(subject_frame)
        interrupted = subject_frame[subject_frame["interrupt"] == 1]
        # NaN (not the string "NA") keeps the column numeric, so the
        # difference scores below work even when a subject has no rows.
        proportion = len(interrupted) / total if total > 0 else np.nan
        duration = interrupted["lag"].mean()  # NaN when nothing was interrupted
        count.append([sub, month_num, total, len(interrupted), proportion, duration])
    babbling_by_month[month_num] = pd.DataFrame(
        count, columns=['sub', 'month', 'total', 'interrupt', "proportion", "duration"]
    )

# Explicit names instead of writing into globals().
babbling5 = babbling_by_month[5]
babbling10 = babbling_by_month[10]

# Long format: one row per subject per month.
interruption_long = pd.concat([babbling5, babbling10], ignore_index=True, axis=0)

# Wide format: one row per subject, month-suffixed columns side by side.
babbling5 = babbling5.rename(columns={"total": "total5", "interrupt": "interrupt5", "proportion": "proportion5", "duration": "duration5"})
babbling10 = babbling10.rename(columns={"total": "total10", "interrupt": "interrupt10", "proportion": "proportion10", "duration": "duration10"})

interruption = babbling5.merge(babbling10, on=['sub'], how='left')
interruption["proportion_difference_score"] = interruption["proportion10"] - interruption["proportion5"]
interruption["duration_difference_score"] = interruption["duration10"] - interruption["duration5"]

interruption_long.to_csv("interruption_long_V.csv", index=False)
interruption.to_csv("interruption_V.csv", index=False)  # make graphs in R

#### Consonant

In [47]:
# Consonant rows (CV == "C") for each month, stacked and saved.
# `all5` / `all10` are not defined anywhere in this notebook, so the original
# lines fail on a fresh kernel. allcg5 / allcg10 are the frames that carry the
# "CV" column, so they are used as the source.
# NOTE(review): confirm allcg5 / allcg10 are the frames `all5` / `all10` referred to.
all5_C = allcg5[allcg5["CV"] == "C"]
all10_C = allcg10[allcg10["CV"] == "C"]
all_C = pd.concat([all5_C, all10_C], ignore_index=True, axis=0)
all_C.to_csv("all_C.csv", index=False)

In [51]:
# Consonant rows only. Per subject and month: number of rows ("total"), number
# of rows flagged interrupt == 1 ("interrupt"), their ratio ("proportion") and
# the mean lag of the flagged rows ("duration").
month2 = {5: all5_C, 10: all10_C}
babbling_by_month = {}
for month_num, data in month2.items():
    count = []
    for sub in subjects:
        subject_frame = data[data["sub"] == sub]
        total = len(subject_frame)
        interrupted = subject_frame[subject_frame["interrupt"] == 1]
        # NaN (not the string "NA") keeps the column numeric, so the
        # difference scores below work even when a subject has no rows.
        proportion = len(interrupted) / total if total > 0 else np.nan
        duration = interrupted["lag"].mean()  # NaN when nothing was interrupted
        count.append([sub, month_num, total, len(interrupted), proportion, duration])
    babbling_by_month[month_num] = pd.DataFrame(
        count, columns=['sub', 'month', 'total', 'interrupt', "proportion", "duration"]
    )

# Explicit names instead of writing into globals().
babbling5 = babbling_by_month[5]
babbling10 = babbling_by_month[10]

# Long format: one row per subject per month.
interruption_long = pd.concat([babbling5, babbling10], ignore_index=True, axis=0)

# Wide format: one row per subject, month-suffixed columns side by side.
babbling5 = babbling5.rename(columns={"total": "total5", "interrupt": "interrupt5", "proportion": "proportion5", "duration": "duration5"})
babbling10 = babbling10.rename(columns={"total": "total10", "interrupt": "interrupt10", "proportion": "proportion10", "duration": "duration10"})

interruption = babbling5.merge(babbling10, on=['sub'], how='left')
# This score was commented out in the original -- presumably because "NA"
# strings made the subtraction fail (TODO confirm). With NaN placeholders it is
# computable, matching the vowel summary.
interruption["proportion_difference_score"] = interruption["proportion10"] - interruption["proportion5"]
interruption["duration_difference_score"] = interruption["duration10"] - interruption["duration5"]

interruption_long.to_csv("interruption_long_C.csv", index=False)
interruption.to_csv("interruption_C.csv", index=False)  # make graphs in R

In [None]:
# Display the long-format interruption summary currently bound to `interruption_long`.
interruption_long

In [193]:
# Ad-hoc inspection of `all_inf`, the merged frame left bound by the
# interruption loops; the two lines below are disabled experiments.
#df_latency_2 = df_latency_2.dropna()
#df_latency_2 = df_latency_2.astype(str)
all_inf

Unnamed: 0,sub,cat,lag,interrupt,inf_onset,inf_offset,cgtype,cg_onset,cg_offset
0,4,V,,0.0,10.750,10.980,,,
1,4,V,,0.0,17.780,19.500,,,
2,4,V,,0.0,20.145,23.095,,,
3,4,CV,,0.0,26.615,26.955,,,
4,4,V,,0.0,27.890,28.390,,,
...,...,...,...,...,...,...,...,...,...
2639,118,V,,0.0,1773.670,1774.650,,,
2640,118,V,,0.0,1775.840,1776.130,,,
2641,118,V,,0.0,1791.745,1791.915,,,
2642,118,V,,0.0,1801.870,1802.900,,,
