In [1]:
#Load modules and import data from 3 experiments
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sklearn
from __future__ import division

def load_df(filename):
    df = pd.DataFrame.from_csv(filename)
    #df = pd.DataFrame.from_csv('C:/Experiments/JK302b/dataOut/bigPJK302b_6subs.csv')
    #df = pd.DataFrame.from_csv('C:/Experiments/JK307/dataOut/bigP_r1.csv')

    #Relabel babble condition for clarity
    df.loc[df['VideoCond'] == 'AV',('VideoCond')] = 'Audiovisual'
    df.loc[df['VideoCond'] == 'AO',('VideoCond')] = 'Auditory Only'


    #Make WordIdx start from 1, not zero
    df.WordIdx = df.WordIdx+1

    # Fix phoneme alignment issue (correct word given but phonemes assigned to wrong word)
    # Find where words match
    matchIdx = df['SourceWord'] == df['TargetWord']
    #Set the source phonemes to match the target
    df.loc[matchIdx,('SourcePhoneme')] = df.loc[matchIdx,('TargetPhoneme')] 
    #Reset the measure of phoneme accuracy
    df.loc[:,'PhonemeHitBool'] = df['SourcePhoneme'] == df['TargetPhoneme']
    #Set a measure all phonemes matching
    df.loc[:,('AllPhonsMatch')] = df.groupby('WordCount')['PhonemeHitBool'].transform(lambda x: np.mean(x) ==1)
    #Set a measure all phonemes missing
    df.loc[:,('AllPhonsMiss')] = df.groupby('WordCount')['PhonemeHitBool'].transform(lambda x: np.mean(x) ==0)
    # Make a column for the number of words in the sentence
    df['NumWordsInSentence'] = df.groupby(['SentenceCount'])['WordIdx'].transform(max)
    # Make a column for the number of phonemes in the word
    df['NumPhonemesInWord'] = df.groupby(['WordCount'])['PhonemeIndex'].transform(lambda x: np.max(x)-np.min(x)+1)
    #Position of the word in relation to the last word of the sentence
    df['WordFromLast'] = df['NumWordsInSentence']-df['WordIdx']
    # Calculate dB from RMS values
    import math
    def amp2db(scalar):
        return 20 * math.log10(scalar)
    df['SpeechdB'] =  df['SpeechRMS'].apply(lambda x: amp2db(x) if x >=0 else 0)
    df['SpeechdBRel'] = df[['SpeechdB','SentenceCount']].groupby('SentenceCount').apply(lambda x: x-np.mean(x))['SpeechdB']
    
    dfPT = pd.read_excel(os.path.normpath('C:\TCDTIMIT\Tables\Custom\TablesPhoneme_r5.xlsx'),encoding='latin-1')
    #df = df[df['TargetPhoneme'].isin(dfPT['CMU Phonemes'])]
    df = pd.merge(df, dfPT, how='inner', left_on = 'TargetPhoneme',right_on = 'CMU Phonemes' )
    return df
#df.rename(columns={'0':'PENNPOS'}, inplace=True)
#df.rename(columns={'0.1':'UPOS'}, inplace=True)
df_name = []
df_name.append('C:/Experiments/JK302/dataOut/bigP_24_r3.csv')
df_name.append('C:/Experiments/JK302b/dataOut/bigPJK302b_6subs.csv')
df_name.append('C:/Experiments/JK307/dataOut/bigP_r1.csv')
df_name.append('C:/Experiments/JK310/dataOut/bigP_12sub.csv')
df_name.append('C:/Experiments/JK311/dataOut/bigP_12sub.csv')
df_label = []
df_label.append('Main')
df_label.append('Babble_Shaped_Noise')
df_label.append('Flattened_Volume')
df_label.append('Probe Task Delay')
df_label.append('Probe Task BeforeAfter')
dfs = []
dfs.append(load_df(df_name[0]))
dfs.append(load_df(df_name[1]))
dfs.append(load_df(df_name[2]))
dfs.append(load_df(df_name[3]))
dfs.append(load_df(df_name[4]))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
df = dfs[0]

In [3]:
df.keys


<bound method DataFrame.keys of         level_0  Unnamed: 0.1  index  TotalTrialOrder Talker SentenceType  \
0             0             0      0                1   s01M           sx   
1             3             3      3                4   s01M           si   
2             3             3      3                4   s01M           si   
3             6             6      6                7   s01M           si   
4             7             7      7                8   s01M           si   
5             9             9      9               10   s01M           si   
6            12            12     12               13   s01M           sx   
7            12            12     12               13   s01M           sx   
8            13            13     13               14   s01M           si   
9            14            14     14               15   s01M           si   
10           17            17     17               18   s01M           si   
11           17            17     17        

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= sklearn.model_selection.train_test_split(
    X, y, 
    test_size=0.5, 
    random_state=0) 

print("No. Rows in training set:\t", len(X_train))
print("No. Rows in testing set:\t" , len(X_test))