In [2]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
import statsmodels.stats.multicomp as ml

  from pandas.core import datetools


In [3]:
# Read in data file, make it into a dataframe
f=pd.read_csv("../deception_data/n151_2017/liwc_au.csv")

In [4]:
# keep only the AU's, filename, and question
keep_col = ['Filename','question','AU01_r','AU02_r','AU04_r','AU05_r','AU06_r','AU07_r','AU09_r','AU10_r','AU12_r','AU14_r','AU15_r','AU17_r','AU20_r','AU23_r','AU25_r','AU26_r','AU45_r']
new_f = f[keep_col]
new_f.to_csv("old_data_au.csv", index=False)

In [5]:
## Make a subset of the data that includes only the baseline questions
df = new_f[(new_f['question'] < 5)]

In [6]:
print(df)

                                               Filename  question    AU01_r  \
0       2016-03-11_16-16-40-42-W-B-pamela3_openface.csv         0  0.178387   
1       2016-03-11_16-16-40-42-W-B-pamela3_openface.csv         1  0.272211   
2       2016-03-11_16-16-40-42-W-B-pamela3_openface.csv         2  0.317124   
3       2016-03-11_16-16-40-42-W-B-pamela3_openface.csv         3  0.002513   
4       2016-03-11_16-16-40-42-W-B-pamela3_openface.csv         4  0.006746   
20    2016-03-15_18-15-13-468-W-T-Sprinkles_openface...         0  0.117715   
21    2016-03-15_18-15-13-468-W-T-Sprinkles_openface...         1  0.305589   
22    2016-03-15_18-15-13-468-W-T-Sprinkles_openface...         2  0.073655   
23    2016-03-15_18-15-13-468-W-T-Sprinkles_openface...         3  0.000000   
24    2016-03-15_18-15-13-468-W-T-Sprinkles_openface...         4  0.216964   
46     2016-03-16_10-05-49-922-W-T-tarples_openface.csv         0  0.514009   
47     2016-03-16_10-05-49-922-W-T-tarples_openface.

In [7]:
# Function to label each  question based on mental state
def label_question (row):
   if row['question'] == 0 :
      return 'ambiguous'
   if row['question'] == 1 :
      return 'baseline_truth'
   if row['question'] == 2 :
      return 'memory_recall'
   if row['question'] == 3 :
      return 'cognitive'
   if row['question'] == 4 :
      return 'uncomfortable'
   return 'Other'

In [8]:
df['question_label'] = df.apply(lambda row: label_question (row),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [9]:
df.to_csv("baseline_questions.csv", index=False)

In [10]:
df

Unnamed: 0,Filename,question,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,AU09_r,AU10_r,AU12_r,AU14_r,AU15_r,AU17_r,AU20_r,AU23_r,AU25_r,AU26_r,AU45_r,question_label
0,2016-03-11_16-16-40-42-W-B-pamela3_openface.csv,0,0.178387,0.004635,0.049683,0.061200,1.195680,1.242525,0.419095,1.071749,0.768604,0.261999,0.209442,0.556386,0.017700,0.003646,1.415482,0.357103,0.199328,ambiguous
1,2016-03-11_16-16-40-42-W-B-pamela3_openface.csv,1,0.272211,0.025835,0.060814,0.311568,1.307971,1.200932,0.558005,0.898411,0.400392,0.431994,0.415325,0.871914,0.512863,0.172006,0.552718,0.086503,0.455631,baseline_truth
2,2016-03-11_16-16-40-42-W-B-pamela3_openface.csv,2,0.317124,0.014803,0.022903,0.220756,1.143196,1.146096,0.186832,0.611617,0.373357,0.266364,0.258468,0.764018,0.173488,0.079466,0.380836,0.117277,0.275770,memory_recall
3,2016-03-11_16-16-40-42-W-B-pamela3_openface.csv,3,0.002513,0.000000,0.000000,1.466222,1.175159,0.188737,0.010810,0.494248,0.685809,0.203029,0.134933,0.713377,0.000959,0.035729,0.197267,0.111179,0.019289,cognitive
4,2016-03-11_16-16-40-42-W-B-pamela3_openface.csv,4,0.006746,0.000000,0.023269,0.000000,2.273398,2.182480,0.112045,1.032638,1.470397,0.473310,0.005836,0.832375,0.834122,0.055811,1.019679,0.594777,0.221349,uncomfortable
20,2016-03-15_18-15-13-468-W-T-Sprinkles_openface...,0,0.117715,0.071147,0.008875,0.063494,0.845443,0.000000,0.148413,1.562943,1.407170,1.406566,0.000000,0.020687,0.034060,0.000000,1.947890,0.301049,0.220813,ambiguous
21,2016-03-15_18-15-13-468-W-T-Sprinkles_openface...,1,0.305589,0.105979,0.016421,0.016969,0.503434,0.000379,0.065374,1.231465,0.948083,0.992866,0.001866,0.044544,0.022713,0.000294,2.138503,0.603647,0.092804,baseline_truth
22,2016-03-15_18-15-13-468-W-T-Sprinkles_openface...,2,0.073655,0.011640,0.000076,0.014194,0.782267,0.003441,0.106910,1.131270,0.917442,1.322814,0.064621,0.481917,0.014806,0.187415,1.211595,0.354210,0.081510,memory_recall
23,2016-03-15_18-15-13-468-W-T-Sprinkles_openface...,3,0.000000,0.000000,0.000000,0.181254,0.556658,0.000000,0.000000,0.726890,0.691916,1.436808,0.136765,1.004438,0.032381,0.012470,0.058188,0.018003,0.019195,cognitive
24,2016-03-15_18-15-13-468-W-T-Sprinkles_openface...,4,0.216964,0.046988,0.049695,0.031104,0.621037,0.000000,0.062262,1.017987,0.685544,1.344631,0.130293,0.627879,0.037679,0.304660,0.844138,0.239854,0.031893,uncomfortable


In [11]:
# Partition the data based on question label 
amb = df[(df['question_label'] == 'ambiguous')]
bt = df[(df['question_label'] == 'baseline_truth')]
mr = df[(df['question_label'] == 'memory_recall')]
cog = df[(df['question_label'] == 'cognitive')]
unc = df[(df['question_label'] == 'uncomfortable')]

df2 = pd.DataFrame([])
df3 = pd.DataFrame([])

# display average au levels for each type of baseline question
for d in [amb,bt,mr,cog,unc]:
    print('----')
    print(d.mean())
    df2 = df2.append(d.mean(), ignore_index=True)
    df3 = df3.append(d, ignore_index=True)

----
question    0.000000
AU01_r      0.377949
AU02_r      0.138922
AU04_r      0.504900
AU05_r      0.087904
AU06_r      0.507868
AU07_r      0.824777
AU09_r      0.128627
AU10_r      0.704011
AU12_r      0.633487
AU14_r      0.655125
AU15_r      0.128026
AU17_r      0.533901
AU20_r      0.102642
AU23_r      0.149494
AU25_r      0.959830
AU26_r      0.675961
AU45_r      0.406900
dtype: float64
----
question    1.000000
AU01_r      0.348787
AU02_r      0.119168
AU04_r      0.497193
AU05_r      0.132715
AU06_r      0.569429
AU07_r      0.860002
AU09_r      0.194437
AU10_r      0.751876
AU12_r      0.667065
AU14_r      0.707862
AU15_r      0.166938
AU17_r      0.593418
AU20_r      0.127039
AU23_r      0.217954
AU25_r      0.950824
AU26_r      0.646382
AU45_r      0.371118
dtype: float64
----
question    2.000000
AU01_r      0.384708
AU02_r      0.153478
AU04_r      0.500740
AU05_r      0.160131
AU06_r      0.554814
AU07_r      0.851231
AU09_r      0.153901
AU10_r      0.743696
AU12_r    

In [12]:
print(amb)

                                               Filename  question    AU01_r  \
0       2016-03-11_16-16-40-42-W-B-pamela3_openface.csv         0  0.178387   
20    2016-03-15_18-15-13-468-W-T-Sprinkles_openface...         0  0.117715   
46     2016-03-16_10-05-49-922-W-T-tarples_openface.csv         0  0.514009   
58       2016-06-07_12-20-38-706-W-T-pkaye_openface.csv         0  0.168293   
69    2016-06-08_13-59-09-521-W-B-sethdeco_openface.csv         0  0.064951   
78      2016-06-08_14-28-32-854-W-B-JimBob_openface.csv         0  0.162699   
83    2016-06-08_19-02-40-950-W-B-stars33hoc_openfac...         0  0.021891   
99       2016-06-09_09-05-50-814-W-T-chomp_openface.csv         0  0.257186   
109   2016-06-09_09-16-19-112-W-T-pacers101_openface...         0  0.214033   
116   2016-06-09_10-21-37-95-W-B-describer_openface.csv         0  0.004692   
126   2016-06-09_14-57-19-201-W-B-RobbyRoo_openface.csv         0  0.759732   
146     2016-06-10_11-15-58-679-W-T-Obes86_openface.

In [13]:
# Use the ANOVA one way test for each AU level
for c in df3.iloc[:,2:-1]:
    print(c)
    lm = ols('df3[c] ~ question',data=df3).fit()

    table = sm.stats.anova_lm(lm, typ=1) # Type 2 ANOVA DataFrame
    print(table)
    print('-------\n')

AU01_r


  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


             df      sum_sq   mean_sq         F    PR(>F)
question    1.0    0.081844  0.081844  0.443142  0.505839
Residual  670.0  123.742312  0.184690       NaN       NaN
-------

AU02_r
             df     sum_sq   mean_sq         F   PR(>F)
question    1.0   0.023023  0.023023  0.459953  0.49788
Residual  670.0  33.536840  0.050055       NaN      NaN
-------

AU04_r
             df      sum_sq   mean_sq         F    PR(>F)
question    1.0    0.011896  0.011896  0.024311  0.876144
Residual  670.0  327.860967  0.489345       NaN       NaN
-------

AU05_r
             df     sum_sq   mean_sq         F    PR(>F)
question    1.0   0.118136  0.118136  2.695868  0.101078
Residual  670.0  29.360258  0.043821       NaN       NaN
-------

AU06_r
             df      sum_sq   mean_sq         F    PR(>F)
question    1.0    0.050071  0.050071  0.183305  0.668686
Residual  670.0  183.015205  0.273157       NaN       NaN
-------

AU07_r
             df      sum_sq   mean_sq         F    PR(>F)
q

In [14]:
print(df2)

     AU01_r    AU02_r    AU04_r    AU05_r    AU06_r    AU07_r    AU09_r  \
0  0.377949  0.138922  0.504900  0.087904  0.507868  0.824777  0.128627   
1  0.348787  0.119168  0.497193  0.132715  0.569429  0.860002  0.194437   
2  0.384708  0.153478  0.500740  0.160131  0.554814  0.851231  0.153901   
3  0.377287  0.121854  0.488620  0.144240  0.495544  0.787949  0.149599   
4  0.324473  0.116618  0.494307  0.128135  0.513294  0.758544  0.153872   

     AU10_r    AU12_r    AU14_r    AU15_r    AU17_r    AU20_r    AU23_r  \
0  0.704011  0.633487  0.655125  0.128026  0.533901  0.102642  0.149494   
1  0.751876  0.667065  0.707862  0.166938  0.593418  0.127039  0.217954   
2  0.743696  0.628845  0.671651  0.189354  0.579411  0.112797  0.176843   
3  0.672088  0.580973  0.618082  0.145945  0.607179  0.123613  0.180206   
4  0.683592  0.644201  0.709391  0.146494  0.584188  0.137097  0.208611   

     AU25_r    AU26_r    AU45_r  question  
0  0.959830  0.675961  0.406900       0.0  
1  0.95082

In [15]:
ambAU01 = amb['AU01_r']
print(ambAU01)

0       0.178387
20      0.117715
46      0.514009
58      0.168293
69      0.064951
78      0.162699
83      0.021891
99      0.257186
109     0.214033
116     0.004692
126     0.759732
146     0.734379
157     0.635193
168     0.008072
173     0.060909
186     0.428735
197     0.020642
208     0.078447
225     0.075611
245     0.000000
276     0.351256
284     0.397565
291     0.536762
301     0.757283
309     0.751041
322     0.364989
337     0.151198
346     0.092985
357     0.062107
372     1.388837
          ...   
1503    0.564511
1518    0.051254
1548    0.185202
1563    1.058331
1573    2.624220
1574    0.211174
1583    0.000000
1592    0.585629
1608    0.058093
1624    0.007514
1642    0.274633
1663    0.327101
1683    0.177152
1692    0.000000
1706    0.000000
1713    0.000000
1723    0.933596
1750    0.422346
1760    0.289140
1779    0.000000
1797    0.000000
1806    0.439346
1840    4.175013
1850    1.211603
1870    0.107982
1881    0.198526
1896    0.184182
1905    0.2460

In [16]:
# Use the ANOVA one way test for each AU level

for c in df3.iloc[:,2:-1]:
    print(c)
    F, p = stats.f_oneway(amb[c],bt[c],mr[c],cog[c],unc[c])
    print(F,p)
    print('-------\n')
    
                     


AU01_r
0.465285894111 0.761243633126
-------

AU02_r
0.667106610707 0.614985374486
-------

AU04_r
0.0104499631283 0.999783988019
-------

AU05_r
2.26395431837 0.0608910266745
-------

AU06_r
0.505636591222 0.731618348618
-------

AU07_r
0.414872917235 0.79798589882
-------

AU09_r
1.43685987481 0.220126706612
-------

AU10_r
0.464257585081 0.761997286947
-------

AU12_r
0.403304244427 0.806330857974
-------

AU14_r
0.444273735634 0.776616297747
-------

AU15_r
1.67610536167 0.153741010307
-------

AU17_r
0.524042754606 0.718103855778
-------

AU20_r
1.1729749033 0.321455495655
-------

AU23_r
1.59062903528 0.175032483024
-------

AU25_r
2.86982480255 0.0224307150592
-------

AU26_r
0.953134092862 0.43270209701
-------

AU45_r
0.964090414621 0.426544194354
-------



In [17]:
# Now that we have a p-value for AU25 of < .05 use Tukey's test to see where
# the significance is coming from

df4 = df3[['AU25_r', 'question']].copy()

