In [92]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt

In [4]:
#### Game Plan

#1 correlate length of memories/adjectives with relationship satisfaction and behav pref indices

#2 see if we can predict labels (parent/friend) from memories/adjectives

#3 run NNMF on data to see what latent topics emerge separately for parents and friends

#4 score each individual on topics according to word embeddings and see whether that predicts prefs

#5 use similarity between word embeddings to create networks (overall/topic focused) and test diff between par/fri

#6 look at differences between semantic networks, relate to prefs

In [105]:
pref = pd.read_csv("pref.csv")
parentText = pd.read_csv("parentText.csv")
friendText = pd.read_csv("friendText.csv")
    
print(pref)
print(parentText)
print(friendText)

        ID      alphaSGT  pref  sex  simParDist  simFriDist     parRQ  friRQ  \
0   PP9000  3.679717e-02     0    1    0.925367    0.887572  4.214286   4.76   
1   PP9001  6.755337e-01     1    1    0.571023    0.291086  3.857143   4.32   
2   PP9002  3.707466e-01     0    1    0.479710    0.169244  4.285714   4.36   
3   PP9003  3.589495e-01     0    0    0.473108    0.140112  3.750000   4.56   
4   PP9004  1.000000e+00     1    1    0.457621    0.319780  4.107143   3.52   
5   PP9005  7.483412e-01     1    1    0.036686    0.435534  3.678571   3.92   
6   PP9006  1.000000e-07     0    0    0.402852    0.345285  3.500000   4.16   
7   PP9007  1.000000e+00     1    0    0.954828    0.362378  4.214286   4.76   
8   PP9008  6.138874e-01     1    1    0.260051    0.612110  3.321429   4.64   
9   PP9009  6.744051e-01     1    0    0.129050    0.294755  4.000000   4.32   
10  PP9010  1.722825e-01     0    0    0.863174    0.694477  4.035714   4.40   
11  PP9011  9.174529e-01     1    1    0

In [106]:
countDat = np.zeros((4, len(pref["ID"])))
for s, sub in enumerate(pref["ID"]):
    countDat[0,s] = len(TextBlob(parentText.iloc[s,1]).words)
    countDat[1,s] = len(TextBlob(parentText.iloc[s,2]).words)
    countDat[2,s] = len(TextBlob(friendText.iloc[s,1]).words)
    countDat[3,s] = len(TextBlob(friendText.iloc[s,2]).words) 
    
    
# mem = TextBlob(parentText.iloc[0,1])
# len(mem.words)

In [95]:
countDat

array([[63., 20., 24., 36., 44., 32., 61., 35., 48., 52., 32., 25., 11.,
        41., 38., 35., 69., 29., 16., 40., 54., 31., 35., 45., 34., 48.,
        22., 48., 13., 21., 46., 16., 14., 54., 47., 51., 28., 12., 45.,
        36., 25., 32., 80., 54., 31., 49.,  9., 35., 23.],
       [17.,  3., 14.,  6.,  8.,  5., 33.,  5., 16., 26.,  6.,  4.,  6.,
         3.,  5.,  8., 59.,  7.,  6.,  9., 11.,  5.,  6., 21., 22.,  6.,
         9.,  5.,  5.,  4., 13.,  3.,  8., 24., 29., 23.,  7.,  5., 10.,
        17., 10.,  4., 10.,  8.,  3., 20.,  7., 58., 11.],
       [53., 19., 30., 21., 25., 41., 51., 35., 64., 82., 31., 20., 15.,
        69., 69., 35., 37., 34., 46., 53., 45., 32., 34., 60., 35., 41.,
        13., 40., 15., 12., 54., 26., 30., 53., 47., 66., 28., 14., 59.,
        20., 25., 34., 96., 46., 46., 53., 16., 39., 51.],
       [12.,  5.,  9.,  6.,  5.,  5., 21.,  5., 16., 29.,  6.,  4.,  8.,
         3.,  4.,  6., 25.,  5.,  5., 10.,  9.,  4.,  5., 25., 16.,  6.,
         8.,  3.,  4

In [107]:
pref = pd.merge(pref,pd.DataFrame({'ID': parentText["ID"], 'parMemLen': countDat[0,:], 'parAdjLen': countDat[1,:], 'friMemLen': countDat[2,:], 'friAdjLen': countDat[3,:]}), on = "ID")

In [108]:
#plt.matshow(pref[['alphaSGT', 'parRQ', 'friRQ', 'parMemLen', 'parAdjLen', 'friMemLen', 'friAdjLen']].corr())
#plt.show()
pref[['alphaSGT', 'parRQ', 'friRQ', 'parMemLen', 'parAdjLen', 'friMemLen', 'friAdjLen']].corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,alphaSGT,parRQ,friRQ,parMemLen,parAdjLen,friMemLen,friAdjLen
alphaSGT,1.0,-0.083006,-0.215394,-0.245346,-0.227697,-0.110367,-0.234052
parRQ,-0.083006,1.0,0.367806,-0.180139,-0.0447143,-0.085258,-0.0830715
friRQ,-0.215394,0.367806,1.0,-0.157202,-0.040946,-0.0270965,0.000797512
parMemLen,-0.245346,-0.180139,-0.157202,1.0,0.473149,0.716936,0.397005
parAdjLen,-0.227697,-0.0447143,-0.040946,0.473149,1.0,0.245295,0.87135
friMemLen,-0.110367,-0.085258,-0.0270965,0.716936,0.245295,1.0,0.34276
friAdjLen,-0.234052,-0.0830715,0.000797512,0.397005,0.87135,0.34276,1.0


In [109]:
allMem = []
allAdj = []

for s,sub in enumerate(pref['ID']):
    allMem.append((parentText.iloc[s,1], 'par'))
    allMem.append((friendText.iloc[s,1], 'fri'))
    allAdj.append((parentText.iloc[s,2], 'par'))
    allAdj.append((friendText.iloc[s,2], 'fri'))

In [110]:
from textblob.classifiers import NaiveBayesClassifier
import random

In [235]:
scoresMem = []
scoresAdj = []

for k in range(5):
    random.seed(k)
    random.shuffle(allMem)
    random.shuffle(allAdj)
    clMem = NaiveBayesClassifier(allMem[0:int(.8 * len(allMem))])
    scoresMem.append(clMem.accuracy(allMem[int(.8 * len(allMem)):]))
    clAdj = NaiveBayesClassifier(allAdj[0:int(.8 * len(allAdj))])
    scoresAdj.append(clAdj.accuracy(allAdj[int(.8 * len(allAdj)):]))

In [236]:
print(scoresMem)
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scoresMem, axis = 0), (np.std(scoresMem,axis=0) * 2)))
clMem.show_informative_features(50)

[0.8, 0.85, 0.65, 0.95, 0.7]
Accuracy: 0.79 (+/- 0.21)
Most Informative Features
          contains(When) = True              par : fri    =      5.1 : 1.0
           contains(she) = True              par : fri    =      4.9 : 1.0
            contains(do) = True              par : fri    =      4.5 : 1.0
       contains(friends) = True              fri : par    =      4.1 : 1.0
      contains(favorite) = True              par : fri    =      3.3 : 1.0
        contains(around) = True              fri : par    =      3.3 : 1.0
            contains(We) = True              fri : par    =      3.3 : 1.0
            contains(my) = True              par : fri    =      3.2 : 1.0
           contains(She) = True              par : fri    =      3.1 : 1.0
             contains(I) = False             fri : par    =      3.0 : 1.0
            contains(it) = True              fri : par    =      3.0 : 1.0
      contains(together) = True              fri : par    =      2.9 : 1.0
            contain

In [237]:
print(scoresAdj)
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scoresAdj, axis = 0), (np.std(scoresAdj,axis=0) * 2)))
clAdj.show_informative_features(50)

[0.6, 0.75, 0.65, 0.75, 0.5]
Accuracy: 0.65 (+/- 0.19)
Most Informative Features
           contains(the) = True              par : fri    =      5.5 : 1.0
        contains(loving) = True              par : fri    =      4.3 : 1.0
           contains(and) = True              par : fri    =      4.1 : 1.0
   contains(trustworthy) = True              fri : par    =      3.9 : 1.0
           contains(she) = True              par : fri    =      3.3 : 1.0
            contains(my) = True              par : fri    =      3.3 : 1.0
   contains(hardworking) = True              par : fri    =      3.3 : 1.0
      contains(generous) = True              par : fri    =      2.9 : 1.0
      contains(feelings) = True              par : fri    =      2.6 : 1.0
         contains(loves) = True              par : fri    =      2.6 : 1.0
           contains(lot) = True              par : fri    =      2.6 : 1.0
           contains(but) = True              par : fri    =      2.6 : 1.0
            contain

In [239]:
#shuffle particpants instead of individual meories/adjectives. 
#doing the latter might end up in train-test leakage. 
scoresMemAlt = []
scoresAdjAlt = []
id = pref['ID']

for k in range(5):
    random.seed(k)
    random.shuffle(id)
    allMemAltTest = []
    allMemAltTrain = []
    allAdjAltTest = []
    allAdjAltTrain = []
    for s,sub in enumerate(id[0:int(.8 * len(id))]):
        allMemAltTest.append((parentText.loc[sub,'memory'], 'par'))
        allMemAltTest.append((friendText.loc[sub,'memory'], 'fri'))
        allAdjAltTest.append((parentText.loc[sub,'adjectives'], 'par'))
        allAdjAltTest.append((friendText.loc[sub,'adjectives'], 'fri'))
    for s,sub in enumerate(id[int(.8 * len(id)):]):
        allMemAltTrain.append((parentText.loc[sub,'memory'], 'par'))
        allMemAltTrain.append((friendText.loc[sub,'memory'], 'fri'))
        allAdjAltTrain.append((parentText.loc[sub,'adjectives'], 'par'))
        allAdjAltTrain.append((friendText.loc[sub,'adjectives'], 'fri'))  
    clMemAlt = NaiveBayesClassifier(allMemAltTrain)
    scoresMemAlt.append(clMemAlt.accuracy(allMemAltTest))
    clAdjAlt = NaiveBayesClassifier(allAdjAltTrain)
    scoresAdjAlt.append(clAdjAlt.accuracy(allAdjAltTest))
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i], x[j] = x[j], x[i]


In [240]:
print(scoresMemAlt)
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scoresMemAlt, axis = 0), (np.std(scoresMemAlt,axis=0) * 2)))
clMemAlt.show_informative_features(50)

[0.717948717948718, 0.7692307692307693, 0.782051282051282, 0.7051282051282052, 0.7051282051282052]
Accuracy: 0.74 (+/- 0.07)
Most Informative Features
           contains(our) = True              fri : par    =      3.0 : 1.0
            contains(my) = False             fri : par    =      3.0 : 1.0
            contains(at) = True              fri : par    =      2.6 : 1.0
           contains(out) = True              fri : par    =      2.3 : 1.0
          contains(with) = True              par : fri    =      2.3 : 1.0
           contains(and) = False             par : fri    =      2.3 : 1.0
        contains(dinner) = True              fri : par    =      2.3 : 1.0
            contains(by) = True              par : fri    =      2.3 : 1.0
            contains(at) = False             par : fri    =      1.9 : 1.0
          contains(went) = True              fri : par    =      1.8 : 1.0
            contains(it) = True              fri : par    =      1.8 : 1.0
           contains(One)

In [242]:
print(scoresAdjAlt)
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scoresAdjAlt, axis = 0), (np.std(scoresAdjAlt,axis=0) * 2)))
clAdj.show_informative_features(50)

[0.6794871794871795, 0.6282051282051282, 0.5128205128205128, 0.5641025641025641, 0.5897435897435898]
Accuracy: 0.59 (+/- 0.11)
Most Informative Features
           contains(the) = True              par : fri    =      5.5 : 1.0
        contains(loving) = True              par : fri    =      4.3 : 1.0
           contains(and) = True              par : fri    =      4.1 : 1.0
   contains(trustworthy) = True              fri : par    =      3.9 : 1.0
           contains(she) = True              par : fri    =      3.3 : 1.0
            contains(my) = True              par : fri    =      3.3 : 1.0
   contains(hardworking) = True              par : fri    =      3.3 : 1.0
      contains(generous) = True              par : fri    =      2.9 : 1.0
      contains(feelings) = True              par : fri    =      2.6 : 1.0
         contains(loves) = True              par : fri    =      2.6 : 1.0
           contains(lot) = True              par : fri    =      2.6 : 1.0
           contains(bu

In [263]:
pref['RQ_diff'] = pref['parRQ'].sub(pref['friRQ'], axis = 0)

Unnamed: 0,ID,alphaSGT,pref,sex,simParDist,simFriDist,parRQ,friRQ,out,parMemLen,parAdjLen,friMemLen,friAdjLen,RQ_diff
0,PP9039,0.03679717,0,1,0.925367,0.887572,4.214286,4.76,0,63.0,17.0,53.0,12.0,-0.545714
1,PP9001,0.6755337,1,1,0.571023,0.291086,3.857143,4.32,0,20.0,3.0,19.0,5.0,-0.462857
2,PP9015,0.3707466,0,1,0.47971,0.169244,4.285714,4.36,0,24.0,14.0,30.0,9.0,-0.074286
3,PP9044,0.3589495,0,0,0.473108,0.140112,3.75,4.56,0,36.0,6.0,21.0,6.0,-0.81
4,PP9048,1.0,1,1,0.457621,0.31978,4.107143,3.52,0,44.0,8.0,25.0,5.0,0.587143
5,PP9012,0.7483412,1,1,0.036686,0.435534,3.678571,3.92,0,32.0,5.0,41.0,5.0,-0.241429
6,PP9006,1e-07,0,0,0.402852,0.345285,3.5,4.16,0,61.0,33.0,51.0,21.0,-0.66
7,PP9034,1.0,1,0,0.954828,0.362378,4.214286,4.76,0,35.0,5.0,35.0,5.0,-0.545714
8,PP9038,0.6138874,1,1,0.260051,0.61211,3.321429,4.64,0,48.0,16.0,64.0,16.0,-1.318571
9,PP9004,0.6744051,1,0,0.12905,0.294755,4.0,4.32,0,52.0,26.0,82.0,29.0,-0.32


In [266]:
test = pref
test['RQ_diff_cat'] = pd.cut(pref['RQ_diff'], 3)
test

Unnamed: 0,ID,alphaSGT,pref,sex,simParDist,simFriDist,parRQ,friRQ,out,parMemLen,parAdjLen,friMemLen,friAdjLen,RQ_diff,RQ_diff_cat
0,PP9039,0.03679717,0,1,0.925367,0.887572,4.214286,4.76,0,63.0,17.0,53.0,12.0,-0.545714,"(-1.032, -0.0224]"
1,PP9001,0.6755337,1,1,0.571023,0.291086,3.857143,4.32,0,20.0,3.0,19.0,5.0,-0.462857,"(-1.032, -0.0224]"
2,PP9015,0.3707466,0,1,0.47971,0.169244,4.285714,4.36,0,24.0,14.0,30.0,9.0,-0.074286,"(-1.032, -0.0224]"
3,PP9044,0.3589495,0,0,0.473108,0.140112,3.75,4.56,0,36.0,6.0,21.0,6.0,-0.81,"(-1.032, -0.0224]"
4,PP9048,1.0,1,1,0.457621,0.31978,4.107143,3.52,0,44.0,8.0,25.0,5.0,0.587143,"(-0.0224, 0.987]"
5,PP9012,0.7483412,1,1,0.036686,0.435534,3.678571,3.92,0,32.0,5.0,41.0,5.0,-0.241429,"(-1.032, -0.0224]"
6,PP9006,1e-07,0,0,0.402852,0.345285,3.5,4.16,0,61.0,33.0,51.0,21.0,-0.66,"(-1.032, -0.0224]"
7,PP9034,1.0,1,0,0.954828,0.362378,4.214286,4.76,0,35.0,5.0,35.0,5.0,-0.545714,"(-1.032, -0.0224]"
8,PP9038,0.6138874,1,1,0.260051,0.61211,3.321429,4.64,0,48.0,16.0,64.0,16.0,-1.318571,"(-2.044, -1.032]"
9,PP9004,0.6744051,1,0,0.12905,0.294755,4.0,4.32,0,52.0,26.0,82.0,29.0,-0.32,"(-1.032, -0.0224]"
