In [2]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt

In [4]:
#### Game Plan

#1 correlate the length of memories/adjectives with relationship satisfaction and behavioral preference indices

#2 see if we can predict labels (parent/friend) from memories/adjectives

#3 run NMF (non-negative matrix factorization) on the data to see what latent topics emerge, separately for parents and friends

#4 score each individual on topics according to word embeddings and see whether that predicts prefs

#5 use similarity between word embeddings to create networks (overall/topic-focused) and test for differences between parents and friends

#6 look at differences between semantic networks, relate to prefs

In [3]:
# Read the three input tables from CSV files in the working directory.
pref, parentText, friendText = (
    pd.read_csv(csvName)
    for csvName in ("pref.csv", "parentText.csv", "friendText.csv")
)

# Print each table in load order so its columns and rows can be inspected.
for table in (pref, parentText, friendText):
    print(table)

        ID      alphaSGT  pref  sex  simParDist  simFriDist     parRQ  friRQ  \
0   PP9000  3.679717e-02     0    1    0.925367    0.887572  4.214286   4.76   
1   PP9001  6.755337e-01     1    1    0.571023    0.291086  3.857143   4.32   
2   PP9002  3.707466e-01     0    1    0.479710    0.169244  4.285714   4.36   
3   PP9003  3.589495e-01     0    0    0.473108    0.140112  3.750000   4.56   
4   PP9004  1.000000e+00     1    1    0.457621    0.319780  4.107143   3.52   
5   PP9005  7.483412e-01     1    1    0.036686    0.435534  3.678571   3.92   
6   PP9006  1.000000e-07     0    0    0.402852    0.345285  3.500000   4.16   
7   PP9007  1.000000e+00     1    0    0.954828    0.362378  4.214286   4.76   
8   PP9008  6.138874e-01     1    1    0.260051    0.612110  3.321429   4.64   
9   PP9009  6.744051e-01     1    0    0.129050    0.294755  4.000000   4.32   
10  PP9010  1.722825e-01     0    0    0.863174    0.694477  4.035714   4.40   
11  PP9011  9.174529e-01     1    1    0

In [4]:
# Word counts per participant: one row per text field, one column per participant.
#   row 0 = parentText column 1, row 1 = parentText column 2,
#   row 2 = friendText column 1, row 3 = friendText column 2
# (the merge cell further down names these parMemLen, parAdjLen, friMemLen, friAdjLen).
nSubjects = len(pref["ID"])

# Rows of the three tables are matched purely by position, so fail early with a
# clear message if they differ in length instead of hitting an IndexError mid-loop
# or a length mismatch when the counts are merged back into pref.
assert len(parentText) == nSubjects and len(friendText) == nSubjects, \
    "pref, parentText and friendText must have the same number of rows"

countDat = np.zeros((4, nSubjects))
for s in range(nSubjects):
    # TextBlob(...).words tokenises the text; its length is the word count.
    countDat[0, s] = len(TextBlob(parentText.iloc[s, 1]).words)
    countDat[1, s] = len(TextBlob(parentText.iloc[s, 2]).words)
    countDat[2, s] = len(TextBlob(friendText.iloc[s, 1]).words)
    countDat[3, s] = len(TextBlob(friendText.iloc[s, 2]).words)
    
    
# mem = TextBlob(parentText.iloc[0,1])
# len(mem.words)

In [5]:
# Display the 4-row word-count matrix filled in by the previous cell.
countDat

array([[63., 20., 24., 36., 44., 32., 61., 35., 48., 52., 32., 25., 11.,
        41., 38., 35., 69., 29., 16., 40., 54., 31., 35., 45., 34., 48.,
        22., 48., 13., 21., 46., 16., 14., 54., 47., 51., 28., 12., 45.,
        36., 25., 32., 80., 54., 31., 49.,  9., 35., 23.],
       [17.,  3., 14.,  6.,  8.,  5., 33.,  5., 16., 26.,  6.,  4.,  6.,
         3.,  5.,  8., 59.,  7.,  6.,  9., 11.,  5.,  6., 21., 22.,  6.,
         9.,  5.,  5.,  4., 13.,  3.,  8., 24., 29., 23.,  7.,  5., 10.,
        17., 10.,  4., 10.,  8.,  3., 20.,  7., 58., 11.],
       [53., 19., 30., 21., 25., 41., 51., 35., 64., 82., 31., 20., 15.,
        69., 69., 35., 37., 34., 46., 53., 45., 32., 34., 60., 35., 41.,
        13., 40., 15., 12., 54., 26., 30., 53., 47., 66., 28., 14., 59.,
        20., 25., 34., 96., 46., 46., 53., 16., 39., 51.],
       [12.,  5.,  9.,  6.,  5.,  5., 21.,  5., 16., 29.,  6.,  4.,  8.,
         3.,  4.,  6., 25.,  5.,  5., 10.,  9.,  4.,  5., 25., 16.,  6.,
         8.,  3.,  4

In [6]:
# Attach the four word-count rows of countDat to the preference table, keyed on ID.
# IDs are taken from parentText because countDat columns follow the row order of
# parentText/friendText.
lenCols = ['parMemLen', 'parAdjLen', 'friMemLen', 'friAdjLen']
lengths = pd.DataFrame({'ID': parentText["ID"],
                        **{col: countDat[row, :] for row, col in enumerate(lenCols)}})

# Drop length columns left over from a previous run of this cell; otherwise
# re-running it merges them a second time and pandas renames them to
# parMemLen_x / parMemLen_y, which breaks the column selections in later cells.
pref = pd.merge(pref.drop(columns=lenCols, errors="ignore"), lengths, on="ID")

In [7]:
# Pairwise correlations between alphaSGT, parRQ, friRQ and the four text-length
# columns, shown as a colour-graded table.
# (Earlier alternative: plt.matshow(<same correlation matrix>) followed by plt.show().)
corrCols = ['alphaSGT', 'parRQ', 'friRQ', 'parMemLen', 'parAdjLen', 'friMemLen', 'friAdjLen']
corrTable = pref[corrCols].corr()
corrTable.style.background_gradient(cmap='coolwarm')

Unnamed: 0,alphaSGT,parRQ,friRQ,parMemLen,parAdjLen,friMemLen,friAdjLen
alphaSGT,1.0,-0.083006,-0.215394,-0.245346,-0.227697,-0.110367,-0.234052
parRQ,-0.083006,1.0,0.367806,-0.180139,-0.0447143,-0.085258,-0.0830715
friRQ,-0.215394,0.367806,1.0,-0.157202,-0.040946,-0.0270965,0.000797512
parMemLen,-0.245346,-0.180139,-0.157202,1.0,0.473149,0.716936,0.397005
parAdjLen,-0.227697,-0.0447143,-0.040946,0.473149,1.0,0.245295,0.87135
friMemLen,-0.110367,-0.085258,-0.0270965,0.716936,0.245295,1.0,0.34276
friAdjLen,-0.234052,-0.0830715,0.000797512,0.397005,0.87135,0.34276,1.0


In [20]:
# Build (text, label) pairs for the two text columns, alternating one 'par' entry
# and one 'fri' entry per row position; column 1 feeds allMem, column 2 feeds allAdj.
allMem, allAdj = [], []

for row in range(len(pref['ID'])):
    allMem.extend([
        (parentText.iloc[row, 1], 'par'),
        (friendText.iloc[row, 1], 'fri'),
    ])
    allAdj.extend([
        (parentText.iloc[row, 2], 'par'),
        (friendText.iloc[row, 2], 'fri'),
    ])

In [40]:
# NOTE(review): this cell is fully commented out, yet a later cell iterates over
# `splits.split(allMem)`. On a fresh kernel `splits` is undefined until the
# ShuffleSplit import and assignment below are restored.
# The TypeError recorded under this cell comes from `allMem[train_index]`: allMem is
# a plain Python list and cannot be indexed with an index array; select the rows
# one at a time instead, e.g. `[allMem[i] for i in train_index]`.
# from sklearn.model_selection import ShuffleSplit
# splits = ShuffleSplit(n_splits = 5, test_size=.2, random_state=0)
# #splits.get_n_splits(allMem)
# for train_index, test_index in splits.split(allMem):
#     #print("TRAIN:", train_index)
#     print(allMem[train_index])
#     #print("TEST:", test_index)

TypeError: only integer scalar arrays can be converted to a scalar index

In [37]:
from textblob.classifiers import NaiveBayesClassifier

In [39]:
# Train a Naive Bayes classifier on each training split of the labelled memory texts.
# NOTE(review): `splits` is not defined in any active cell above (its ShuffleSplit
# definition is commented out), so this cell relies on leftover kernel state.
scoresMem = []
scoresAjd = []  # NOTE(review): probably meant "scoresAdj"; name kept in case other cells use it

for train_index, test_index in splits.split(allMem):
    # allMem is a plain Python list, so indexing it with an index array raises
    # "TypeError: only integer scalar arrays can be converted to a scalar index";
    # pick the training rows one at a time instead.
    trainMem = [allMem[i] for i in train_index]
    cl = NaiveBayesClassifier(trainMem)
    # TODO: score `cl` on the test_index rows and append the result to scoresMem.
    

0
1
2
3
4
