In [None]:
#             Notebook #3
#                       Join data and perform statistical tests. 
#

#  This Notebook will:
#  1. Read session scores from the Affiliation data, LIWC category counts,
#     and Part of Speech frequencies and transitions.
#  2. Perform feature selection and do some statistical tests.
#  3. Plot some results
#
#  Caveat Lector: The notebook is structured for our file system structure and input data types. It will
#                 require some refactoring to run on other structures.

In [None]:
#  Imports
import os
import ssl
import io
import sys
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.cross_decomposition import CCA
from sklearn.preprocessing import StandardScaler
from sklearn import feature_selection
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFECV
from statsmodels.stats.multitest import fdrcorrection
from sklearn.decomposition import FastICA, PCA

from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn import preprocessing


 #    Options
# `display` and `HTML` belong to the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Let the notebook cells use the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Never truncate columns or rows when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
#Input variables
# --- Inputs -----------------------------------------------------------------
# Per-session clinical scores and the LIWC results for the split transcripts.
dyadScoresFileName = '/Users/Heisig/Jihan/Results/clinical_scores_dyad.csv'
liwcTranscriptFileName = '/Users/Heisig/Jihan/LIWC/LIWC2015_Results_SplitTranscripts.csv'

# One CSV per target phrase; they are merged on Speaker/Session when read.
targetFilenames = [
    '/Users/Heisig/Jihan/Results/target_you_know.csv',
    '/Users/Heisig/Jihan/Results/target_so.csv',
    '/Users/Heisig/Jihan/Results/target_i_said.csv',
    '/Users/Heisig/Jihan/Results/target_you_said.csv',
]

# Remaining feature tables.
honoreBrunetFileName = '/Users/Heisig/Jihan/Results/honoreBrunetDF.csv'
pronounFileName = '/Users/Heisig/Jihan/Results/allPronounDF.csv'
ngramFileName = '/Users/Heisig/Jihan/Results/ngramFreqNorms.csv'
posTranFileName = '/Users/Heisig/Jihan/Results/allPOStransDF.csv'
allEmbeddingFileName = '/Users/Heisig/Jihan/Results/AllEmbeddings.csv'

# Cosine-similarity feature tables.
cosInterFileName = '/Users/Heisig/Jihan/Results/interCosDF.csv'
cosIndFileName = '/Users/Heisig/Jihan/Results/indCosDF.csv'
cosBothFileName = '/Users/Heisig/Jihan/Results/bothCosDF.csv'

# --- Outputs ----------------------------------------------------------------
coerrFileName = '/Users/Heisig/Jihan/Results/coerrCoeffs.csv'
liwcDyadFileName = '/Users/Heisig/Jihan/Results/liwcDyadData.csv'
metricsFileName = '/Users/Heisig/Jihan/Results/allMetricsPvals.csv'

In [None]:
   # Report the installed statsmodels version. importlib.metadata is the
   # standard-library replacement for the deprecated pkg_resources API.
   import importlib.metadata
   importlib.metadata.version("statsmodels")

In [None]:
#   Functions

def plot_coercoeff(ccaDF):
    """Draw a tall heatmap of a coefficient table on the current figure.

    Parameters
    ----------
    ccaDF : pandas.DataFrame
        Table to plot. Its index supplies the row tick labels and its columns
        the column tick labels. The colour scale is fixed to the range 0..1.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Side effects: resizes the current matplotlib figure to 4 x 36 inches and
    sets the global seaborn font scale to 0.8.
    """
    fig = plt.gcf()
    fig.set_size_inches(4, 36)
    sns.set(font_scale=0.8)
    g = sns.heatmap(
        ccaDF,
        xticklabels=ccaDF.columns,
        yticklabels=ccaDF.index,
        vmin=0,
        vmax=1,
        cmap="YlOrRd")
    # Put the column labels above the heatmap and rotate them so they fit.
    g.xaxis.set_label_position('top')
    g.xaxis.tick_top()
    plt.xticks(rotation=90)
    g.set_title("CCA Coeff")
    # tight_layout only arranges axes that already exist, so it has to run
    # after the heatmap (and its tick labels) have been created.
    plt.tight_layout()
    plt.show()
 

def ftestRegressionVariable(analyzeDF,responseVariable):
    """Run a univariate F-test of every column of ``analyzeDF`` against a response.

    Parameters
    ----------
    analyzeDF : pandas.DataFrame
        Candidate features, one per column.
    responseVariable : array-like
        Response values, one per row of ``analyzeDF``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``F-TestScore`` and ``FT-p-val``, sorted by
        ascending p-value and re-indexed from 0. No p-value cutoff is applied:
        all features are returned so that an FDR correction can be done later.
    """
    print('ftestRegressionVariable analyzeDF.columns: \n',list(analyzeDF.columns))

    # k='all' keeps every feature; SelectKBest is used only to get the statistics.
    selector = feature_selection.SelectKBest(score_func=feature_selection.f_regression, k='all')
    fitted = selector.fit(analyzeDF, responseVariable)

    statsDF = pd.DataFrame({
        'feature': analyzeDF.columns.tolist(),
        'F-TestScore': fitted.scores_,
        'FT-p-val': fitted.pvalues_,
    })

    # Most significant features first, with a clean 0..n-1 index.
    return statsDF.sort_values(by=['FT-p-val'], axis=0).reset_index(drop=True)

In [None]:
#   Read the various features and scores into a single dataframe
#
#   Features:
#        LIWC
#        Target phrase data (from Jihan POS)
#        Pronouns
#        POS
#        Custom structures

# Load the per-session clinical (dyad) scores.
dyadDF = pd.read_csv(dyadScoresFileName)
print('dyadDF.shape: \n',dyadDF.shape)

# The feature tables key on 'Session', so rename the 'Date' column to match.
# The 'Therapist' and 'Patient' columns are removed; 'Diagnosis' is kept.
dyadDF = (
    dyadDF
    .rename(columns={'Date': 'Session'})
    .drop(columns=['Therapist', 'Patient'])
)
print('dyadDF.shape: \n',dyadDF.shape)

# Column names of the clinical scores (narrowed after the final merge).
dyadCols = dyadDF.columns

#Read the embedding file (optional input)
if os.path.isfile(allEmbeddingFileName):
    allEmbeddingsDF = pd.read_csv(allEmbeddingFileName)
else:
    # Bind the name even when the file is missing so that later code can test
    # `allEmbeddingsDF is None` instead of failing with a NameError.
    allEmbeddingsDF = None
    print('Embedding file not found: ', allEmbeddingFileName)

# Read the target-phrase tables (one CSV per phrase) and join them into a
# single table keyed on Speaker and Session. The raw sentence list column is
# not needed for the statistics and is dropped from each file.
targetDF = pd.read_csv(targetFilenames[0], sep=',').drop(columns=['SentenceList'])
for targetFilename in targetFilenames[1:]:
    target2DF = pd.read_csv(targetFilename, sep=',').drop(columns=['SentenceList'])
    targetDF = targetDF.merge(target2DF, on=['Speaker','Session'])
    print('targetDF.shape: \n',targetDF.shape)

# Split by speaker; each subset gets a fresh 0..n-1 index.
therapistTargetDF = targetDF.loc[targetDF['Speaker']=='therapist'].reset_index(drop=True)
print('therapistTargetDF.shape: \n',therapistTargetDF.shape)
patientTargetDF = targetDF.loc[targetDF['Speaker']=='patient'].reset_index(drop=True)
print('patientTargetDF.shape: \n',patientTargetDF.shape)

def _read_speaker_features(fileName, allName, therapistName, patientName):
    """Read a feature CSV and split it into therapist rows and patient rows.

    Parameters
    ----------
    fileName : str
        Path of a comma-separated file that has a 'Speaker' column.
    allName, therapistName, patientName : str
        Labels used when printing the shape of the full table and of the two
        per-speaker subsets.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(allDF, therapistRowsDF, patientRowsDF)``. The subsets contain the
        rows whose 'Speaker' is 'therapist' / 'patient', re-indexed from 0.
    """
    allDF = pd.read_csv(fileName, sep=',')
    print(allName + '.shape: \n', allDF.shape)
    therapistRowsDF = allDF[allDF['Speaker'] == 'therapist'].reset_index(drop=True)
    print(therapistName + '.shape: \n', therapistRowsDF.shape)
    patientRowsDF = allDF[allDF['Speaker'] == 'patient'].reset_index(drop=True)
    print(patientName + '.shape: \n', patientRowsDF.shape)
    return allDF, therapistRowsDF, patientRowsDF


#Read Honore Brunet
hbDF, therapistHbDF, patientHbDF = _read_speaker_features(
    honoreBrunetFileName, 'hbDF', 'therapistHbDF', 'patientHbDF')

#Read Pronoun features
pnDF, therapistPnDF, patientPnDF = _read_speaker_features(
    pronounFileName, 'pnDF', 'therapistPnDF', 'patientPnDF')

#Read POS transition features
posDF, therapistPosDF, patientPosDF = _read_speaker_features(
    posTranFileName, 'posDF', 'therapistPosDF', 'patientPosDF')

#Read ngram features
# (the therapist subset is now reported by shape, like every other table,
#  instead of dumping the whole frame)
ngramDF, therapistNgramDF, patientNgramDF = _read_speaker_features(
    ngramFileName, 'ngramDF', 'therapistNgramDF', 'patientNgramDF')


#                  Cosine Similarity features

#Read the "inter" cosine features
cosInterDF, therapistInterCosDF, patientInterCosDF = _read_speaker_features(
    cosInterFileName, 'cosInterDF', 'therapistInterCosDF', 'patientInterCosDF')

#Read the "both" cosine features
# (patientBothDF used to be filtered on 'therapist' by mistake; the helper
#  selects the 'patient' rows for it)
cosBothDF, therapistBothDF, patientBothDF = _read_speaker_features(
    cosBothFileName, 'cosBothDF', 'therapistBothDF', 'patientBothDF')

#Read the "ind" cosine features (kept under the historical posInd* names)
posIndDF, therapistIndDF, patientIndDF = _read_speaker_features(
    cosIndFileName, 'posIndDF', 'therapistIndDF', 'patientIndDF')

#Read LIWC category percentages
liwcDF = pd.read_csv(liwcTranscriptFileName,sep=',')
print('liwcDF.shape: \n',liwcDF.shape)
liwcDF.sort_values(by=['Filename'],inplace=True)
liwcDF.reset_index(inplace=True,drop=True)

#LIWC fixup and concatenation of other feature DFs
# --- Patient ---
patientDF = liwcDF[liwcDF['Filename'].str.contains('patient')].copy()
patientDF.reset_index(inplace=True,drop=True)
# The session id is the part of the file name before the first underscore.
# `n` must be passed by keyword: recent pandas no longer accepts it positionally.
SessionSeriesDF = patientDF['Filename'].str.split('_', n=1, expand=True)
patientDF['Session'] = SessionSeriesDF[0]
patientDF.drop('Filename', axis=1, inplace=True)
patientDF['Session'] = patientDF['Session'].astype('int64')
# Attach every per-speaker feature table on Session (inner joins).
for featureDF in (patientTargetDF, patientHbDF, patientPnDF, patientPosDF, patientNgramDF):
    patientDF = pd.merge(patientDF, featureDF.drop(['Speaker'], axis=1), on='Session')
# Prefix all feature columns so they stay distinguishable after the
# patient/therapist merge; the join key keeps its plain name.
patientDF = patientDF.add_prefix('Patient_')
patientDF.rename(columns = {'Patient_Session':'Session'},inplace=True)
print('patientDF: \n',patientDF.shape)

# --- Therapist (same steps) ---
therapistDF = liwcDF[liwcDF['Filename'].str.contains('therapist')].copy()
therapistDF.reset_index(inplace=True,drop=True)
SessionSeriesDF = therapistDF['Filename'].str.split('_', n=1, expand=True)
therapistDF['Session'] = SessionSeriesDF[0]
therapistDF.drop('Filename', axis=1, inplace=True)
therapistDF['Session'] = therapistDF['Session'].astype('int64')
for featureDF in (therapistTargetDF, therapistHbDF, therapistPnDF, therapistPosDF, therapistNgramDF):
    therapistDF = pd.merge(therapistDF, featureDF.drop(['Speaker'], axis=1), on='Session')
therapistDF = therapistDF.add_prefix('Therapist_')
therapistDF.rename(columns = {'Therapist_Session':'Session'},inplace=True)
print('therapistDF.shape: \n',therapistDF.shape)

#Merge the two datasets on session (this needs to be more unique to avoid multiple sessions in a day issues)
#Doing this to make sure we are really aligned before splitting into X and Y
totalLIWCDF = pd.merge(patientDF,therapistDF, on='Session')
totalLIWCDF['Session'] = totalLIWCDF['Session'].astype('int64')
print('totalLIWCDF.shape: \n',totalLIWCDF.shape)

# Attach the clinical scores; sessions missing from either side are dropped
# because pd.merge defaults to an inner join.
totalDF = pd.merge(totalLIWCDF, dyadDF, on='Session')

totalDF.set_index('Session', inplace = True)
print('totalDF: ',totalDF.shape)

#Update col lists to reflect dropped columns in totalDF
# Keep the original column order: intersecting two sets would return the
# names in an arbitrary order that can change from run to run.
dyadCols = [col for col in dyadCols if col in totalDF.columns]
liwcCols = list(totalLIWCDF.columns)

# Persist the joined table; 'Session' is written as the first column.
totalDF.to_csv(liwcDyadFileName)

In [None]:
# Start with the joined DF created in this NoteBook: Jihan LIWC per Turn and Stanza Testing
 
# Re-read the joined LIWC / clinical / text table written by the previous
# cell. No columns are dropped from it here.
fileName = liwcDyadFileName
totalDF = pd.read_csv(fileName)
print('totalDF.shape: \n',totalDF.shape)

# Re-read the dyad scores to get the list of clinical column names.
clinicalDF = pd.read_csv(dyadScoresFileName)
print('clinicalDF.shape: \n',clinicalDF.shape)
print('clinicalDF: \n',clinicalDF)

# Remove the session key ('Date' in the file). Unlike dyadDF, the Therapist,
# Patient and Diagnosis columns are left in place.
clinicalDF = clinicalDF.rename(columns={'Date': 'Session'}).drop(columns='Session')
print('clinicalDF.shape: \n',clinicalDF.shape)
print('clinicalDF: \n',clinicalDF)

clinicalCols = clinicalDF.columns
clinicalCols

In [None]:
# Display the (rows, columns) shape of totalDF as the cell output.
totalDF.shape

In [None]:

#                                              Find the first X PCs of the most differentially expressed features 
components = 5

# PCA input: the POS-transition table without its two key columns.
# (This frame was previously referenced but never defined.)
inputFeatureDF = posDF.drop(['Session','Speaker'], axis=1)

pca = PCA(n_components=components)
pca.fit(inputFeatureDF)
#Show explained variance
print('PC explained_variance_ratios: \n',pca.explained_variance_ratio_)
print('PC singular_values: \n',pca.singular_values_)
print('PC components: \n',pca.components_)

# One column name per component (MIPC1..MIPCn) so the labels always match
# `components`; a fixed list of three names fails for any other value.
pcColumns = ['MIPC' + str(pcNum + 1) for pcNum in range(components)]
# Reuse posDF's index so the column-wise concat lines the rows up.
pcDF = pd.DataFrame(pca.transform(inputFeatureDF), columns=pcColumns, index=posDF.index)
finalDF = pd.concat([posDF, pcDF], axis=1)


In [None]:
#Setup variables for CCA
# Names of the LIWC output columns used as CCA inputs, in file order. The
# 'Patient_' / 'Therapist_' prefixes are added further down in this cell.
liwcCols = [
    'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
    'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron',
    'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
    'verb', 'adj', 'compare', 'interrog', 'number', 'quant',
    'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad',
    'social', 'family', 'friend', 'female', 'male',
    'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'differ',
    'percept', 'see', 'hear', 'feel',
    'bio', 'body', 'health', 'sexual', 'ingest',
    'drives', 'affiliation', 'achieve', 'power', 'reward', 'risk',
    'focuspast', 'focuspresent', 'focusfuture',
    'relativ', 'motion', 'space', 'time',
    'work', 'leisure', 'home', 'money', 'relig', 'death',
    'informal', 'swear', 'netspeak', 'assent', 'nonflu', 'filler',
    'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam',
    'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP',
]

# Encode the categorical clinical columns as integer codes.
# The labels are sorted before numbering: iterating a bare set of strings
# yields an order that can change between kernel restarts, which would change
# the codes (and every statistic computed from them) from run to run.
# Strings sort after non-strings, so a column that is already integer-coded
# maps onto itself when this runs again.
diags = sorted(set(totalDF['Diagnosis']), key=lambda label: (isinstance(label, str), label))
diagDict = {k: v for v, k in enumerate(diags)}
totalDF.replace({"Diagnosis": diagDict},inplace=True)
modes = sorted(set(totalDF['Modality']), key=lambda label: (isinstance(label, str), label))
modeDict = {k: v for v, k in enumerate(modes)}
totalDF.replace({"Modality": modeDict},inplace=True)

# LIWC columns exist once per speaker, prefixed 'Patient_' / 'Therapist_'.
patientLiwcCols = ['Patient_' + liwcCat for liwcCat in liwcCols]
therapistLiwcCols = ['Therapist_' + liwcCat for liwcCat in liwcCols]
allLiwcCols = patientLiwcCols+therapistLiwcCols
print('allLiwcCols: \n',allLiwcCols)

# Drop every column that contains a missing value.
totalDF.dropna(axis='columns',inplace=True)
print('totalDF.shape: ',totalDF.shape)

# X: the LIWC features that survived; Y: the clinical scores that survived.
X = totalDF[totalDF.columns.intersection(allLiwcCols)]
print('X.shape: ',X.shape)
Xcols = list(X.columns)
print('X: \n',X)
Y = totalDF[totalDF.columns.intersection(clinicalCols)]
print('Y.shape: ',Y.shape)
Ycols = list(Y.columns)
print('Y: \n',Y)

# Explicit standardisation is currently disabled; to re-enable it:
#sc = StandardScaler(with_mean=True, with_std=True)
#X = pd.DataFrame(sc.fit_transform(X), columns = Xcols)
#Y = pd.DataFrame(sc.fit_transform(Y), columns = Ycols)

In [None]:
# List the clinical score column names, one per line.
for x in clinicalCols:
    print(x)

In [None]:
# Display the current (rows, columns) shape of totalDF as the cell output.
totalDF.shape

In [None]:
#                Finally ready to try the CCA
#components is the reduced number of components to project to
components = 2
cca = CCA(n_components=components)

print('Original data:')
print('   X.shape: ',X.shape)
print('   Y.shape: ',Y.shape)

#Train
cca.fit(X, Y)

#Projection of original data into reduced dimensions
X_red, Y_red = cca.transform(X, Y)
print('Reduced dimension data:')
print('X_red.shape: ',X_red.shape)
print('Y_red.shape: ',Y_red.shape)
print('type(X_red): ',type(X_red))

# Name the projected columns X1..Xn / Y1..Yn from `components`, so changing
# the number of components does not break the frame construction.
reducedXDF = pd.DataFrame(data=X_red,columns=['X' + str(compNum + 1) for compNum in range(components)])
reducedYDF = pd.DataFrame(data=Y_red,columns=['Y' + str(compNum + 1) for compNum in range(components)])
reducedDF = pd.concat([reducedXDF, reducedYDF], axis=1)
print(reducedDF)

# Coefficients of the fitted model relating X to Y (not a covariance matrix).
# scikit-learn changed the orientation of `coef_` between releases:
# (n_features, n_targets) in older versions, (n_targets, n_features) in newer
# ones. Orient it features x targets so that the labels below always fit.
ccaCoef = np.asarray(cca.coef_)
featuresByTargets = (X.shape[1], Y.shape[1])
if ccaCoef.shape != featuresByTargets and ccaCoef.T.shape == featuresByTargets:
    ccaCoef = ccaCoef.T
print(ccaCoef)
coeffDF = pd.DataFrame(ccaCoef,index=X.columns,columns=Y.columns)
HTML (coeffDF.to_html())

In [None]:
#                Plot the reduced dimensions
# One explicit figure/axes pair. The figure used to be sized twice (only the
# second size, 16 x 6, took effect).
fig, ax = plt.subplots(figsize=(16, 6))
fig.subplots_adjust(bottom=0.15)

# Columns of reducedDF to plot against each other.
xVar = 'X2'
yVar = 'Y2'

# There is no hue grouping, so there is nothing to put in a legend; the
# earlier ax.legend(...) call only produced a "no labelled artists" warning.
sns.scatterplot(data = reducedDF,
                x=xVar,
                y=yVar,
                alpha=1.0,
                ax=ax)
ax.set_title('CCA reduced dimensions: ' + xVar + ' vs ' + yVar)

plt.show()

In [None]:
#               Weights, Loadings, Scores
#The 'weights' are the correlations for each variable in each component dimension 
#print('cca.x_weights: \n',cca.x_weights_)
# Label the weight rows with Xcols / Ycols, the column lists captured when X
# and Y were built for the CCA. coeffDF is not a safe source of labels: a
# later cell sorts it in place, so re-running this cell afterwards would
# attach the wrong feature names to the weights.
numWeightComponents = cca.x_weights_.shape[1]
xWeightCols = ['cca.x_weights_component_' + str(compNum + 1) for compNum in range(numWeightComponents)]
yWeightCols = ['cca.y_weights_component_' + str(compNum + 1) for compNum in range(numWeightComponents)]

weights_X_DF = pd.DataFrame(data=cca.x_weights_,columns=xWeightCols,index=Xcols)
weights_Y_DF = pd.DataFrame(data=cca.y_weights_,columns=yWeightCols,index=Ycols)

# Order both tables by their weight on the first component.
weights_X_DF.sort_values(by=['cca.x_weights_component_1'],inplace=True)
weights_Y_DF.sort_values(by=['cca.y_weights_component_1'],inplace=True)

print(weights_X_DF)
print(weights_Y_DF)

In [None]:
# Largest 'Patient Alliance' coefficients first, then write the table to disk.
coeffDF = coeffDF.sort_values(by='Patient Alliance', ascending=False)
coeffDF.to_csv(coerrFileName)

In [None]:
# Render the coefficient table as a heatmap.
plot_coercoeff(coeffDF)

In [None]:
#TherapistAllianceStatsDF = ftestRegressionVariable(ftestDF.drop('Therapist Alliance', axis=1),ftestDF['Therapist Alliance'])

In [None]:
#TherapistAllianceStatsDF

In [None]:
#AvoidanceStatsDF = ftestRegressionVariable(ftestDF.drop('Avoidance', axis=1),ftestDF['Avoidance'])

In [None]:
#Run an f-test regression against all the clinical variables
totalDF.dropna(axis='columns',inplace=True)

# Encode the categorical columns as integer codes. The labels are sorted
# before numbering so the codes are the same on every run (iterating a bare
# set of strings is not order-stable across kernel restarts). Strings sort
# after non-strings, so an already integer-coded column maps onto itself.
diags = sorted(set(totalDF['Diagnosis']), key=lambda label: (isinstance(label, str), label))
diagDict = {k: v for v, k in enumerate(diags)}
totalDF.replace({"Diagnosis": diagDict},inplace=True)
modes = sorted(set(totalDF['Modality']), key=lambda label: (isinstance(label, str), label))
modeDict = {k: v for v, k in enumerate(modes)}
totalDF.replace({"Modality": modeDict},inplace=True)

metricsToRegress = ['Patient Alliance', 'Therapist Alliance', 'Total Alliance', 'Alliance Difference', 'Diagnosis', 'Modality', 'Goal', 'Task', 'Bond', 'Closeness', 'Dependence', 'Anxiety', 'Avoidance', 'Number of Visits', 'T_Intrusiveness', 'P_Intrusiveness', 'Age']

# F-test every remaining column against each metric in turn. The per-metric
# tables are collected in a list and concatenated once at the end.
metricFrames = []
for metric in metricsToRegress:
    print('metric: ',metric)
    if metric not in totalDF.columns:
        # The column can be missing, e.g. removed by dropna above because it
        # contained a missing value.
        print('   skipped, column not present in totalDF: ',metric)
        continue
    statsDF = ftestRegressionVariable(totalDF.drop([metric], axis=1),totalDF[metric])
    statsDF.insert(0, 'metric', metric)
    metricFrames.append(statsDF)

if metricFrames:
    metricsDF = pd.concat(metricFrames, axis=0).reset_index(drop=True)
else:
    metricsDF = pd.DataFrame()

# NOTE: this rebinds metricsFileName, replacing the value set in the
# configuration cell at the top of the notebook.
metricsFileName = '/Users/Heisig/Jihan/Results/metricsFtest.csv'
metricsDF.to_csv(metricsFileName)

In [None]:
# Display the combined F-test table (one row per metric/feature pair).
metricsDF

In [None]:
#                               Feature Selection using LASSO
#     Setup variables for classification
# Candidate features: the 'feature' entries in rows 6..10 of the F-test table
# (label-based slice, both ends included).
# NOTE(review): the row range is hard-coded; presumably rows 0-5 are skipped
# on purpose - confirm against the current metricsDF.
testCols = metricsDF.loc[6:10,'feature']
xDF = totalDF[testCols]

# Make sure no clinical score is used as a predictor.
X = xDF.drop(columns=clinicalCols, errors='ignore')
print(X.shape)
y = totalDF['Patient Alliance']
print(len(y))

# Recursive feature elimination with 5-fold cross-validation around a Lasso
# estimator, removing one feature per step.
estimator = Lasso(random_state=0,max_iter=10000)
selector = RFECV(estimator, step=1, cv=5).fit(X, y)
print('selector.support_: ',selector.support_)
print('selector.ranking_: ',selector.ranking_)
print('selector.score(X,y): ',selector.score(X,y))

In [None]:
#      Try OLS Regression
# Ordinary least squares of y on the selected features plus an intercept
# (add_constant appends the constant column).
X2 = sm.add_constant(X)
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

In [None]:
#                                   #Apply FDR correction 

targetMetrics = ['Patient Alliance','Therapist Alliance','Total Alliance','Alliance Difference','Diagnosis','Modality','Age','Anxiety','Closeness']
#targetMetrics = ['Patient Alliance']
#pronounFeatures = ['Therapist_we','Therapist_i','Therapist_shehe','Therapist_you','Therapist_they','Patient_we','Patient_i','Patient_shehe','Patient_you','Patient_they']
# Despite its name, this list mixes pronoun, n-gram and POS-transition features.
pronounFeatures = ['Therapist_and_and','Therapist_we','Patient_when_i','Patient_AUX,INTJ','Therapist_pronoun_we','Patient_CCONJ,CCONJ','Therapist_ADP,ADP','Therapist_i',
                   'Therapist_okay_okay','Therapist_i_do','Patient_ADV,INTJ','Patient_INTJ,PRON','Therapist_CCONJ,CCONJ',
                   'Therapist_i_think','Patient_i','Therapist_yeah_yeah','Therapist_INTJ,SCONJ','Patient_nonflu','Patient_and_and','Patient_PRON,PRON']

featureList = pronounFeatures
p_val_cuttoff = 0.05
numberOfFeatures = len(featureList)
print('numberOfFeatures: ',numberOfFeatures)

# Both values are the same for every metric, so compute them once.
bonferroni_p_val = p_val_cuttoff/numberOfFeatures
featureRowsDF = metricsDF[metricsDF['feature'].isin(featureList)]

for targetMetric in targetMetrics:
    #Remove metrics from features
    print('fdrDF.shape: ',featureRowsDF.shape)
    #Restrict to features for this metric
    # reset_index returns a new, independent frame, so the column assignments
    # below do not write into a slice of metricsDF (SettingWithCopyWarning).
    fdrDF = featureRowsDF.loc[featureRowsDF['metric']==targetMetric].reset_index(drop=True)
    print('fdrDF.shape: ',fdrDF.shape)
    if fdrDF.empty:
        # No F-test rows for this metric (e.g. it was skipped upstream):
        # there is nothing to correct.
        print('No F-test rows for metric, skipped: ',targetMetric)
        continue

    #Do the correction
    # fdrcorrection returns (reject flags, corrected p-values).
    corrected = fdrcorrection(fdrDF['FT-p-val'], alpha=p_val_cuttoff, method='indep', is_sorted=False)

    # Uncorrected copy of the same rows, with a running feature number.
    allFdrDF = fdrDF.copy()
    allFdrDF['featureNum'] = allFdrDF.index

    fdrDF['Survivor'] = corrected[0]
    fdrDF['Corrected-p-val'] = corrected[1]
    fdrDF['featureNum'] = fdrDF.index
    print('bonferroni_p_val: ',bonferroni_p_val)

    fdrFileName = '/Users/Heisig/Jihan/Results/fdrCorrectedMetrics_'+targetMetric+'.csv'
    fdrDF.to_csv(fdrFileName)
   

In [None]:
# Print the feature names in the current FDR table, one per line.
for feat in fdrDF['feature']:
    print(feat)

In [None]:
# Display the second element of the most recent fdrcorrection result (the
# values stored above as 'Corrected-p-val').
corrected[1]

In [None]:
p_val_cuttoff = 0.05
#Remove metrics from features
# Label-based slice: rows 0..55 of the F-test table, both ends included.
# NOTE(review): the row range is hard-coded - confirm it still matches metricsDF.
fdrDF = metricsDF.loc[0:55,:].copy()
print('fdrDF.shape: ',fdrDF.shape)

#Restrict to features for this metric
targetMetric = 'Patient Alliance'
# reset_index returns a new, independent frame, so adding the 'corrected'
# column below cannot raise a SettingWithCopyWarning.
fdrDF = fdrDF.loc[fdrDF['metric']==targetMetric].reset_index(drop=True)
print('fdrDF.shape: ',fdrDF.shape)

#Do the correction
# fdrcorrection returns (reject flags, corrected p-values).
corrected = fdrcorrection(fdrDF['FT-p-val'], alpha=p_val_cuttoff, method='indep', is_sorted=False)
fdrDF['corrected'] = corrected[1]
print(corrected)

fdrFileName = '/Users/Heisig/Jihan/Results/fdrCorrectedMetrics.csv'
fdrDF.to_csv(fdrFileName)

In [None]:
# Display the FDR table built in the previous cell.
fdrDF

In [None]:
# Display the second element of the fdrcorrection result (stored above in the
# 'corrected' column).
corrected[1]

In [None]:
# Export the features listed in rows 6..34 of the F-test table, together with
# the session key, to a CSV file (without the DataFrame index).
testCols = metricsDF.loc[6:34,'feature'].tolist() + ['Session']
xDF = totalDF[testCols]
differentialFeatureFileName = '/Users/Heisig/Jihan/Results/differentialFeatureDF.csv'
xDF.to_csv(differentialFeatureFileName, index=False)

In [None]:
#     Setup variables for classification
#X = totalDF.drop(clinicalCols, axis=1, errors='ignore')
# Features: the 'feature' entries in rows 6..34 of the F-test table, minus any
# clinical score columns.
testCols = metricsDF.loc[6:34,'feature']
xDF = totalDF[testCols]
X = xDF.drop(clinicalCols, axis=1, errors='ignore')
print(X.shape)
y = totalDF['Patient Alliance']
print(len(y))

#Scale
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)

#  Try a linear model to predict
# The features are already standardised above, so the `normalize` argument is
# not passed (it has also been removed from current scikit-learn releases).
model_l = linear_model.LassoLarsCV()
cv = LeaveOneOut()
# Each leave-one-out test fold holds a single sample, for which explained
# variance is not meaningful. Score with the (negated) squared error of each
# held-out prediction instead; values closer to 0 are better.
scores = cross_val_score(model_l, X_scaled, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
scores

In [None]:
# Rebuild the feature subset: the columns of totalDF named in rows 6..34
# (label-based, both ends included) of the F-test table's 'feature' column.
testCols = metricsDF.loc[6:34,'feature']
xDF = totalDF[testCols]

In [None]:
# Display the selected feature columns.
xDF

In [None]:
# Try 5 fold LASSO
# Lasso with the regularisation strength chosen by 5-fold cross-validation.
reg = LassoCV(cv=5, random_state=0, max_iter=10000)
reg.fit(X_scaled, y)
# R**2 of the fitted model on the same data it was trained on.
reg.score(X_scaled, y)

In [None]:
# Drop columns with missing values and, if present, the 'Unnamed: 0' column,
# then F-test every remaining column against 'Dependence'.
totalDF = totalDF.dropna(axis='columns').drop(columns='Unnamed: 0', errors='ignore')
dependenceStatsDF = ftestRegressionVariable(
    totalDF.drop(columns='Dependence'),
    totalDF['Dependence'],
)

In [None]:
#                            Plot two variables
sns.set(style='darkgrid')
fig = plt.gcf()
fig.set_size_inches(32, 8)
plt.gcf().subplots_adjust(bottom=0.15)

# Columns of totalDF to plot. The axis labels are derived from these names so
# that they cannot drift apart from the data: the y axis used to be labelled
# 'Patient Honore' while 'Therapist_Honore' was plotted.
xColumn = 'Patient Alliance'
yColumn = 'Therapist_Honore'    # use 'Patient_Honore' for the patient's values

ax = sns.scatterplot(data=totalDF,
                     y=yColumn,
                     s=75,
                     x=xColumn)

ax.set_xlabel(xColumn,fontsize='xx-large', weight='demi')
ax.set_ylabel(yColumn.replace('_', ' '),fontsize='xx-large', weight='demi')

#The magic setting to get ALL ticks to rotate!!!
plt.setp(ax.get_xticklabels(), rotation=30, fontsize='medium');
plt.show()