# Creating a similarity matrix from behavioral ratings

In [1]:
import pandas as pd 
import numpy as np

### <font color='hotpink'>Preprocess data </font>
Here, we want to compute the average ratings made by CloudResearch participants across a number of variables: how much interpersonal conflict, tension, or violence was present in the scene (each on a scale of 1-7, 7 being the highest amount), whether or not there was a social interaction present in the clip, how many people were present in the clip, how many people were actually interacting in the clip, whether they had seen that particular scene before, and how positive or negative the clip made them feel (1 is very negative and 7 is very positive). 

In [2]:
# Load the cleaned rater responses and discard every row that has a missing value.
df = pd.read_csv('nndb_raters_responses_cleaned.csv').dropna()

In [3]:
# Create dummy variables for 'watched' and 'social'

def dummy_variable(dataframe, column, value_to_replace, replacement_value):
    """Recode one value of a column in place.

    Every cell of ``dataframe[column]`` that equals ``value_to_replace`` is
    overwritten with ``replacement_value``; all other cells are left alone.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify. It is mutated in place.
    column : label
        Column to recode. It is converted with ``str()`` before the lookup.
    value_to_replace : object
        Value to search for (compared with ``==``).
    replacement_value : object
        Value written into the matching cells.

    Returns
    -------
    None
    """
    column = str(column)
    # A boolean mask is used instead of walking range(index.max() + 1): the
    # positional walk raises KeyError as soon as the index has gaps (e.g. after
    # dropna) and cannot handle an empty frame.
    matches = dataframe[column] == value_to_replace
    dataframe.loc[matches, column] = replacement_value

In [4]:
# Recode the yes/no answers of 'social' and 'watched' as 1/0.
# Note the differing capitalisation of the raw answers in the two columns.
_recodings = (
    ("social", 'yes', 1),
    ("social", 'no', 0),
    ("watched", 'Yes', 1),
    ("watched", 'No', 0),
)
for _column, _old_value, _new_value in _recodings:
    dummy_variable(df, _column, _old_value, _new_value)

In [5]:
def group_and_merge(dataframe):
    """Aggregate the ratings to one row per clip.

    'watched', 'social', 'feeling', 'tension', 'conflict' and 'violence' are
    averaged per ``movieID``; 'count_people' and 'count_interactions' are
    summarised by their per-``movieID`` median. The two summaries are then
    left-joined on ``movieID``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the eight rating columns above plus 'movieID'.

    Returns
    -------
    pandas.DataFrame
        One row per ``movieID`` with the mean columns followed by the median
        columns.
    """
    mean_cols = ['watched', 'social', 'feeling', 'tension', 'conflict','violence']
    median_cols = ['count_people','count_interactions']

    clip_means = dataframe[mean_cols + ['movieID']].groupby('movieID').mean().reset_index()
    clip_medians = dataframe[median_cols + ['movieID']].groupby('movieID').median().reset_index()

    return clip_means.merge(clip_medians, how='left', on='movieID')

In [6]:
# One row per clip: mean / median ratings across all raters.
results_df = group_and_merge(df)

#add a column for movie name only 
# The movie name is the part of movieID before the first underscore
# (e.g. "citizenfour_2373_2390" -> "citizenfour"). Done as one vectorised
# string operation rather than a row-by-row .loc loop.
results_df["movie"] = results_df["movieID"].str.split('_').str[0]


'''
Don't re-generate; I added leading zeroes in the movie IDs with timestamps that 
were fewer than 4 integer lengths to allow filenames to be sorted numerically.
'''
# results_df.to_csv("movie_tcv_ratings.csv", index=False)



"\nDon't re-generate; I added leading zeroes in the movie IDs with timestamps that \nwere fewer than 4 integer lengths to allow filenames to be sorted numerically.\n"

### <font color='hotpink'>Calculate inter-rater reliability</font>
We collected a total of 150 participant ratings. Clips were presented randomly, so there is variation among participants in how each clip was rated. In this section, we assess how reliable our clip ratings are using a repeated split-half approach.

In [7]:
import random

In [8]:
# Repeated split-half reliability: on each repetition the raters are randomly
# divided into two halves, each half is aggregated per clip, and the two halves
# are correlated across clips for every rating.
#need to compare by movie! otherwise correlation is very low.

col_list = ['watched', 'social', 'feeling', 'tension', 'conflict','violence','count_people','count_interactions']

#set a random seed to ensure reproducibility 
random.seed(0)

# One list of per-rating correlations per repetition; turned into corr_df once
# at the end instead of growing a DataFrame row by row.
split_corrs = []

for i in range(0,100):
    # Take a fresh array on every repetition: random.shuffle works in place, so
    # hoisting this out of the loop would change the sequence of splits.
    subject_list = df['userID'].unique()

    random.shuffle(subject_list)

    half = len(subject_list) // 2

    split1 = df[df['userID'].isin(subject_list[:half])]
    split2 = df[df['userID'].isin(subject_list[half:])]

    # Index both halves by movieID. Series.corr aligns on the index, so this
    # pairs each clip with itself even when one half has no ratings for a clip
    # that the other half does have (positional indices would then pair
    # different clips with each other).
    split1_calc = group_and_merge(split1).set_index('movieID')
    split2_calc = group_and_merge(split2).set_index('movieID')

    i_corr = [split1_calc[col].corr(split2_calc[col]) for col in col_list]

    split_corrs.append(i_corr)

# Rows are repetitions (0..99), columns are the ratings in col_list.
corr_df = pd.DataFrame(split_corrs, columns=col_list)

In [9]:
# Step the mean split-half correlation of each rating up with 2r / (1 + r)
# (the Spearman-Brown correction for a test of doubled length).
for rating in corr_df.columns:
    mean_r = corr_df[rating].mean()
    print(rating,  "SBP: ", 2*mean_r/(1+mean_r))

watched SBP:  0.6052765877278105
social SBP:  0.4212931971906028
feeling SBP:  0.7648424106343414
tension SBP:  0.7763123108119593
conflict SBP:  0.7940296653447201
violence SBP:  0.7599714787542613
count_people SBP:  0.813926604566568
count_interactions SBP:  0.6832710232482896


In [10]:
# Uncorrected mean split-half correlation for each rating.
for rating in corr_df.columns:
    print(rating, corr_df[rating].mean())

watched 0.43397607181608466
social 0.26685968315388775
feeling 0.6192265806561108
tension 0.6344039559040341
conflict 0.6584155866252623
violence 0.6128661282651712
count_people 0.686236288327782
count_interactions 0.5189154604418879


<font color='deeppink'>Reveal clips that have low agreement on whether or not the clip contains a social interaction:</font>

In [11]:
# 'social' is the per-clip mean of the 1/0 recoded answers, so a value of 0.5
# or less means at most half of the raters reported a social interaction.
# Select those clips with a boolean mask instead of walking the index
# positionally, which only works for a gap-free 0..n-1 index.
low_social = results_df[results_df["social"] <= 0.5]

for clip_id, social_share in zip(low_social["movieID"], low_social["social"]):
    print("Clip:", clip_id, "has low agreement level: ", social_share)

Clip: citizenfour_2373_2390 has low agreement level:  0.4166666666666667


<font color='deeppink'>Evaluate whether there are meaningful differences between those who have watched the film vs those who have not.</font>

In [12]:
# Ratings compared between raters who had seen the film and those who had not
# ('watched' itself is the grouping variable, so it is left out).
col_list2 = ['social', 'feeling', 'tension', 'conflict','violence','count_people','count_interactions']

watched = df[df.watched == 1].reset_index()
unwatched = df[df.watched == 0].reset_index()

# Per-clip summaries for each group.
watched_split = group_and_merge(watched)
unwatched_split = group_and_merge(unwatched)

# Keep only clips that were rated by BOTH groups. Filtering just one side
# leaves the frames misaligned whenever a clip was rated only by people who
# had seen it, and the comparison below pairs rows by position.
shared_clips = set(watched_split.movieID) & set(unwatched_split.movieID)

watched_split = watched_split[watched_split.movieID.isin(shared_clips)].reset_index(drop=True)
# reset_index() without drop=True, as before: the pre-filter row positions are
# retained in an 'index' column.
unwatched_split = unwatched_split[unwatched_split.movieID.isin(shared_clips)].reset_index()

In [13]:
#check to make sure the columns are aligned

# Walk both frames row by row and print the movieID of every row where the two
# frames disagree.
for row in range(watched_split.index.max() + 1):
    watched_clip = watched_split.loc[row, "movieID"]
    if watched_clip != unwatched_split.loc[row, "movieID"]:
        print(watched_clip)

# Nothing was printed when this was run, i.e. the rows line up.

In [14]:
# Correlate, across clips, the per-clip summary of each rating between raters
# who had seen the film and raters who had not.
w_corr = [watched_split[col].corr(unwatched_split[col]) for col in col_list2]

# Single-row frame (one column per rating), shown transposed for readability.
watched_df = pd.DataFrame(columns=col_list2)
watched_df.loc[len(watched_df.index)] = w_corr

watched_df.T
# In the recorded output most ratings correlate at roughly 0.5-0.7 between the
# two groups, which is encouraging; 'social' is the exception (about -0.04).

Unnamed: 0,0
social,-0.042998
feeling,0.609194
tension,0.542284
conflict,0.648338
violence,0.633678
count_people,0.682536
count_interactions,0.510198


# <font color='deeppink'>Generating a similarity matrix</font>
Now that we've checked for reliability among our participants, we want to construct a similarity matrix between each pair of movie clips for all 8 types of ratings. So, for example, we want to see how "far apart" clip_1 is from clip_10 on tension, as well as on violence, and so on. We will use these matrices as predictors for neural data later in this analysis.

In [15]:
from sklearn.metrics import pairwise_distances

In [16]:
# Output root for derived files (absolute, machine-specific path).
derivatives = "/Users/f004p74/Documents/dartmouth/projects/NNDb/derivatives/"

# Reload the saved per-clip ratings, ordered by movieID with a fresh 0..n-1
# index. NOTE: this rebinds `df`, which held the raw rater responses earlier
# in the notebook.
df = (
    pd.read_csv("movie_tcv_ratings.csv")
    .sort_values(by=['movieID'], ascending=True)
    .reset_index(drop=True)
)
col_list = ['watched', 'social', 'feeling', 'tension', 'conflict', 'violence', 'count_people', 'count_interactions']

In [17]:
# The clip labels are identical for every rating, so select them once.
labels = df[['movieID', 'movie']]

for col in col_list:
    # Single-column 2-D array: one row per clip, one feature (this rating).
    arr = df[col].to_numpy().reshape(-1, 1)

    # Clip-by-clip distances on this rating (pairwise_distances defaults).
    distance_matrix = pairwise_distances(arr)
    distance_df = pd.DataFrame(distance_matrix)

    # Put the movieID / movie labels in front of the distance columns.
    new_df = pd.concat([labels, distance_df], axis=1)

    #new_df.to_csv(derivatives+"behavioral_matrices/"+col+"_distance_matrix.csv", index=False)
