The goal of this file is to combine the small segments of the forrest gump movie fmri run into a combined pandas dataframe

In [2]:
import pandas as pd
import os 
import pickle as pickle
import csv
import numpy as np

In [5]:
cwd = os.getcwd()
annDir = os.path.join(cwd, 'studyforrest-data-annotations')
segDir = os.path.join(annDir, 'segments', 'avmovie')

In [6]:
segDir

'/Users/sarahsweigart/Desktop/NMA_project/studyforrest-data-annotations/segments/avmovie'

In [7]:
exFn = "locations_run-4_events.tsv"
int(exFn[14])

4

In [8]:
def add_to_df(dataToAdd, DF):
    # ...
    DF_added = DF
    
    return DF_added

def get_locations_filename(run, segDir, anotType):
    if anotType == 'loc':
        filename = 'locations_run-' + str(run+1) + '_events.tsv'
    elif anotType == 'char':
        filename = 'emotions_av_1s_events_run-' + str(run+1) + '_events.tsv'
    filename = os.path.join(segDir, filename)
    
    return filename

In [9]:
#columns = ["onset", "duration", "major_location", "setting", "locale", "int_or_ext", "flow_of_time", "time_of_day", "run"]
movAnnotations = pd.DataFrame()

runNum = 8
length = 0
for run in range(runNum):
    # open each run
    filenameL = get_locations_filename(run, segDir, 'loc')
    currentSeg = pd.read_csv(filenameL, delimiter='\t')
    
    # add a column for the corresponding run
    numberOfRows = len(currentSeg)
    runArray = np.ones(numberOfRows)*(run+1) 
    currentSeg['run'] = runArray
    
    length += len(currentSeg)

    # add to a main DF 
    movAnnotations = pd.concat([movAnnotations, currentSeg], ignore_index=True)

In [10]:
movAnnotations.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0
3,272.2,14.12,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0
4,286.32,14.24,Greenbow Alabama,doctor's office,doctor's office,int,0,day,1.0


## Notes
We want to look for places and people (and possibly time jumps)
- do we subset to a list of places we care about?
- how do we label so they're the same across runs
    - pick locations (of interest) and give them a number system
    - replace with 0, -, + (code for things that happend last -> replacing 0s to have a scale of relevant time in FG's life
    
Data
- major_setting, setting, locale
- very broad, a bit more fine grain, very fine grain
- also have "ext"/"int"; "night"/"day"

Scale of location:
- for most of our questions, we'll want to look at "setting"
- but we can also ask how physical distance may be encoded using "major_setting"

Locations
- reduce them to locations that occur in between runs 
- ex: run 1 has 15 locations that don't appear in run 2 etc.
- is there enough temporal time between locations to look at them?

In [11]:
# Code flow of time to get a coarse coding of how much time has passed

Next steps:
* group "setting" within the same run to sum the duration 
* reducing events -> finding locations that occur multiple times (and in different scenes) 

Helpful way to structure annotations:
* 

In [12]:
def get_unique_value_across_runs(dataFrame, columnLabel, runNum):
    unique_values_across_runs = []
    
    for run in range(runNum):
        unique_runs = pd.unique(dataFrame[dataFrame['run']==run][columnLabel])
        unique_values_across_runs.extend(unique_runs)
    return unique_values_across_runs

def get_unique_counts(count_array):
    unique, counts = np.unique(count_array, return_counts=True)
    uniqueDict = dict(zip(unique, counts))
    return uniqueDict

In [51]:
unique_settings = get_unique_value_across_runs(movAnnotations, 'setting', runNum)
locationDict = get_unique_counts(unique_settings)


In [14]:
all_unique_major = get_unique_value_across_runs(movAnnotations, 'major_location', runNum)
majorDict = get_unique_counts(all_unique_major)

In [15]:
compressedTimes = pd.DataFrame(columns= movAnnotations.columns)
compressedTimesIndex = 0 

# if setting = previous setting -> add them together and continue iteration 
for i, row in movAnnotations.iterrows( ):
    
    if i == 0:
        compressedTimes = compressedTimes.append(row)
            
    else:
        if row['setting'] == compressedTimes.iloc[compressedTimesIndex]['setting']:
            compressedTimes.iat[compressedTimesIndex,1] = row['duration'] + compressedTimes.iloc[compressedTimesIndex]['duration']
            
        else:
            compressedTimes = compressedTimes.append(row)
            compressedTimesIndex += 1

            
            

In [16]:
compressedTimes.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0
3,272.2,39.76,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0
6,311.96,6.32,Greenbow Alabama,main street,crossroads,ext,+,day,1.0


In [17]:
np.min(compressedTimes['duration'])

1.16

In [18]:
compressedTimes[compressedTimes['duration']==np.min(compressedTimes['duration'])]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
300,559.96,1.16,Europe,battlefield in World War 1,battlefield in World War 1,ext,++,day,3.0


In [19]:

compressedDict = get_unique_counts(compressedTimes['setting'])

In [20]:

multipleDict = dict((k, v) for k, v in compressedDict.items() if v > 2.0)
print(multipleDict)
print(len(multipleDict))

{'Gump House': 17, 'White House': 7, 'access-road': 9, 'bench at bus stop': 21, 'football field': 3, 'harbor': 4, 'main street': 8, 'on the boat': 8, 'tree on a field': 6}
9


In [21]:
# make pandas table to only have ^ multiples (and mark which ones are which?)
# - need to narrow down data formatting to answer questions about representations 

## Add Characters to our movAnnotations DF
* first we'll need to load in the character information 
* then we'll need to add in time points that correspond to major characters
* then we'll need to splice the two (characters DF and movAnnotations) together


In [22]:
emAnnotations = pd.DataFrame()

for run in range(runNum):
    # open emotional annotations to glean out major character information
    
    filename = get_locations_filename(run, segDir, 'char')
    currentSeg = pd.read_csv(filename, delimiter='\t')
    
    # add a column for the corresponding run
    numberOfRows = len(currentSeg)
    runArray = np.ones(numberOfRows)*(run+1) 
    currentSeg['run'] = runArray
    
    length += len(currentSeg)

    # add to a main DF 
    emAnnotations = pd.concat([emAnnotations, currentSeg], ignore_index=True)
    

In [23]:
emAnnotations.head()

Unnamed: 0,onset,duration,character,arousal,valence_positive,valence_negative,c_audio,c_context,c_face,c_gesture,...,e_love,e_pity/compassion,e_pride,e_relief,e_remorse,e_resent,e_sadness,e_satisfaction,e_shame,run
0,192.0,12.0,FORREST,-0.666667,0.666667,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,204.0,2.0,FORREST,-0.777778,0.777778,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,206.0,3.0,FORREST,-0.666667,0.888889,0.0,0.222222,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,209.0,16.0,FORREST,-0.333333,0.555556,0.0,0.111111,0.0,0.333333,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,225.0,15.0,FORREST,-0.555556,0.555556,0.0,0.111111,0.0,0.222222,0.222222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
majorCharacters = ['MRSGUMP', 'JENNY', 'FORREST', 'BUBBA', 'DAN']

In [25]:
majorCharDf = emAnnotations[emAnnotations['character'].isin(majorCharacters)]

In [26]:
majorCharDf.head()
# 1043 total rows

Unnamed: 0,onset,duration,character,arousal,valence_positive,valence_negative,c_audio,c_context,c_face,c_gesture,...,e_love,e_pity/compassion,e_pride,e_relief,e_remorse,e_resent,e_sadness,e_satisfaction,e_shame,run
0,192.0,12.0,FORREST,-0.666667,0.666667,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,204.0,2.0,FORREST,-0.777778,0.777778,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,206.0,3.0,FORREST,-0.666667,0.888889,0.0,0.222222,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,209.0,16.0,FORREST,-0.333333,0.555556,0.0,0.111111,0.0,0.333333,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,225.0,15.0,FORREST,-0.555556,0.555556,0.0,0.111111,0.0,0.222222,0.222222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [198]:
# make a copy of movAnnotations for now to mess around with the character information 
movCopy = movAnnotations.copy()
skellyChar = np.zeros(len(movCopy))
skellyChar = [str(int(i)) for i in skellyChar]

movCopy['character'] = skellyChar
movCopy.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0,0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0,0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,0
3,272.2,14.12,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0,0
4,286.32,14.24,Greenbow Alabama,doctor's office,doctor's office,int,0,day,1.0,0


In [199]:
newIdx = len(movCopy)

for i, row in majorCharDf.iterrows():
    
    thisRun = movCopy[movCopy['run']==row['run']]
    
    # find the nearest onset to the character onset in movAnnotations
    orderByOnset = thisRun.iloc[(thisRun['onset']-row['onset']).abs().argsort()[:2]]
    nearestOnsetIdx = orderByOnset.index.tolist()

    
    # add the character row to movCopy using the location data from the nearest onset 
    infoToAdd = movCopy.iloc[nearestOnsetIdx[0]]
    
    #add row
    movCopy.loc[newIdx] = movCopy.iloc[nearestOnsetIdx[0]]

    
    # change the character value 
    movCopy.iat[newIdx, 9] = row['character']
    movCopy.iat[newIdx, 0] = row['onset']
    movCopy.iat[newIdx, 1] = row['duration']
    
    newIdx += 1

In [200]:
test = movCopy.groupby('run').apply(lambda x:x.sort_values('onset'))

# NOTE: WE NEED TO OFFSET ONSET BY THE RUN THEY'RE IN

In [52]:
len(movCopy) - len(pd.unique(movCopy['onset']))


# Run 1 -> 2 repeats of onset
# Run 2 -> 12
# Run 3 -> 5
# 4-> 8
# 5 -> 13
# 6-> 6
# 7-> 11
# 8 -> 8

123

In [117]:
A = movCopy[movCopy['run']==4.0]

#examples of which onsets are duplicates 
counts_of_onset = get_unique_counts(A[['onset']])
duplicates =  dict((k, v) for k, v in counts_of_onset.items() if v > 1.0)
A[A['onset'].isin(duplicates.keys())]




Unnamed: 0_level_0,Unnamed: 1_level_0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,contin_onset
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4.0,1231,48.84,6.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,FORREST,48.84
4.0,1230,48.84,5.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,DAN,48.84
4.0,1229,48.84,1.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,BUBBA,48.84
4.0,1242,90.84,6.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,BUBBA,90.84
4.0,1243,90.84,7.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,FORREST,90.84
4.0,1254,114.84,6.0,Vietnam,embattled jungle,in the embattled jungle,ext,0,day,4.0,FORREST,114.84
4.0,1253,114.84,6.0,Vietnam,embattled jungle,in the embattled jungle,ext,0,day,4.0,BUBBA,114.84
4.0,1271,224.84,24.0,Vietnam,embattled jungle,finding place of Dan,ext,0,day,4.0,FORREST,224.84
4.0,1270,224.84,6.0,Vietnam,embattled jungle,finding place of Dan,ext,0,day,4.0,DAN,224.84
4.0,1317,491.84,1.0,Vietnam,hospital dormitory,hospital dormitory,int,-,day,4.0,FORREST,491.84


## Section: Organizing data to reflect TRs
Make a combined DF where we will have a row for every TR with 1) annotations of setting(/location) and major characters

In [245]:
movCopy = test.copy()

#list of all the lengths of runs
lengthOfRun = []
for allrun in range(runNum):
    helper_mov = movCopy[movCopy['run'] == allrun+1]
    max_row = helper_mov[helper_mov['onset']==[helper_mov['onset'].max()]]
    lengthOfRun.append(max_row['duration'].max() + max_row['onset'].max())

#create an empty array the length of the run 
new_times = np.zeros(len(movCopy))

#create array of summed secons from previous runs, and add a zero at the beginning so you don't add anything to the first run
offset_onsets = np.cumsum(lengthOfRun)
offset_onsets = np.insert(offset_onsets,0,0)


#add whatever currrent value 
for every_onset in range(len(new_times)):
    new_times[every_onset] = movCopy['onset'].iloc[every_onset]+offset_onsets[int(movCopy['run'].iloc[every_onset])-1]

#add the new onset times to the dataframe
movCopy['cont_onset'] = new_times
movCopy.tail()



[   0.     17.    168.08 ... 7053.   7054.   7057.  ]


Unnamed: 0_level_0,Unnamed: 1_level_0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
8.0,1908,603.56,10.0,Gump property,access-road,at the mail boxes,ext,0,day,8.0,FORREST,7043.0
8.0,868,611.56,60.2,Gump property,access-road,at the mail boxes,ext,0,day,8.0,0,7051.0
8.0,1909,613.56,1.0,Gump property,access-road,at the mail boxes,ext,0,day,8.0,FORREST,7053.0
8.0,1910,614.56,3.0,Gump property,access-road,at the mail boxes,ext,0,day,8.0,FORREST,7054.0
8.0,1911,617.56,3.0,Gump property,access-road,at the mail boxes,ext,0,day,8.0,FORREST,7057.0


### Make sure the continuous onset times make sense with the annotations/ data!!

In [215]:
movCopy['run'] == allrun+1

[run      
 1.0  0       False
      1       False
      2       False
      869     False
      870     False
      871     False
      872     False
      873     False
      874     False
      3       False
      875     False
      876     False
      877     False
      4       False
      5       False
      878     False
      879     False
      6       False
      7       False
      8       False
      9       False
      880     False
      10      False
      11      False
      12      False
      13      False
      881     False
      882     False
      883     False
      884     False
              ...  
 8.0  1898     True
      853      True
      854      True
      1899     True
      855      True
      856      True
      857      True
      1900     True
      1901     True
      858      True
      859      True
      1902     True
      860      True
      861      True
      1903     True
      862      True
      1904     True
      863      True
      864

In [None]:
# create function
# find list of onscreen annotations falls in a given TR 


def get_TR_people_and_events(TR, movAnnotationsDf):
    
    return allEventsInTR

In [131]:
import decimal

def float_range(start, stop, step):
    while start < stop:
        yield float(start)
        start += decimal.Decimal(step)

print(list(float_range(0, 1, '0.1')))

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


In [155]:
def find_nearest_time(time1, secondDf, DfCol):
    orderByNearest = secondDf.iloc[(secondDf[DfCol]-time1).abs().argsort()[:2]]
    
    return orderByNearest.index.tolist()

In [161]:
TRs = np.array(list(float_range(0, 40, 2.0)))
TRs

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20., 22., 24.,
       26., 28., 30., 32., 34., 36., 38.])

### Big bad loop

In [162]:
a = movCopy.columns
a[0]

'onset'

In [165]:
addZerosForTR = np.zeros(len(movCopy))
movCopy['TR'] = addZerosForTR

In [166]:
movCopy.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,TR
963,0.0,2.0,Savannah,bench at bus stop,bench at bus stop,ext,++,day,2.0,FORREST,0.0
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0,0,0.0
84,0.08,5.08,Savannah,bench at bus stop,bench at bus stop,ext,++,day,2.0,0,0.0
1808,0.56,2.0,Savannah,Jenny's apartment,Jenny's apartment,int,0,day,8.0,JENNY,0.0
792,1.48,6.12,Savannah,Jenny's apartment,Jenny's apartment,int,0,day,8.0,0,0.0


In [1]:
columns = movCopy.columns
columns = [columns[i] for i in range(len(columns))]
columns.extend(['TR'])
TRdf = pd.DataFrame(columns=columns)

TRidx = 0

for TR in TRs:
    # do some stuff
    
    # find nearest timepoint from our movAnnotations
    nearestTime = find_nearest_time(TR, movCopy, 'onset')
    timeIdx = nearestTime[0]
    print(TR, movCopy['onset'][timeIdx])
    
    if TR == movCopy['onset'][timeIdx]:
        print('this would be great if this was true for all TRs')
    else:
        print("we will need to align the time")
    
    
    # 10th column?
    TRdf.loc[TRidx] = movCopy.iloc[nearestOnsetIdx[0]]
    TRidx += 1

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-1dfe191cc450>", line 1, in <module>
    columns = movCopy.columns
NameError: name 'movCopy' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2018, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/anaconda3/lib/python3.7/site-packages/IPython/core/ultratb.py", line 1095, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/anaconda3/lib/python3.7/site-packages/IPython/core/ultratb.

NameError: name 'movCopy' is not defined

In [148]:
TRdf

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,TR
