The goal of this file is to combine the small segments of the forrest gump movie fmri run into a combined pandas dataframe

In [2]:
import pandas as pd
import os 
import pickle as pickle
import csv
import numpy as np

In [7]:
cwd = os.getcwd()
annDir = os.path.join(cwd, 'studyforrest-data-annotations')
segDir = os.path.join(annDir, 'segments', 'avmovie')

In [4]:
segDir

'/Users/ChuckRoast/Documents/GitHub/NMA_project/studyforrest-data-annotations/segments/avmovie'

In [8]:
cwd

'/Users/ChuckRoast/Documents/GitHub/NMA_project'

In [8]:
exFn = "locations_run-4_events.tsv"
int(exFn[14])

4

In [52]:
def add_to_df(dataToAdd, DF):
    # ...
    DF_added = DF
    
    return DF_added

def get_locations_filename(run, segDir, anotType):
    if anotType == 'loc':
        filename = 'locations_run-' + str(run+1) + '_events.tsv'
    elif anotType == 'char':
        filename = 'emotions_av_1s_events_run-' + str(run+1) + '_events.tsv'
    filename = os.path.join(segDir, filename)
    
    return filename

In [11]:
#columns = ["onset", "duration", "major_location", "setting", "locale", "int_or_ext", "flow_of_time", "time_of_day", "run"]
movAnnotations = pd.DataFrame()

runNum = 8
length = 0
for run in range(runNum):
    # open each run
    filenameL = get_locations_filename(run, segDir, 'loc')
    currentSeg = pd.read_csv(filenameL, delimiter='\t')
    
    # add a column for the corresponding run
    numberOfRows = len(currentSeg)
    runArray = np.ones(numberOfRows)*(run+1) 
    currentSeg['run'] = runArray
    
    length += len(currentSeg)

    # add to a main DF 
    movAnnotations = pd.concat([movAnnotations, currentSeg], ignore_index=True)

In [12]:
movAnnotations.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0
3,272.2,14.12,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0
4,286.32,14.24,Greenbow Alabama,doctor's office,doctor's office,int,0,day,1.0


## Notes
We want to look for places and people (and possibly time jumps)
- do we subset to a list of places we care about?
- how do we label so they're the same across runs
    - pick locations (of interest) and give them a number system
    - replace with 0, -, + (code for things that happend last -> replacing 0s to have a scale of relevant time in FG's life
    
Data
- major_setting, setting, locale
- very broad, a bit more fine grain, very fine grain
- also have "ext"/"int"; "night"/"day"

Scale of location:
- for most of our questions, we'll want to look at "setting"
- but we can also ask how physical distance may be encoded using "major_setting"

Locations
- reduce them to locations that occur in between runs 
- ex: run 1 has 15 locations that don't appear in run 2 etc.
- is there enough temporal time between locations to look at them?

In [13]:
# Code flow of time to get a coarse coding of how much time has passed

Next steps:
* group "setting" within the same run to sum the duration 
* reducing events -> finding locations that occur multiple times (and in different scenes) 

Helpful way to structure annotations:
* 

In [28]:
def get_unique_value_across_runs(dataFrame, columnLabel, runNum):
    unique_values_across_runs = []
    
    for run in range(runNum):
        unique_runs = pd.unique(dataFrame[dataFrame['run']==run][columnLabel])
        unique_values_across_runs.extend(unique_runs)
    return unique_values_across_runs

def get_unique_counts(count_array):
    unique, counts = np.unique(count_array, return_counts=True)
    uniqueDict = dict(zip(unique, counts))
    return uniqueDict

In [30]:
unique_settings = get_unique_value_across_runs(movAnnotations, 'setting', runNum)
locationDict = get_unique_counts(unique_settings)
locationDict

{'Ambassador Hotel': 1,
 "Black Panther's HQ": 1,
 "Bubba's Grave": 1,
 "Bubba's Home": 1,
 "Bubba's mother's dining room": 1,
 "Dan's apartment": 1,
 'Disco': 1,
 'Foursquare Church': 1,
 'Gump House': 4,
 'Gump Medical Center': 1,
 'Hilton Hotel': 1,
 "Jenny's apartment": 1,
 "Jenny's grandma's trailer": 2,
 "Jenny's house": 1,
 'Lincoln Memorial': 1,
 'Margaret Mitchell Hall': 1,
 'Military Base': 1,
 'Ping Pong Tournament': 1,
 'TV studio': 1,
 'Tidal Basin with Jefferson Memorial': 1,
 'Walk of Fame': 1,
 'Watergate Hotel': 1,
 'White House': 3,
 'access-road': 4,
 'apartment with balcony': 1,
 'apartment with mirrored wall': 1,
 'army bus': 1,
 'autumn forrest': 1,
 'bar in New York': 1,
 'barracks': 1,
 'battlefield in American Zivil War': 1,
 'battlefield in Revolutionary War': 1,
 'battlefield in World War 1': 1,
 'battlefield in World War 2': 1,
 'beacon': 1,
 'bench at bus stop': 6,
 'bridge at mississippi river': 1,
 'bridge near club': 1,
 'bus stop': 1,
 'college graduati

In [31]:
all_unique_major = get_unique_value_across_runs(movAnnotations, 'major_location', runNum)
majorDict = get_unique_counts(all_unique_major)

In [32]:
compressedTimes = pd.DataFrame(columns= movAnnotations.columns)
compressedTimesIndex = 0 

# if setting = previous setting -> add them together and continue iteration 
for i, row in movAnnotations.iterrows( ):
    
    if i == 0:
        compressedTimes = compressedTimes.append(row)
            
    else:
        if row['setting'] == compressedTimes.iloc[compressedTimesIndex]['setting']:
            compressedTimes.iat[compressedTimesIndex,1] = row['duration'] + compressedTimes.iloc[compressedTimesIndex]['duration']
            
        else:
            compressedTimes = compressedTimes.append(row)
            compressedTimesIndex += 1

            
            

In [33]:
compressedTimes.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0
3,272.2,39.76,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0
6,311.96,6.32,Greenbow Alabama,main street,crossroads,ext,+,day,1.0


In [34]:
np.min(compressedTimes['duration'])

1.16

In [35]:
compressedTimes[compressedTimes['duration']==np.min(compressedTimes['duration'])]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
300,559.96,1.16,Europe,battlefield in World War 1,battlefield in World War 1,ext,++,day,3.0


In [36]:

compressedDict = get_unique_counts(compressedTimes['setting'])

96

In [46]:

multipleDict = dict((k, v) for k, v in compressedDict.items() if v > 2.0)
print(multipleDict)
print(len(multipleDict))

{'Gump House': 17, 'White House': 7, 'access-road': 9, 'bench at bus stop': 21, 'football field': 3, 'harbor': 4, 'main street': 8, 'on the boat': 8, 'tree on a field': 6}
9


In [39]:
# make pandas table to only have ^ multiples (and mark which ones are which?)
# - need to narrow down data formatting to answer questions about representations 

## Add Characters to our movAnnotations DF
* first we'll need to load in the character information 
* then we'll need to add in time points that correspond to major characters
* then we'll need to splice the two (characters DF and movAnnotations) together


In [53]:
emAnnotations = pd.DataFrame()

for run in range(runNum):
    # open emotional annotations to glean out major character information
    
    filename = get_locations_filename(run, segDir, 'char')
    currentSeg = pd.read_csv(filename, delimiter='\t')
    
    # add a column for the corresponding run
    numberOfRows = len(currentSeg)
    runArray = np.ones(numberOfRows)*(run+1) 
    currentSeg['run'] = runArray
    
    length += len(currentSeg)

    # add to a main DF 
    emAnnotations = pd.concat([emAnnotations, currentSeg], ignore_index=True)
    

In [55]:
emAnnotations.head()

Unnamed: 0,onset,duration,character,arousal,valence_positive,valence_negative,c_audio,c_context,c_face,c_gesture,...,e_love,e_pity/compassion,e_pride,e_relief,e_remorse,e_resent,e_sadness,e_satisfaction,e_shame,run
0,192.0,12.0,FORREST,-0.666667,0.666667,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,204.0,2.0,FORREST,-0.777778,0.777778,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,206.0,3.0,FORREST,-0.666667,0.888889,0.0,0.222222,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,209.0,16.0,FORREST,-0.333333,0.555556,0.0,0.111111,0.0,0.333333,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,225.0,15.0,FORREST,-0.555556,0.555556,0.0,0.111111,0.0,0.222222,0.222222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [57]:
majorCharacters = ['MRSGUMP', 'JENNY', 'FORREST', 'BUBBA', 'DAN']

In [61]:
majorCharDf = emAnnotations[emAnnotations['character'].isin(majorCharacters)]

In [64]:
majorCharDf.head()
# 1043 total rows

Unnamed: 0,onset,duration,character,arousal,valence_positive,valence_negative,c_audio,c_context,c_face,c_gesture,...,e_love,e_pity/compassion,e_pride,e_relief,e_remorse,e_resent,e_sadness,e_satisfaction,e_shame,run
0,192.0,12.0,FORREST,-0.666667,0.666667,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,204.0,2.0,FORREST,-0.777778,0.777778,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,206.0,3.0,FORREST,-0.666667,0.888889,0.0,0.222222,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,209.0,16.0,FORREST,-0.333333,0.555556,0.0,0.111111,0.0,0.333333,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,225.0,15.0,FORREST,-0.555556,0.555556,0.0,0.111111,0.0,0.222222,0.222222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [106]:
# make a copy of movAnnotations for now to mess around with the character information 
movCopy = movAnnotations.copy()
skellyChar = np.zeros(len(movCopy))
skellyChar = [str(int(i)) for i in skellyChar]

movCopy['character'] = skellyChar
movCopy.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0,0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0,0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,0
3,272.2,14.12,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0,0
4,286.32,14.24,Greenbow Alabama,doctor's office,doctor's office,int,0,day,1.0,0


In [107]:
newIdx = len(movCopy)

for i, row in majorCharDf.iterrows():
    
    thisRun = movCopy[movCopy['run']==row['run']]
    
    # find the nearest onset to the character onset in movAnnotations
    orderByOnset = thisRun.iloc[(thisRun['onset']-row['onset']).abs().argsort()[:2]]
    nearestOnsetIdx = orderByOnset.index.tolist()

    
    # add the character row to movCopy using the location data from the nearest onset 
    infoToAdd = movCopy.iloc[nearestOnsetIdx[0]]
    
    #add row
    movCopy.loc[newIdx] = movCopy.iloc[nearestOnsetIdx[0]]

    
    # change the character value 
    movCopy.iat[newIdx, 9] = row['character']
    movCopy.iat[newIdx, 0] = row['onset']
    movCopy.iat[newIdx, 1] = row['duration']
    
    newIdx += 1

In [108]:
movCopy[movCopy['character']!= '0']

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character
869,192.00,12.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
870,204.00,2.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
871,206.00,3.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
872,209.00,16.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
873,225.00,15.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
874,245.00,15.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
875,275.00,3.0,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0,FORREST
876,278.00,7.0,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0,FORREST
877,285.00,3.0,Greenbow Alabama,doctor's office,doctor's office,int,0,day,1.0,FORREST
878,308.00,3.0,Greenbow Alabama,main street,crossroads,ext,+,day,1.0,MRSGUMP


In [123]:
test = movCopy.groupby('run').apply(lambda x:x.sort_values('onset'))
test

Unnamed: 0_level_0,Unnamed: 1_level_0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,0,0.00,17.00,Paramount,mountain logo,mountain logo,ext,0,day,1.0,0
1.0,1,17.00,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0,0
1.0,2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,0
1.0,869,192.00,12.00,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
1.0,870,204.00,2.00,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
1.0,871,206.00,3.00,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
1.0,872,209.00,16.00,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
1.0,873,225.00,15.00,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
1.0,874,245.00,15.00,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
1.0,3,272.20,14.12,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0,0


In [179]:
movCopy[movCopy['duration'] < 2.0]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,TR
964,2.00,1.00,Savannah,bench at bus stop,bench at bus stop,ext,++,day,2.0,FORREST,0.0
965,3.00,1.00,Savannah,bench at bus stop,bench at bus stop,ext,++,day,2.0,FORREST,0.0
1223,4.84,1.00,Vietnam,embattled jungle,embattled embankment,ext,++,day,4.0,FORREST,0.0
1106,4.92,1.00,United States,barracks,dormitory,int,++,day,3.0,BUBBA,0.0
1362,5.80,1.00,Washington D.C.,protester's camp,protester's camp,ext,++,night,5.0,JENNY,0.0
1363,5.80,1.00,Washington D.C.,protester's camp,protester's camp,ext,++,night,5.0,FORREST,0.0
1107,5.92,1.00,United States,barracks,dormitory,int,++,day,3.0,BUBBA,0.0
967,6.00,1.00,Savannah,bench at bus stop,bench at bus stop,ext,0,day,2.0,FORREST,0.0
968,9.00,1.00,Savannah,bench at bus stop,bench at bus stop,ext,0,day,2.0,FORREST,0.0
87,15.80,1.16,Gump property,access-road,alley to Gump House,ext,0,day,2.0,0,0.0



# Adding chronology to the annotations dataframe Here I have added a column with the measurement "chronology" which is a time line of the movie. 

0 - no discernible time (only one instance when flashing back to antebellum war time, KKK footage 

1 - young Forrest, early 1950s 

2 - Highschool and college Forrest, early 1960s 

3 - mid 1960s, Vietnam War 

4 - late 1960s, March on the Pentagon 

5 - Early 1970s, ping Pong championship in China, New Years with Dan 

6 - mid 1970s, Watergate, Hurricane Carmen 

7 - later 1970s, running, a lot of running 

8 - early 1980s, at the bench and meeting Jenny and son 

9 - year after marrying Jenny, end scene at the tree/hill 

In [9]:
chronoAnn = pd.read_csv("chronoAnn.csv")
chronoAnn

Unnamed: 0,run,Unnamed: 1,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run.1,character,chronology
0,1,0,0.00,17.00,Paramount,mountain logo,mountain logo,ext,0,day,1,0,0
1,1,1,17.00,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1,0,0
2,1,2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1,0,8
3,1,869,192.00,12.00,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1,FORREST,8
4,1,870,204.00,2.00,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1,FORREST,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1907,8,1908,603.56,10.00,Gump property,access-road,at the mail boxes,ext,0,day,8,FORREST,9
1908,8,868,611.56,60.20,Gump property,access-road,at the mail boxes,ext,0,day,8,0,9
1909,8,1909,613.56,1.00,Gump property,access-road,at the mail boxes,ext,0,day,8,FORREST,9
1910,8,1910,614.56,3.00,Gump property,access-road,at the mail boxes,ext,0,day,8,FORREST,9


# NOTE: WE NEED TO OFFSET ONSET BY THE RUN THEY'RE IN

In [170]:
len(movCopy) - len(pd.unique(movCopy['onset']))


# Run 1 -> 2 repeats of onset
# Run 2 -> 12
# Run 3 -> 5
# 4-> 8
# 5 -> 13
# 6-> 6
# 7-> 11
# 8 -> 8

123

In [178]:
A = movCopy[movCopy['run']==8.0]

len(A) - len(pd.unique(A['onset']))

8

## Section: Organizing data to relfect TRs
Make a combined DF where we will have a row for every TR with 1) annotations of setting(/location) and major characters

In [None]:
# create function
# find list of onscreen annotations falls in a given TR 


def get_TR_people_and_events(TR, movAnnotationsDf):
    
    return allEventsInTR

In [131]:
import decimal

def float_range(start, stop, step):
    while start < stop:
        yield float(start)
        start += decimal.Decimal(step)

print(list(float_range(0, 1, '0.1')))

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


In [155]:
def find_nearest_time(time1, secondDf, DfCol):
    orderByNearest = secondDf.iloc[(secondDf[DfCol]-time1).abs().argsort()[:2]]
    
    return orderByNearest.index.tolist()

In [161]:
TRs = np.array(list(float_range(0, 40, 2.0)))
TRs

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20., 22., 24.,
       26., 28., 30., 32., 34., 36., 38.])

### Big bad loop

In [162]:
a = movCopy.columns
a[0]

'onset'

In [165]:
addZerosForTR = np.zeros(len(movCopy))
movCopy['TR'] = addZerosForTR

In [166]:
movCopy.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,TR
963,0.0,2.0,Savannah,bench at bus stop,bench at bus stop,ext,++,day,2.0,FORREST,0.0
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0,0,0.0
84,0.08,5.08,Savannah,bench at bus stop,bench at bus stop,ext,++,day,2.0,0,0.0
1808,0.56,2.0,Savannah,Jenny's apartment,Jenny's apartment,int,0,day,8.0,JENNY,0.0
792,1.48,6.12,Savannah,Jenny's apartment,Jenny's apartment,int,0,day,8.0,0,0.0


In [167]:
columns = movCopy.columns
columns = [columns[i] for i in range(len(columns))]
columns.extend(['TR'])
TRdf = pd.DataFrame(columns=columns)

TRidx = 0

for TR in TRs:
    # do some stuff
    
    # find nearest timepoint from our movAnnotations
    nearestTime = find_nearest_time(TR, movCopy, 'onset')
    timeIdx = nearestTime[0]
    print(TR, movCopy['onset'][timeIdx])
    
    if TR == movCopy['onset'][timeIdx]:
        print('this would be great if this was true for all TRs')
    else:
        print("we will need to align the time")
    
    
    # 10th column?
    TRdf.loc[TRidx] = movCopy.iloc[nearestOnsetIdx[0]]
    TRidx += 1

0.0 0.0
this would be great if this was true for all TRs
2.0 2.0
this would be great if this was true for all TRs
4.0 4.0
this would be great if this was true for all TRs
6.0 6.0
this would be great if this was true for all TRs
8.0 7.72
we will need to align the time
10.0 10.0
this would be great if this was true for all TRs
12.0 13.8
we will need to align the time
14.0 13.8
we will need to align the time
16.0 16.0
this would be great if this was true for all TRs
18.0 18.88
we will need to align the time
20.0 20.0
this would be great if this was true for all TRs
22.0 21.96
we will need to align the time
24.0 24.0
this would be great if this was true for all TRs
26.0 26.0
this would be great if this was true for all TRs
28.0 27.8
we will need to align the time
30.0 30.0
this would be great if this was true for all TRs
32.0 31.72
we will need to align the time
34.0 33.96
we will need to align the time
36.0 35.84
we will need to align the time
38.0 37.72
we will need to align the time


In [148]:
TRdf

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,TR
