The goal of this notebook is to combine the per-run annotation segments of the Forrest Gump movie fMRI experiment into a single pandas DataFrame.

In [43]:
import pandas as pd
import os 
import pickle as pickle
import csv
import numpy as np

In [23]:
# Resolve the annotation directories relative to the directory the notebook is run from
cwd = os.getcwd()
annDir = os.path.join(cwd, 'studyforrest-data-annotations')
# segments/avmovie sits inside the annotations checkout
segDir = os.path.join(cwd, 'studyforrest-data-annotations', 'segments', 'avmovie')

In [24]:
# Display the resolved segment directory to confirm the path
segDir

'C:\\Users\\elawl\\Documents\\2020_NMA_Group_Project\\studyforrest-data-annotations\\segments\\avmovie'

In [13]:
# Example: recover the run number from an annotation file name.
exFn = "locations_run-4_events.tsv"
# Take the text between 'run-' and the next underscore instead of a fixed
# character position, so multi-digit run numbers are parsed correctly too.
int(exFn.split('run-')[1].split('_')[0])

4

In [54]:
def add_to_df(dataToAdd, DF):
    """Placeholder for adding `dataToAdd` to `DF`.

    Currently a stub: `dataToAdd` is ignored and the very same `DF` object
    is handed back unchanged.
    """
    # TODO: implement the actual merge of dataToAdd into DF
    return DF

def get_locations_filename(run, segDir):
    """Build the path of the locations annotation file for one run.

    `run` is a 0-based index; the file names are 1-based, so `run + 1`
    is what appears in the name. The result is joined onto `segDir`.
    """
    basename = 'locations_run-{}_events.tsv'.format(run + 1)
    return os.path.join(segDir, basename)

In [215]:
# Expected columns in each per-run file:
#   onset, duration, major_location, setting, locale, int_or_ext, flow_of_time, time_of_day
# A 'run' column is added below.
runNum = 8

# Read every run first and concatenate once at the end. Calling pd.concat inside
# the loop (seeded with an empty DataFrame) re-copies all earlier rows each time.
runSegments = []
for run in range(runNum):
    # open each run (get_locations_filename turns the 0-based index into the 1-based file name)
    filename = get_locations_filename(run, segDir)
    currentSeg = pd.read_csv(filename, delimiter='\t')

    # add a column for the corresponding run (1-based). Kept as float so the
    # values match the existing outputs and the `== 3.0` style filters below.
    currentSeg['run'] = float(run + 1)

    runSegments.append(currentSeg)

# total number of rows read, used as a sanity check against the combined frame
length = sum(len(seg) for seg in runSegments)

# combine into the main DF with a fresh 0..N-1 index
movAnnotations = pd.concat(runSegments, ignore_index=True)

In [219]:
# Sanity check: combined row count vs. the sum of the per-run row counts
print(movAnnotations.shape[0], length)

869 869


In [218]:
# Preview the first five combined annotation rows
movAnnotations.head(n=5)

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0
3,272.2,14.12,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0
4,286.32,14.24,Greenbow Alabama,doctor's office,doctor's office,int,0,day,1.0


## Notes
We want to look for places and people (and possibly time jumps)
- do we subset to a list of places we care about?
- how do we label so they're the same across runs
    - pick locations (of interest) and give them a number system
    - replace with 0, -, + (code for things that happened last -> replacing 0s to have a scale of relevant time in FG's life)
    
Data
- major_location, setting, locale
- very broad, a bit more fine grain, very fine grain
- also have "ext"/"int"; "night"/"day"

Scale of location:
- for most of our questions, we'll want to look at "setting"
- but we can also ask how physical distance may be encoded using "major_location"

Locations
- reduce them to locations that occur in between runs 
- ex: run 1 has 15 locations that don't appear in run 2 etc.
- is there enough time between locations to look at them?

In [60]:
# Code flow of time to get a coarse coding of how much time has passed

Next steps:
* group "setting" within the same run to sum the duration 
* reducing events -> finding locations that occur multiple times (and in different scenes) 

Helpful way to structure annotations:
* 

In [71]:
# get unique values in each run

all_unique_locations = []

# The 'run' column is 1-based (1..runNum), so iterate over 1..runNum.
# Iterating over range(runNum) would look for a non-existent run 0 and
# never include the last run.
for run in range(1, runNum + 1):
    unique_run_settings = pd.unique(movAnnotations.loc[movAnnotations['run'] == run, 'setting'])
    # a setting that appears in several runs is added once per run, so counting
    # this list later gives the number of runs each setting occurs in
    all_unique_locations.extend(unique_run_settings)

In [162]:
# Tally how often each setting occurs in the per-run unique lists
unique, counts = np.unique(all_unique_locations, return_counts=True)
locationDict = {name: count for name, count in zip(unique, counts)}
#locationDict

In [77]:
all_unique_major = []

# The 'run' column is 1-based (1..runNum), so iterate over 1..runNum.
# Iterating over range(runNum) would look for a non-existent run 0 and
# never include the last run.
for run in range(1, runNum + 1):
    unique_run_major = pd.unique(movAnnotations.loc[movAnnotations['run'] == run, 'major_location'])
    # a major location that appears in several runs is added once per run
    all_unique_major.extend(unique_run_major)

In [161]:
# Tally how often each major location occurs in the per-run unique lists
unique, counts = np.unique(all_unique_major, return_counts=True)
majorDict = {name: count for name, count in zip(unique, counts)}
#majorDict

In [80]:
# Distinct settings across the whole movie, in order of first appearance
unique_settings = movAnnotations['setting'].unique()

In [125]:
# Count how many annotation rows carry each setting. Counting over the
# already-deduplicated `unique_settings` array would give 1 for every key,
# so the counts are taken from the full 'setting' column instead.
unique, counts = np.unique(movAnnotations['setting'], return_counts=True)
settings = dict(zip(unique, counts))
#settings

In [220]:
# Collapse consecutive rows that share the same 'setting' into one event:
# keep the first row of each stretch (its onset and other columns, and its
# original index label) and replace 'duration' with the summed duration.
#
# Done with a vectorised run-length grouping rather than appending row by row:
# DataFrame.append was removed in pandas 2.0, and addressing 'duration' by name
# avoids depending on it being the column at position 1.
settingCol = movAnnotations['setting']

# A new stretch starts wherever the setting differs from the previous row;
# the cumulative sum turns those starts into a stretch id per row.
stretchId = (settingCol != settingCol.shift()).cumsum()
isStretchStart = ~stretchId.duplicated()

compressedTimes = movAnnotations.loc[isStretchStart].copy()
# groupby returns the stretches in ascending id order, i.e. in row order,
# so the sums line up positionally with the first rows selected above
compressedTimes['duration'] = movAnnotations.groupby(stretchId)['duration'].sum().to_numpy()

# NOTE(review): as before, stretches are merged purely on 'setting', so a setting
# that continues across a run boundary is merged across runs - confirm this is intended.

            
            

In [221]:
# Number of events left after merging consecutive same-setting rows
compressedTimes.shape[0]

186

In [222]:
# Number of rows before merging, for comparison
movAnnotations.shape[0]

869

In [223]:
# Preview the merged events; index labels are those of the first row of each stretch
compressedTimes.head(n=5)

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0
3,272.2,39.76,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0
6,311.96,6.32,Greenbow Alabama,main street,crossroads,ext,+,day,1.0


In [224]:
# Number of annotation rows in run 3
len(movAnnotations.loc[movAnnotations['run'] == 3.0])

94

In [225]:
# Number of annotation rows in run 2
len(movAnnotations.loc[movAnnotations['run'] == 2.0])

155

In [226]:
# Number of annotation rows in run 4
len(movAnnotations.loc[movAnnotations['run'] == 4.0])

137

In [228]:
# Inspect the raw rows at positions 290-304 of the combined frame
movAnnotations.iloc[290:305]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
290,465.6,39.16,Vietnam,Military Base,at Dan's tent,ext,0,day,3.0
291,504.76,4.64,Vietnam,Military Base,at Dan's tent,ext,0,day,3.0
292,509.4,4.56,Vietnam,Military Base,at Dan's tent,ext,0,day,3.0
293,513.96,13.24,Vietnam,Military Base,at Dan's tent,ext,0,day,3.0
294,527.2,2.0,Vietnam,Military Base,at Dan's tent,ext,0,day,3.0
295,529.2,5.44,Vietnam,Military Base,at Dan's tent,ext,0,day,3.0
296,534.64,19.76,Vietnam,Military Base,at Dan's tent,ext,0,day,3.0
297,554.4,2.48,Vietnam,Military Base,at Dan's tent,ext,0,day,3.0
298,556.88,1.48,United States,battlefield in Revolutionary War,battlefield in Revolutionary War,ext,-,day,3.0
299,558.36,1.6,United States,battlefield in American Zivil War,battlefield in American Zivil War,ext,++,day,3.0


In [229]:
# Look up every row annotated with this particular setting
movAnnotations.loc[movAnnotations['setting'].eq('battlefield in World War 1')]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
300,559.96,1.16,Europe,battlefield in World War 1,battlefield in World War 1,ext,++,day,3.0


In [231]:
# Shortest merged event duration
compressedTimes['duration'].min()

1.16

In [232]:
# Show the merged event(s) that have that shortest duration
compressedTimes.loc[compressedTimes['duration'] == compressedTimes['duration'].min()]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
300,559.96,1.16,Europe,battlefield in World War 1,battlefield in World War 1,ext,++,day,3.0


In [233]:

# Count how many separate (merged) events each setting has
unique, counts = np.unique(compressedTimes['setting'], return_counts=True)
compressedDict = {name: count for name, count in zip(unique, counts)}


In [234]:
# Display the number of merged events per setting
compressedDict 

{'Ambassador Hotel': 1,
 "Black Panther's HQ": 1,
 "Bubba's Grave": 2,
 "Bubba's Home": 2,
 "Bubba's mother's dining room": 1,
 "Dan's apartment": 1,
 'Disco': 1,
 'Foursquare Church': 2,
 'Gump House': 17,
 'Gump Medical Center': 1,
 'Hilton Hotel': 1,
 "Jenny's apartment": 1,
 "Jenny's grandma's trailer": 2,
 "Jenny's house": 2,
 'Lincoln Memorial': 1,
 'Margaret Mitchell Hall': 1,
 'Military Base': 2,
 'Ping Pong Tournament': 1,
 'TV studio': 1,
 'Tidal Basin with Jefferson Memorial': 1,
 'Walk of Fame': 1,
 'Watergate Hotel': 1,
 'White House': 7,
 'access-road': 9,
 'apartment with balcony': 1,
 'apartment with mirrored wall': 1,
 'army bus': 2,
 'autumn forrest': 1,
 'bar in New York': 2,
 'barracks': 1,
 'battlefield in American Zivil War': 1,
 'battlefield in Revolutionary War': 1,
 'battlefield in World War 1': 1,
 'battlefield in World War 2': 1,
 'beacon': 1,
 'bench at bus stop': 21,
 'bridge at mississippi river': 1,
 'bridge near club': 1,
 'bus stop': 1,
 'college gradua

In [238]:

# Keep only the settings that have more than one merged event
multipleDict = {setting: nEvents for setting, nEvents in compressedDict.items() if nEvents > 1.0}
multipleDict

{"Bubba's Grave": 2,
 "Bubba's Home": 2,
 'Foursquare Church': 2,
 'Gump House': 17,
 "Jenny's grandma's trailer": 2,
 "Jenny's house": 2,
 'Military Base': 2,
 'White House': 7,
 'access-road': 9,
 'army bus': 2,
 'bar in New York': 2,
 'bench at bus stop': 21,
 'football field': 3,
 'football stadium': 2,
 'harbor': 4,
 'hospital dormitory': 2,
 'main street': 8,
 'on the boat': 8,
 'rain-swept camp': 2,
 'riverside': 2,
 'road through Monument Valley': 2,
 'road with bridge': 2,
 'school bus': 2,
 'street with houses': 2,
 'tree on a field': 6}

In [None]:
# make pandas table to only have ^ multiples (and mark which ones are which?)
# - need to narrow down data formatting to answer questions about representations 