The goal of this file is to combine the small segments of the forrest gump movie fmri run into a combined pandas dataframe

In [2]:
import pandas as pd
import os 
import pickle as pickle
import csv
import numpy as np

In [3]:
cwd = os.getcwd()
annDir = os.path.join(cwd, 'studyforrest-data-annotations')
segDir = os.path.join(annDir, 'segments', 'avmovie')

In [4]:
segDir

'C:\\Users\\elawl\\Documents\\GitHub\\NMA_project_\\NMA_project\\studyforrest-data-annotations\\segments\\avmovie'

In [5]:
exFn = "locations_run-4_events.tsv"
int(exFn[14])

4

In [6]:
def add_to_df(dataToAdd, DF):
    # ...
    DF_added = DF
    
    return DF_added

def get_locations_filename(run, segDir, anotType):
    if anotType == 'loc':
        filename = 'locations_run-' + str(run+1) + '_events.tsv'
    elif anotType == 'char':
        filename = 'emotions_av_1s_events_run-' + str(run+1) + '_events.tsv'
    filename = os.path.join(segDir, filename)
    
    return filename

In [7]:
#columns = ["onset", "duration", "major_location", "setting", "locale", "int_or_ext", "flow_of_time", "time_of_day", "run"]
movAnnotations = pd.DataFrame()

runNum = 8
length = 0
for run in range(runNum):
    # open each run
    filenameL = get_locations_filename(run, segDir, 'loc')
    currentSeg = pd.read_csv(filenameL, delimiter='\t')
    
    # add a column for the corresponding run
    numberOfRows = len(currentSeg)
    runArray = np.ones(numberOfRows)*(run+1) 
    currentSeg['run'] = runArray
    
    length += len(currentSeg)

    # add to a main DF 
    movAnnotations = pd.concat([movAnnotations, currentSeg], ignore_index=True)

In [8]:
movAnnotations.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0
3,272.2,14.12,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0
4,286.32,14.24,Greenbow Alabama,doctor's office,doctor's office,int,0,day,1.0


## Notes
We want to look for places and people (and possibly time jumps)
- do we subset to a list of places we care about?
- how do we label so they're the same across runs
    - pick locations (of interest) and give them a number system
    - replace with 0, -, + (code for things that happend last -> replacing 0s to have a scale of relevant time in FG's life
    
Data
- major_setting, setting, locale
- very broad, a bit more fine grain, very fine grain
- also have "ext"/"int"; "night"/"day"

Scale of location:
- for most of our questions, we'll want to look at "setting"
- but we can also ask how physical distance may be encoded using "major_setting"

Locations
- reduce them to locations that occur in between runs 
- ex: run 1 has 15 locations that don't appear in run 2 etc.
- is there enough temporal time between locations to look at them?

In [9]:
# Code flow of time to get a coarse coding of how much time has passed

Next steps:
* group "setting" within the same run to sum the duration 
* reducing events -> finding locations that occur multiple times (and in different scenes) 

Helpful way to structure annotations:
* 

In [10]:
def get_unique_value_across_runs(dataFrame, columnLabel, runNum):
    unique_values_across_runs = []
    
    for run in range(runNum):
        unique_runs = pd.unique(dataFrame[dataFrame['run']==run][columnLabel])
        unique_values_across_runs.extend(unique_runs)
    return unique_values_across_runs

def get_unique_counts(count_array):
    unique, counts = np.unique(count_array, return_counts=True)
    uniqueDict = dict(zip(unique, counts))
    return uniqueDict

In [11]:
unique_settings = get_unique_value_across_runs(movAnnotations, 'setting', runNum)
locationDict = get_unique_counts(unique_settings)


In [12]:
all_unique_major = get_unique_value_across_runs(movAnnotations, 'major_location', runNum)
majorDict = get_unique_counts(all_unique_major)

In [13]:
compressedTimes = pd.DataFrame(columns= movAnnotations.columns)
compressedTimesIndex = 0 

# if setting = previous setting -> add them together and continue iteration 
for i, row in movAnnotations.iterrows( ):
    
    if i == 0:
        compressedTimes = compressedTimes.append(row)
            
    else:
        if row['setting'] == compressedTimes.iloc[compressedTimesIndex]['setting']:
            compressedTimes.iat[compressedTimesIndex,1] = row['duration'] + compressedTimes.iloc[compressedTimesIndex]['duration']
            
        else:
            compressedTimes = compressedTimes.append(row)
            compressedTimesIndex += 1

            
            

In [14]:
compressedTimes.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0
3,272.2,39.76,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0
6,311.96,6.32,Greenbow Alabama,main street,crossroads,ext,+,day,1.0


In [15]:
np.min(compressedTimes['duration'])

1.16

In [16]:
compressedTimes[compressedTimes['duration']==np.min(compressedTimes['duration'])]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run
300,559.96,1.16,Europe,battlefield in World War 1,battlefield in World War 1,ext,++,day,3.0


In [17]:

compressedDict = get_unique_counts(compressedTimes['setting'])

In [18]:

multipleDict = dict((k, v) for k, v in compressedDict.items() if v > 2.0)
print(multipleDict)
print(len(multipleDict))

{'Gump House': 17, 'White House': 7, 'access-road': 9, 'bench at bus stop': 21, 'football field': 3, 'harbor': 4, 'main street': 8, 'on the boat': 8, 'tree on a field': 6}
9


In [19]:
# make pandas table to only have ^ multiples (and mark which ones are which?)
# - need to narrow down data formatting to answer questions about representations 

## Add Characters to our movAnnotations DF
* first we'll need to load in the character information 
* then we'll need to add in time points that correspond to major characters
* then we'll need to splice the two (characters DF and movAnnotations) together


In [20]:
emAnnotations = pd.DataFrame()

for run in range(runNum):
    # open emotional annotations to glean out major character information
    
    filename = get_locations_filename(run, segDir, 'char')
    currentSeg = pd.read_csv(filename, delimiter='\t')
    
    # add a column for the corresponding run
    numberOfRows = len(currentSeg)
    runArray = np.ones(numberOfRows)*(run+1) 
    currentSeg['run'] = runArray
    
    length += len(currentSeg)

    # add to a main DF 
    emAnnotations = pd.concat([emAnnotations, currentSeg], ignore_index=True)
    

In [21]:
emAnnotations.head()

Unnamed: 0,onset,duration,character,arousal,valence_positive,valence_negative,c_audio,c_context,c_face,c_gesture,...,e_love,e_pity/compassion,e_pride,e_relief,e_remorse,e_resent,e_sadness,e_satisfaction,e_shame,run
0,192.0,12.0,FORREST,-0.666667,0.666667,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,204.0,2.0,FORREST,-0.777778,0.777778,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,206.0,3.0,FORREST,-0.666667,0.888889,0.0,0.222222,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,209.0,16.0,FORREST,-0.333333,0.555556,0.0,0.111111,0.0,0.333333,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,225.0,15.0,FORREST,-0.555556,0.555556,0.0,0.111111,0.0,0.222222,0.222222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [22]:
majorCharacters = ['MRSGUMP', 'JENNY', 'FORREST', 'BUBBA', 'DAN']

In [23]:
majorCharDf = emAnnotations[emAnnotations['character'].isin(majorCharacters)]

In [24]:
majorCharDf.head()
# 1043 total rows

Unnamed: 0,onset,duration,character,arousal,valence_positive,valence_negative,c_audio,c_context,c_face,c_gesture,...,e_love,e_pity/compassion,e_pride,e_relief,e_remorse,e_resent,e_sadness,e_satisfaction,e_shame,run
0,192.0,12.0,FORREST,-0.666667,0.666667,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,204.0,2.0,FORREST,-0.777778,0.777778,0.0,0.111111,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,206.0,3.0,FORREST,-0.666667,0.888889,0.0,0.222222,0.0,0.444444,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,209.0,16.0,FORREST,-0.333333,0.555556,0.0,0.111111,0.0,0.333333,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,225.0,15.0,FORREST,-0.555556,0.555556,0.0,0.111111,0.0,0.222222,0.222222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [25]:
# make a copy of movAnnotations for now to mess around with the character information 
movCopy = movAnnotations.copy()
skellyChar = np.zeros(len(movCopy))
skellyChar = [str(int(i)) for i in skellyChar]

movCopy['character'] = skellyChar
movCopy.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0,0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0,0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,0
3,272.2,14.12,Greenbow Alabama,doctor's office,doctor's office,int,-,day,1.0,0
4,286.32,14.24,Greenbow Alabama,doctor's office,doctor's office,int,0,day,1.0,0


In [26]:
newIdx = len(movCopy)

for i, row in majorCharDf.iterrows():
    
    thisRun = movCopy[movCopy['run']==row['run']]
    
    # find the nearest onset to the character onset in movAnnotations
    orderByOnset = thisRun.iloc[(thisRun['onset']-row['onset']).abs().argsort()[:2]]
    nearestOnsetIdx = orderByOnset.index.tolist()

    
    # add the character row to movCopy using the location data from the nearest onset 
    infoToAdd = movCopy.iloc[nearestOnsetIdx[0]]
    
    #add row
    movCopy.loc[newIdx] = movCopy.iloc[nearestOnsetIdx[0]]

    
    # change the character value 
    movCopy.iat[newIdx, 9] = row['character']
    movCopy.iat[newIdx, 0] = row['onset']
    movCopy.iat[newIdx, 1] = row['duration']
    
    newIdx += 1

In [27]:
test = movCopy.groupby('run').apply(lambda x:x.sort_values('onset'))

In [28]:
test = test.reset_index(drop=True)

In [29]:
test.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0,0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0,0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,0
3,192.0,12.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST
4,204.0,2.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST


# NOTE: WE NEED TO OFFSET ONSET BY THE RUN THEY'RE IN

In [30]:
len(movCopy) - len(pd.unique(movCopy['onset']))


# Run 1 -> 2 repeats of onset
# Run 2 -> 12
# Run 3 -> 5
# 4-> 8
# 5 -> 13
# 6-> 6
# 7-> 11
# 8 -> 8

123

In [31]:
# A = movCopy[movCopy['run']==4.0]

# #examples of which onsets are duplicates 
# counts_of_onset = get_unique_counts(A[['onset']])
# duplicates =  dict((k, v) for k, v in counts_of_onset.items() if v > 1.0)
# A[A['onset'].isin(duplicates.keys())]




## Section: Organizing data to reflect TRs
Make a combined DF where we will have a row for every TR with 1) annotations of setting(/location) and major characters

In [58]:
movCopy = test.copy()

#list of all the lengths of runs
lengthOfRun = []
for allrun in range(runNum):
    helper_mov = movCopy[movCopy['run'] == allrun+1]
    #max_row = helper_mov[helper_mov['onset']==helper_mov['onset'].max()]
    max_row = helper_mov[helper_mov['onset']==helper_mov['onset'].max()]
    rough_max = np.ceil(max_row['duration'].max() + max_row['onset'].max())
    rough_max += rough_max%2
     
    lengthOfRun.append(max_row['duration'].max() + max_row['onset'].max())

#create an empty array the length of the run 
new_times = np.zeros(len(movCopy))

#create array of summed secons from previous runs, and add a zero at the beginning so you don't add anything to the first run
offset_onsets = np.cumsum(lengthOfRun)
offset_onsets = np.insert(offset_onsets,0,0)


#add whatever currrent value 
for every_onset in range(len(new_times)):
    new_times[every_onset] = movCopy['onset'].iloc[every_onset]+offset_onsets[int(movCopy['run'].iloc[every_onset])-1]

#add the new onset times to the dataframe
movCopy['cont_onset'] = new_times
movCopy.tail()



Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset
1907,603.56,10.0,Gump property,access-road,at the mail boxes,ext,0,day,8.0,FORREST,7043.0
1908,611.56,60.2,Gump property,access-road,at the mail boxes,ext,0,day,8.0,0,7051.0
1909,613.56,1.0,Gump property,access-road,at the mail boxes,ext,0,day,8.0,FORREST,7053.0
1910,614.56,3.0,Gump property,access-road,at the mail boxes,ext,0,day,8.0,FORREST,7054.0
1911,617.56,3.0,Gump property,access-road,at the mail boxes,ext,0,day,8.0,FORREST,7057.0


In [59]:
movCopy.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0,0,0.0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0,0,17.0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,0,168.08
3,192.0,12.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST,192.0
4,204.0,2.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST,204.0


In [60]:
helper_mov['onset'].max()


617.56

In [61]:
helper_mov[helper_mov['onset']==883.0]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character


### Make sure the continuous onset times make sense with the annotations/ data!!

In [62]:
offset_onsets

array([   0.  ,  886.  , 1757.  , 2619.92, 3586.16, 4501.96, 5370.8 ,
       6439.44, 7060.  ])

### Adjust characters to binary scores

In [67]:
#add major characters as columns
emptyChar = np.zeros(len(movCopy))


#added the empty rows
for achar in majorCharacters:
    movCopy[achar] = emptyChar

# loop through rows and score people
for i, row in movCopy.iterrows():
    if row['character'][0] == 'F':
        movCopy['FORREST'][i] = 1
    elif row['character'][0] == 'M':
        movCopy['MRSGUMP'][i] = 1
    elif row['character'][0] == 'J':
        movCopy['JENNY'][i] = 1
    elif row['character'][0] == 'D':
        movCopy['DAN'][i] = 1
    elif row['character'][0] == 'B':
        movCopy['BUBBA'][i] = 1
    elif row['character'] != '0':
        print(row['character'])
        
print('done!')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


done!


In [69]:
tripleOnsets

[695, 696, 697]

In [70]:
movCopy.iloc[tripleOnsets]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset,MRSGUMP,JENNY,FORREST,BUBBA,DAN
695,48.84,6.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,FORREST,2668.76,0.0,0.0,1.0,0.0,0.0
696,48.84,5.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,DAN,2668.76,0.0,0.0,0.0,0.0,1.0
697,48.84,1.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,BUBBA,2668.76,0.0,0.0,0.0,1.0,0.0


In [52]:
movCopy = test.copy()

In [57]:
test.iloc[tripleOnsets]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character
695,48.84,6.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,FORREST
696,48.84,5.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,DAN
697,48.84,1.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,BUBBA


In [71]:
#backup = movCopy.copy()

In [72]:
rowsToFix = movCopy.iloc[tripleOnsets]
# first find min of the duration

shortestIdx = rowsToFix['duration'].idxmin()
movCopy.iloc[shortestIdx, 13] = 1.0
movCopy.iloc[shortestIdx, 15] = 1.0

secondShortestIdx = 696
movCopy.iat[secondShortestIdx, 1] -= movCopy.iloc[shortestIdx]['duration']
movCopy.iat[secondShortestIdx, 0] += movCopy.iloc[shortestIdx]['duration']

# second shortest
movCopy.iloc[secondShortestIdx, 13] = 1.0

finallyLast = 695
movCopy.iat[finallyLast, 1] -= movCopy.iloc[secondShortestIdx]['duration']
movCopy.iat[finallyLast, 0] += movCopy.iloc[secondShortestIdx]['duration']


# shortestDur = min(movCopy.iloc[tripleOnsets[0]]['duration'], movCopy.iloc[tripleOnsets[1]]['duration'], movCopy.iloc[tripleOnsets[2]]['duration'])
# shortestDurRow = rowsToFix[rowsToFix['duration'] == shortestDur]
# movCopy.iloc[697]['FORREST'] = 

In [73]:
movCopy.iloc[tripleOnsets] 

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset,MRSGUMP,JENNY,FORREST,BUBBA,DAN
695,52.84,2.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,FORREST,2668.76,0.0,0.0,1.0,0.0,0.0
696,49.84,4.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,DAN,2668.76,0.0,0.0,1.0,0.0,1.0
697,48.84,1.0,Vietnam,embattled jungle,embattled embankment,ext,0,day,4.0,BUBBA,2668.76,0.0,0.0,1.0,1.0,1.0


In [136]:
doubleOnsets[:10]
len(doubleOnsets)

18

In [137]:
movCopy.iloc[doubleOnsets]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset,MRSGUMP,JENNY,FORREST,BUBBA,DAN
747,120.84,0.0,Vietnam,embattled jungle,in the embattled jungle,ext,0,day,4.0,BUBBA,2734.76,0.0,0.0,1.0,1.0,0.0
748,120.84,2.0,Vietnam,embattled jungle,in the embattled jungle,ext,0,day,4.0,FORREST,2740.76,0.0,0.0,1.0,0.0,0.0
959,954.84,0.0,Washington D.C.,Lincoln Memorial,between Lincoln Memorial and Reflecting Pool,ext,0,day,4.0,JENNY,3571.76,0.0,1.0,0.0,0.0,0.0
960,954.84,11.0,Washington D.C.,Lincoln Memorial,between Lincoln Memorial and Reflecting Pool,ext,0,day,4.0,FORREST,3574.76,0.0,1.0,1.0,0.0,0.0
1020,133.8,0.0,Washington D.C.,White House,front of White House,ext,+,night,5.0,JENNY,3718.96,0.0,1.0,0.0,0.0,0.0
1021,133.8,1.0,Washington D.C.,White House,front of White House,ext,+,night,5.0,JENNY,3719.96,0.0,1.0,1.0,0.0,0.0
1069,253.8,0.0,Washington D.C.,bus stop,bus stop,ext,0,day,5.0,FORREST,3838.96,0.0,1.0,1.0,0.0,0.0
1070,253.8,1.0,Washington D.C.,bus stop,bus stop,ext,0,day,5.0,JENNY,3839.96,0.0,1.0,0.0,0.0,0.0
1398,524.72,1.0,Bayou La Batre,on the boat,on the boat,ext,++,day,6.0,FORREST,5025.68,0.0,0.0,1.0,0.0,0.0
1399,524.72,6.0,Bayou La Batre,on the boat,on the boat,ext,++,day,6.0,DAN,5026.68,0.0,0.0,0.0,0.0,1.0


In [82]:
def pairwise(iterable):
    "s -> (s0, s1), (s2, s3), (s4, s5), ..."
    a = iter(iterable)
    return zip(a, a)

for x, y in pairwise(doubleOnsets[:10]):
    print("%d, %d" % (x, y))

78, 79
148, 149
199, 200
240, 241
275, 276


In [92]:
#movCopy[majorCharacters]

In [114]:
tryA = movCopy.iloc[78]
char = tryA[majorCharacters]
char[char==1.0]

tryB = movCopy[majorCharacters].idxmax(axis=1)
tryB.iloc[78]

'FORREST'

In [176]:
# combine multiple characters 
indices = movCopy.index.tolist()
count = 0

doubleOnsets = []
#tripleOnsets = []

for i, index in enumerate(indices):
    
    # i is the index in our loop
    # index[1] is the current index of our dataframe
    if i < len(indices)-1:
        thisRow = movCopy.iloc[index]
        nextRow = movCopy.iloc[index+1]
        #nextNextRow = movCopy.iloc[index+2]
        
        if thisRow['onset'] == nextRow['onset']:# and thisRow['onset']==nextNextRow['onset']:
            doubleOnsets.extend([index, index+1])
            #tripleOnsets.extend([index, index+1, index+2])

In [177]:
len(doubleOnsets)

130

### loop through all pairs with the same onset and combine them

In [175]:
movCopy = backup.copy()

In [169]:
movCopy.head()

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset,MRSGUMP,JENNY,FORREST,BUBBA,DAN
0,0.0,17.0,Paramount,mountain logo,mountain logo,ext,0,day,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,17.0,151.08,Savannah,sky over Savannah,sky over Savannah,ext,++,day,1.0,0,17.0,0.0,0.0,0.0,0.0,0.0
2,168.08,104.12,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,0,168.08,0.0,0.0,0.0,0.0,0.0
3,192.0,12.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST,192.0,0.0,0.0,1.0,0.0,0.0
4,204.0,2.0,Savannah,bench at bus stop,bench at bus stop,ext,0,day,1.0,FORREST,204.0,0.0,0.0,1.0,0.0,0.0


In [178]:
# make dictionary for the row numbers of the characters 
charDict = {'MRSGUMP': 11, "JENNY":12, "FORREST": 13, "BUBBA":14, "DAN":15}
# MRs - 11 Junny - 12 Forrest - 13 Bubba - 14 Dan - 15

# paste how to calc doubleOnsets
# combine multiple characters 
loopCount = 0 
while len(doubleOnsets) > 0 and loopCount < 100:
    print(len(doubleOnsets))
    for row1, row2 in pairwise(doubleOnsets):
    # for pairs of onsets in doubleOnsets
        rowsToFix = movCopy.iloc[[row1, row2]]
        shortestIdx = rowsToFix['duration'].idxmin()
        
        

        longestIdx = np.setdiff1d([row1, row2],[shortestIdx])
        longestIdx = longestIdx[0]

        # find the characters that are needed to add 
        longestRow = movCopy.iloc[longestIdx]

        characters = movCopy[majorCharacters].idxmax(axis=1)
        character = characters.iloc[longestIdx] 
        characterCol = charDict[character]

        # add characters to the shortest duration
        movCopy.iat[shortestIdx, characterCol] = 1.0

        # offset longest by shortest duration and onset
        # duration
        movCopy.iat[longestIdx, 1] -= movCopy.iloc[shortestIdx]['duration']
        # onset
        movCopy.iat[longestIdx, 0] += movCopy.iloc[shortestIdx]['duration']
    
    if movCopy.iloc[shortestIdx]['duration'] == 0:
        # drop this row 
        movCopy = movCopy[movCopy['duration']> 0.0]
            
    movCopy = movCopy.groupby('run').apply(lambda x:x.sort_values('onset'))
    movCopy = movCopy.reset_index(drop=True)
        
    
    doubleOnsets=[]
    indices = movCopy.index.tolist()
    for i, index in enumerate(indices):
    
    # i is the index in our loop
    # index[1] is the current index of our dataframe
        if i < len(indices)-1:
            thisRow = movCopy.iloc[index]
            nextRow = movCopy.iloc[index+1]


            if thisRow['onset'] == nextRow['onset']:
                doubleOnsets.extend([index, index+1])
                
    loopCount += 1
           


130
112
66
48
44
40
36
36
34
32
14
10
8
8
8
6
6
6
6
6
6
6
6
6
6
6
4


In [179]:
movCopy.iloc[[78, 79, 80]]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset,MRSGUMP,JENNY,FORREST,BUBBA,DAN
78,580.0,2.0,Gump property,Gump House,Forrest's bedroom,int,+,night,1.0,MRSGUMP,580.0,1.0,0.0,1.0,0.0,0.0
79,582.0,1.0,Gump property,Gump House,Forrest's bedroom,int,+,night,1.0,MRSGUMP,582.0,1.0,0.0,1.0,0.0,0.0
80,583.0,3.0,Gump property,Gump House,Forrest's bedroom,int,+,night,1.0,FORREST,580.0,0.0,0.0,1.0,0.0,0.0


In [180]:
backup.iloc[363:376]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset,MRSGUMP,JENNY,FORREST,BUBBA,DAN
363,458.0,3.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,FORREST,1344.0,0.0,0.0,1.0,0.0,0.0
364,461.0,1.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1347.0,0.0,1.0,0.0,0.0,0.0
365,461.0,18.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,FORREST,1347.0,0.0,0.0,1.0,0.0,0.0
366,462.0,7.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1348.0,0.0,1.0,0.0,0.0,0.0
367,469.0,4.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1355.0,0.0,1.0,0.0,0.0,0.0
368,473.0,5.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1359.0,0.0,1.0,0.0,0.0,0.0
369,478.0,2.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1364.0,0.0,1.0,0.0,0.0,0.0
370,479.0,5.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,FORREST,1365.0,0.0,0.0,1.0,0.0,0.0
371,480.0,8.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1366.0,0.0,1.0,0.0,0.0,0.0
372,484.0,3.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,FORREST,1370.0,0.0,0.0,1.0,0.0,0.0


In [181]:
movCopy.iloc[360:376]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset,MRSGUMP,JENNY,FORREST,BUBBA,DAN
360,456.0,1.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,FORREST,1342.0,0.0,0.0,1.0,0.0,0.0
361,457.0,1.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,FORREST,1343.0,0.0,0.0,1.0,0.0,0.0
362,458.0,3.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,FORREST,1344.0,0.0,0.0,1.0,0.0,0.0
363,461.0,1.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1347.0,0.0,1.0,1.0,0.0,0.0
364,462.0,7.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1348.0,0.0,1.0,1.0,0.0,0.0
365,469.0,4.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1355.0,0.0,1.0,1.0,0.0,0.0
366,473.0,5.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1359.0,0.0,1.0,1.0,0.0,0.0
367,478.0,1.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,FORREST,1347.0,0.0,1.0,1.0,0.0,0.0
368,479.0,1.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,JENNY,1364.0,0.0,1.0,1.0,0.0,0.0
369,480.0,4.0,University of Alabama,Margaret Mitchell Hall,Margaret Mitchell Hall,ext,0,night,2.0,FORREST,1365.0,0.0,1.0,1.0,0.0,0.0


In [125]:
unique_cont, counts_cont = np.unique(movCopy['onset'], return_counts=True)

In [126]:
counts_cont

array([2, 1, 1, ..., 1, 1, 1], dtype=int64)

In [128]:
len(unique_cont) - len(movCopy)

-113

In [None]:
# ##reduce rows
# unique_cont, counts_cont = np.unique(movCopy['cont_onset'], return_counts=True)


# lilcopy = movCopy[movCopy['run']==3]
# #newonsets = [241.92,824.92]
    
    
    
# #an attempt to reduce the labeling and then the movCopy. Does not work :(
# for onsets in unique_cont: # this loops 
#     pract_rows = movCopy[movCopy['onset'] == onsets] 
#     each_one = pract_rows[['MRSGUMP','JENNY','FORREST','BUBBA','DAN']].sum()
#     for i, row in pract_rows.iterrows():
#         #callind = pract_rows.index[arow][1]
#         movCopy.iat[i[1], 11] = each_one[0]
#         movCopy.iat[i[1], 12] = each_one[1]
#         movCopy.iat[i[1], 13] = each_one[2]
#         movCopy.iat[i[1], 14] = each_one[3]
#         movCopy.iat[i[1], 15] = each_one[4]


In [None]:
find_nearest_time(4.0, movCopy, 'onset')[0:2]

## Number of TRs

In [None]:
def get_info_for_TR(dataFrame, currentTR):
    nearestOnset = find_nearest_time(TR, dataFrame, 'onset')
    
    # make sure that nearestOnset is less than currentTR
    
    lengthOfTR = 2.0 
    
    # want everything that occurs at indexing time starts
    
    
    
    return rowOfDataFrame

In [64]:
import decimal

def float_range(start, stop, step):
    while start < stop:
        yield float(start)
        start += decimal.Decimal(step)

print(list(float_range(0, 1, '0.1')))

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


In [65]:
def find_nearest_time(time1, secondDf, DfCol):
    orderByNearest = secondDf.iloc[(secondDf[DfCol]-time1).abs().argsort()]
    
    return orderByNearest.index.tolist()

In [66]:
TRs = np.array(list(float_range(0, 40, 2.0)))
TRs

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20., 22., 24.,
       26., 28., 30., 32., 34., 36., 38.])

In [188]:
TRsByRun = [451,441,438,488,462,439,542,338]

TR_list = np.array(list(float_range(0, TRsByRun[0], 2.0)))
TR_index = range(TRsByRun[0])


In [185]:
output1 = enumerate(TR_list)

### Big bad loop

In [199]:
# add column to TR_table to mark changes in location within a TR
movCopy['locChange'] = np.zeros(len(movCopy))

#making an empty table with the same columns
TR_table = pd.DataFrame(columns = movCopy.columns)

In [None]:
# current_TRs = 0
# TR_index = 0

# for i, row in small_ann.iterrows():
#     duration = row['duration']
#     onset = row['onset']
    leftover = duration%2
    for tr_step in range(int(np.floor(duration/2))):
        TR_table.loc[TR_index] = row
        TR_table.iat[TR_index, 11] = TR_index*2.0
        
        TR_index += 1


In [198]:
movCopy.iloc[200:205]

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset,MRSGUMP,JENNY,FORREST,BUBBA,DAN,locChange
200,24.48,0.8,Gump property,access-road,alley to Gump House,ext,0,day,2.0,0,910.48,0.0,0.0,0.0,0.0,0.0,1.0
201,25.0,1.0,Gump property,access-road,alley to Gump House,ext,0,day,2.0,JENNY,911.0,0.0,1.0,1.0,0.0,0.0,1.0
202,25.28,1.44,Gump property,access-road,alley to Gump House,ext,0,day,2.0,0,911.28,0.0,0.0,0.0,0.0,0.0,1.0
203,26.0,1.0,Gump property,access-road,alley to Gump House,ext,0,day,2.0,JENNY,912.0,0.0,1.0,1.0,0.0,0.0,1.0
204,26.72,2.08,Gump property,access-road,alley to Gump House,ext,0,day,2.0,0,912.72,0.0,0.0,0.0,0.0,0.0,1.0


In [195]:
# loop by run
for run in range(1):#runNum):
    thisRunsTRs = np.array(list(float_range(0, TRsByRun[run], 2.0)))
    thisRunsTRIdx = range(TRsByRun[run])
    # create list of TRs that is thisRunsTRs - 1
    TR_helper_list = thisRunsTRs - np.ones(len(thisRunsTRs))
    
    currentTR = thisRunsTRs[0]
    
    thisRun = movCopy[movCopy['run']==run+1.0]
    
    for i, row in thisRun.iterrows():
        onset = row['onset']
        duration = row['duration']
        if onset > currentTR:
            print('what the heck')
        else:
            # add this onset's information to the TR
            
            if duration < 2.0:
                # we need to add multiple onsets for one TR
                
                # we need to combine multiple rows from movCopy into one TR row 
                
            else:
                # we need to add multiple TRs for one onset 
                while duration > 0.0:
                    # add TRs and subtract 2.0 from duration 
            
            

    
    
    

SyntaxError: unexpected EOF while parsing (<ipython-input-195-e91004ac0233>, line 24)

In [192]:
thisRun

Unnamed: 0,onset,duration,major_location,setting,locale,int_or_ext,flow_of_time,time_of_day,run,character,cont_onset,MRSGUMP,JENNY,FORREST,BUBBA,DAN


In [None]:

TR_table.head()

In [None]:
duration = 26.8
leftover = duration%2
leftover

In [None]:
addZerosForTR = np.zeros(len(movCopy))
movCopy['TR'] = addZerosForTR

In [None]:
movCopy.head()

In [None]:
columns = movCopy.columns
columns = [columns[i] for i in range(len(columns))]
columns.extend(['TR'])
TRdf = pd.DataFrame(columns=columns)

TRidx = 0

for TR in TRs:
    # do some stuff
    
    # find nearest timepoint from our movAnnotations
    nearestTime = find_nearest_time(TR, movCopy, 'onset')
    timeIdx = nearestTime[0]
    print(TR, movCopy['onset'][timeIdx])
    
    if TR == movCopy['onset'][timeIdx]:
        print('this would be great if this was true for all TRs')
    else:
        print("we will need to align the time")
    
    
    # 10th column?
    TRdf.loc[TRidx] = movCopy.iloc[nearestOnsetIdx[0]]
    TRidx += 1

In [None]:
TRdf