In [None]:
import pandas as pd
import sys
import ast

import nltk

import gensim
from gensim.models import word2vec

In [None]:
#Change these variables to work with your setup

#Pre-trained word2vec vectors (binary format)
PRETRAINED_WORD2VEC = '/Users/laura/software/word2vec/GoogleNews-vectors-negative300.bin'

#Image features: one file with a row per image, one with the averages
IMAGE_FEATURES_FILE = "data2015/image_features/features_image_all.csv"
AVERAGE_IMAGE_FEATURES_FILE ="data2015/image_features/features_image_all_averages.csv"

#Text features
TEXT_FEATURES_FILE = "data2015/text_features/allText.csv"

In [None]:
def _read_feature_table(csv_path):
    """Read one feature CSV and return it without its 'Unnamed: 0' column.

    Presumably 'Unnamed: 0' is the index written by the to_csv calls that save
    these same files later in the notebook - confirm. The drop raises if the
    column is absent.
    """
    table = pd.read_csv(csv_path)
    return table.drop('Unnamed: 0', axis=1)

#Text features, averaged image features, and per-image features (in that order)
textFeatures = _read_feature_table(TEXT_FEATURES_FILE)
imageFeatures = _read_feature_table(AVERAGE_IMAGE_FEATURES_FILE)
individualImageFeatures = _read_feature_table(IMAGE_FEATURES_FILE)

In [None]:
#MICRO IEUS (e.g., combined_image_unigrams)
#First, combine all of the image attributes (colors, scenes, places, etc.)
#Then, run the unigram extractor from these image attributes

#MACRO IEUS (e.g., all_image_unigrams)
#First, run the unigram extractor for each individual image (given those attributes)
#Then, concatenate all of the unigrams together

In [None]:
#MICRO IEUS
#Build one bag of words per image row from its colour, place and object columns
colorNames = ['black','blue','brown','grey','green','orange','pink','purple','red','white','yellow']
colorLabels = ['black_norm', 'blue_norm', 'brown_norm', 'grey_norm', 'green_norm', 'orange_norm',
               'pink_norm', 'purple_norm', 'red_norm', 'white_norm','yellow_norm']
placeLabels = [column for column in individualImageFeatures.columns.values if column.startswith("PLACES")]
objLabels = [column for column in individualImageFeatures.columns.values if column.startswith("IMAGENET")]


def _bag_of_words_for_image(image_row):
    """Return the list of unigrams describing one row of individualImageFeatures.

    The list holds, in order: every colour name whose '<colour>_norm' value is
    above 0.33, the words of the selected PLACES column label, and the words of
    every IMAGENET column label whose value is above 0.
    """
    words = [name for label, name in zip(colorLabels, colorNames) if image_row[label] > 0.33]

    #Select a place column: the first column is always taken, later ones replace it
    #when they score higher (or whenever the current best score is negative)
    best_score = -1
    best_place = ""
    for label in placeLabels:
        score = image_row[label]
        if best_score < 0 or score > best_score:
            best_score, best_place = score, label
    #Split the one underscore-joined label into two words
    if best_place == "PLACES baseball_field":
        best_place = "PLACES baseball field"
    #Everything after the first space-separated token of the label is kept
    words.extend(best_place.split(' ')[1:])

    for label in objLabels:
        if image_row[label] > 0:
            #Drop the label's last character, then skip its first two space-separated tokens
            words.extend(label[:-1].split(' ')[2:])

    return words


all_bow = [_bag_of_words_for_image(row) for index, row in individualImageFeatures.iterrows()]

individualImageFeatures['combined_image_unigrams'] = all_bow
#Overwrites the per-image feature file with the new column included
individualImageFeatures.to_csv(IMAGE_FEATURES_FILE)

In [None]:
#MACRO IEUS
#Concatenate all of the individual image unigrams vectors into a single unigram vector for each person
#Result: unigrams maps each id to one list holding the unigrams of all of its images, in row order
unigrams = {}

for index,row in individualImageFeatures.iterrows():
    #setdefault gives every id its own fresh list before extending it.
    #Storing row['combined_image_unigrams'] directly and then using += would extend that
    #very list object in place, i.e. the list still held in the
    #individualImageFeatures['combined_image_unigrams'] cell of the id's first image,
    #so that image would end up carrying the unigrams of every image of the id.
    unigrams.setdefault(row['id'], []).extend(row['combined_image_unigrams'])

In [None]:
#Look up the concatenated unigram list of every id in imageFeatures (row order preserved)
#An id without an entry in unigrams raises KeyError
all_image_unigrams = [unigrams[person_id] for person_id in imageFeatures['id']]
imageFeatures['all_image_unigrams'] = all_image_unigrams

In [None]:
#Same as above, but preserve which unigrams go with which images
#Keys have the form '<id>_<first character of str(imageNum)>'; a later row with the same key
#replaces the earlier one
unigrams = {
    row['id'] + '_' + str(row['imageNum'])[0]: row['combined_image_unigrams']
    for index, row in individualImageFeatures.iterrows()
}

In [None]:
def _unigrams_for_slot(suffix):
    """Return, for every id in imageFeatures, the unigrams stored under id + suffix.

    Ids with no such key in unigrams get a new empty list.
    """
    return [unigrams.get(_id + suffix, []) for _id in imageFeatures['id']]

#One list per image slot, aligned with the rows of imageFeatures
image1_unigrams = _unigrams_for_slot('_1')
image2_unigrams = _unigrams_for_slot('_2')
image3_unigrams = _unigrams_for_slot('_3')
image4_unigrams = _unigrams_for_slot('_4')
image5_unigrams = _unigrams_for_slot('_5')

#Store each slot as its own column
for column_name, slot_unigrams in [('image1_unigrams', image1_unigrams),
                                   ('image2_unigrams', image2_unigrams),
                                   ('image3_unigrams', image3_unigrams),
                                   ('image4_unigrams', image4_unigrams),
                                   ('image5_unigrams', image5_unigrams)]:
    imageFeatures[column_name] = slot_unigrams

In [None]:
#Write imageFeatures, with the unigram columns added above, back over the file it was loaded from.
#to_csv writes the index by default; presumably that is the 'Unnamed: 0' column dropped at load time.
imageFeatures.to_csv(AVERAGE_IMAGE_FEATURES_FILE)

In [None]:
#Merge text features and image features
#all_image_unigrams is merged as well: the tokenizing step below reads it from textFeatures,
#but it is only ever computed on imageFeatures
image_unigram_columns = ['all_image_unigrams','image1_unigrams','image2_unigrams',
                         'image3_unigrams','image4_unigrams','image5_unigrams']

#If textFeatures already carries any of these columns (e.g. reloaded from a CSV saved by an
#earlier run of this notebook), drop them first. Otherwise merge would rename both copies
#to <name>_x / <name>_y and the plain column names used later would no longer exist.
stale_columns = [column for column in image_unigram_columns if column in textFeatures.columns]

#Inner join on 'id': rows without a matching id on the other side are dropped
textFeatures = textFeatures.drop(stale_columns, axis=1).merge(
    imageFeatures[['id'] + image_unigram_columns], left_on='id', right_on='id')

In [None]:
#Tokenize unigrams
#For every row: the tokenized captions followed by that row's image unigrams.
#The rows are paired explicitly rather than adding a list of lists to a Series, which
#depends on pandas/NumPy turning a ragged list of lists into an object array.
combined_tokens = []
for caption, image_unigrams in zip(textFeatures['all_captions'], textFeatures['all_image_unigrams']):
    #A cell reloaded from CSV is the string form of a list rather than a list; parse it back
    if not isinstance(image_unigrams, (list, tuple)):
        image_unigrams = ast.literal_eval(image_unigrams)
    combined_tokens.append(nltk.word_tokenize(caption) + list(image_unigrams))

textFeatures['all_captions_all_image_unigrams_tokenized'] = combined_tokens

In [None]:
#Stem unigrams
lancaster = nltk.LancasterStemmer()

def _lancaster_stems(tokens):
    """Return the Lancaster stem of every token, in the original order."""
    return [lancaster.stem(token) for token in tokens]

#One stemmed list per row of the combined caption + image unigram tokens
textFeatures['all_captions_all_image_unigrams_stemmed'] = [
    _lancaster_stems(tokens) for tokens in textFeatures['all_captions_all_image_unigrams_tokenized']]

In [None]:
#word2vec model
#Load the pre-trained vectors stored in binary word2vec format.
#gensim >= 1.0 exposes load_word2vec_format on KeyedVectors (Word2Vec.load_word2vec_format
#was deprecated there and later removed); older releases only have it on Word2Vec.
#Both kinds of object support the `token in model` and `model[token]` lookups used below.
_word2vec_loader = getattr(gensim.models, 'KeyedVectors', gensim.models.Word2Vec)
model = _word2vec_loader.load_word2vec_format(PRETRAINED_WORD2VEC, binary=True)

In [None]:
#Tokenized captions on their own, one token list per row
textFeatures['all_captions_tokenized'] = list(map(nltk.word_tokenize, textFeatures['all_captions']))

In [None]:
#Word2vec - average the pre-trained vectors of each row's tokens (nothing is trained here)
#NOTE(review): 'all_captions_combined_image_unigrams_tokenized' is not created anywhere in this
#notebook; presumably it already exists in TEXT_FEATURES_FILE - confirm before running.
EMBEDDING_SIZE = 300  #number of components read from every word vector


def _average_embedding(tokens, vectors, size):
    """Return the element-wise mean of the vectors of all tokens found in `vectors`.

    tokens  -- iterable of token strings
    vectors -- object supporting `token in vectors` and `vectors[token]`
    size    -- number of leading components to accumulate from each vector

    Tokens missing from `vectors` are skipped and do not count towards the mean.
    When no token is found the result is a list of `size` zeros.
    """
    totals = [0] * size
    found = 0
    for token in tokens:
        if token in vectors:
            embedding = vectors[token]
            found += 1
            for i in range(size):
                totals[i] += embedding[i]
    if found > 0:
        totals = [value / found for value in totals]
    return totals


all_embeddings = []
for tokenized_comment in textFeatures['all_captions_combined_image_unigrams_tokenized']:
    #Cells reloaded from CSV hold the string form of a token list; cells built in this
    #session are already lists and must not go through literal_eval (it would raise)
    if not isinstance(tokenized_comment, (list, tuple)):
        tokenized_comment = ast.literal_eval(tokenized_comment)
    all_embeddings.append(_average_embedding(tokenized_comment, model, EMBEDDING_SIZE))
textFeatures['all_captions_combined_image_unigrams_word2vec_averaged'] = all_embeddings

#Also expose every component of the averaged vector as its own numbered column
for i in range(EMBEDDING_SIZE):
    textFeatures['all_captions_combined_image_unigrams_word2vec_averaged_' + str(i)] = \
        [embedding[i] for embedding in all_embeddings]

In [None]:
#Write textFeatures, with every column added in this notebook, back over the file it was loaded from.
#to_csv writes the index by default; presumably that is the 'Unnamed: 0' column dropped at load time.
textFeatures.to_csv(TEXT_FEATURES_FILE)