In [3]:
import glob
import os

import numpy as np
import pandas as pd
from keras.applications.vgg19 import VGG19
from keras.applications.vgg19 import preprocess_input
from keras.layers import Dense, Dropout
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing import image
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

In [4]:
# Build embeddings.csv: one row per face image, laid out as
#   ticker, profit, image path, <VGG19 'fc1' activations ...>
# Images that cannot be processed are skipped, recorded, and summarised at the end
# instead of being dropped silently.

# Load the profit/loss table; only its 'Symbol' and 'Profit' columns are read below.
profits_tbl = pd.read_csv('stock_profits2.csv')

# Embedding model: ImageNet-weighted VGG19 truncated at its 'fc1' layer, so
# predict() returns that layer's activations rather than class scores.
base_model = VGG19(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc1').output)

# Every image sits one directory below faces/; the directory name carries the ticker.
fileList = glob.glob("faces/*/*.jpg")

# Fragments stripped (in this order) from the directory name to recover the ticker.
searchTerms = ["CEO","ChiefExecutiveOfficer","CEOprofilephoto","topexecutive","profilephoto"]

counter = 0    # stays 0 when no images are found
skipped = []   # (image path, reason) for every image that produced no row

# NOTE: the file is opened in append mode, so re-running this cell adds the same
# rows again. Delete embeddings.csv first when a clean rebuild is wanted.
with open("embeddings.csv", "a") as f:
    for counter, img_path in enumerate(fileList, 1):
        try:
            # Parent directory name, independent of the platform's path separator.
            folder = os.path.basename(os.path.dirname(img_path))
            for term in searchTerms:
                folder = folder.replace(term, "")
            ticker = folder

            # Resolve the label before the (expensive) forward pass so images
            # without a usable label cost no inference time.
            parray = profits_tbl.loc[profits_tbl['Symbol'] == ticker, "Profit"]
            if len(parray) != 1:
                raise LookupError(
                    "expected exactly one 'Profit' row for ticker %r, found %d"
                    % (ticker, len(parray))
                )
            profit = int(parray.iloc[0])

            # Load at 224x224 and add a leading batch axis before VGG19 preprocessing.
            img = image.load_img(img_path, target_size=(224, 224))
            x = image.img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)

            fc1 = model.predict(x)
            embedding = fc1[0]

            row = pd.DataFrame([ticker, profit, img_path] + list(embedding)).T
            row.to_csv(f, header=False, index=False)
        except Exception as e:
            # Best effort: one bad image must not abort the whole run, but keep
            # the reason so the failure is visible afterwards.
            skipped.append((img_path, repr(e)))

        # Progress line on the 1st, 101st, 201st, ... image, whether or not it succeeded.
        if counter % 100 ==1:
            print(len(fileList),"tasks",counter,"done",100*round(counter/float(len(fileList)),3),"%")

if skipped:
    print(len(skipped), "of", len(fileList), "images skipped; first few:")
    for skipped_path, reason in skipped[:5]:
        print("  ", skipped_path, "->", reason)

In [8]:
# embeddings.csv is written without a header row, so columns are numbered 0..n-1.
# Column layout: ticker, profit, filename, <embedding ...>
profit_frame = pd.read_csv(
    'embeddings.csv',
    header=None,
)
print(profit_frame.shape)

In [9]:
# Column 1 is the profit label; everything from column 3 onward is the embedding.
label_column = profit_frame.iloc[:, 1]
embedding_columns = profit_frame.iloc[:, 3:]

X = np.array(embedding_columns)
y = np.array(label_column)

# Peek at the first three rows of each array together with its shape.
print(X[:3], X.shape)
print(y[:3], y.shape)

In [10]:
# Train a small dense classifier on the embeddings and report held-out loss/accuracy.

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# 'balanced' weights are inversely proportional to class frequency in y_train.
# compute_class_weight must be called with keyword arguments on current scikit-learn,
# and Keras documents class_weight as a {class index: weight} dict, so the returned
# array is converted to one.
classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=classes, y=y_train
)
class_weights = {int(label): float(weight) for label, weight in zip(classes, balanced_weights)}

# NOTE: this rebinds `model`, which the embedding cell used for the VGG19 feature
# extractor; from here on `model` is the classifier.
model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit: the output is read as the probability of the positive class.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=50,
          batch_size=128,
          class_weight=class_weights)

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
score = model.evaluate(X_test, y_test, batch_size=128)
print(score)

In [None]:
# Round the sigmoid outputs to whole numbers to get hard predictions, and wrap
# both predictions and ground truth in single-column frames.
test_scores = model.predict(X_test)
rounded_scores = np.around(test_scores, decimals=0)
predicted = pd.DataFrame(rounded_scores)
true = pd.DataFrame(y_test)

In [None]:
# Put predictions and labels side by side, then add one indicator column per
# confusion-matrix cell (each is the product of the two columns or their complements).
compare = pd.concat([predicted, true], axis=1)
compare.columns = ['predicted', 'actual_profit']

pred_col = compare['predicted']
actual_col = compare['actual_profit']
not_pred = 1 - pred_col
not_actual = 1 - actual_col

compare['true positive'] = pred_col * actual_col
compare['true negative'] = not_pred * not_actual
compare['false positive'] = pred_col * not_actual
compare['false negative'] = not_pred * actual_col

display(compare.head(10))

# Column totals, leaving out the first two (the raw prediction and label columns).
column_totals = compare.sum()
print(column_totals[2:])