In [2]:
### Important modules ###
from __future__ import absolute_import, division, print_function

import operator
import os

import matplotlib.pylab as plt #No module named matplotlib.pylab.. sudo apt-get install python-matplotlib
import matplotlib.pyplot as plt2
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
import tensorflow as tf
# Order matters for the next two lines: the later `import keras` is the binding that wins.
from tensorflow import keras
import keras
import tensorflow_hub as hub # No module named tensorflow_hub -- pip install tensorflow-hub

from keras.applications import resnet50
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.imagenet_utils import decode_predictions
from tensorflow.keras import layers
import tensorflow.keras.backend as K
import PIL.Image as Image

Using TensorFlow backend.


In [None]:
### We classify users as personal or non-personal based on their marijuana conversations.
### A pretrained image classification model (ResNet50) is used to retrieve the elements
### it recognizes in each user's profile picture.

### ResNet is a deep learning image classification model; the weights loaded here are the ImageNet ones.
### It produces keywords for the elements it recognizes in the image, along with probabilities.
### We keep the top 20 keywords for each profile picture.

# Load the ResNet50 model
resnet_model = resnet50.ResNet50(weights='imagenet')

# Neither os.listdir() nor load_img() expands "~", so resolve the home directory up front.
path_marujana_imgs = os.path.expanduser("~/marujana_imgs")

# Maps a running integer index -> {'User Type', 'Username', 'Label'} for every image
# that was classified successfully.
imgInfo = {}

# Sorted so that the row order (and therefore any later seeded split) is reproducible.
for filename in sorted(os.listdir(path_marujana_imgs)):
    if not filename.endswith(".jpg"):
        continue

    # The first dot-separated field is used as the username and the second as the
    # user type (e.g. "alice.P.jpg" -> username "alice", type "P").
    name_parts = filename.split('.')
    userName = name_parts[0]
    userType = name_parts[1]

    try:
        # listdir() yields bare file names; join them with the directory so the image
        # is found regardless of the current working directory.
        image_path = os.path.join(path_marujana_imgs, filename)
        original = load_img(image_path, target_size=(224, 224))
        numpy_image = img_to_array(original)
        # The network expects a batch axis: (height, width, channels) -> (1, height, width, channels).
        image_batch = np.expand_dims(numpy_image, axis=0)
        processed_image = resnet50.preprocess_input(image_batch.copy())

        predictions = resnet_model.predict(processed_image)
        # decode_predictions returns one list per image in the batch, each holding the
        # top-N (class_id, class_name, score) tuples.
        label = decode_predictions(predictions, top=20)
        print(label)
        imgInfo[len(imgInfo)] = {'User Type': userType, 'Username': userName, 'Label': label}

    except Exception as exc:
        # Best effort: skip unreadable images, but say which one failed and why instead
        # of hiding every failure behind an anonymous "Error".
        print("Error processing {}: {}".format(filename, exc))

In [None]:
### Google's pre-trained Word2Vec model supplies the numerical embedding vector for each
### keyword that represents a profile picture.

import gensim
import logging

# Timestamped INFO-level logging; setup taken from http://rare-technologies.com/word2vec-tutorial/
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Load the GoogleNews vectors from their binary word2vec file.
# (An earlier, now-removed variant of this line called the same loader on
# gensim.models.Word2Vec instead of gensim.models.KeyedVectors.)
model = gensim.models.KeyedVectors.load_word2vec_format(
    '~/path/to/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
## Turn each image's predicted keywords into one feature vector by averaging the
## keywords' Word2Vec embeddings, then build the feature matrix X and label list y.
## NOTE: the column keeps its historical name 'Average Probabilities', but what it holds
## is the averaged embedding vector, not a probability.

# User-type codes (second field of the image file name) for each class.
NON_PERSONAL_TYPES = ['I', 'I1', 'R', 'R1', 'NA', 'NA1']
PERSONAL_TYPES = ['P', 'P1']


def _mean_keyword_vector(decoded, embeddings, top_n=20):
    """Average the embeddings of the top keywords predicted for one image.

    decoded    -- output of decode_predictions for a single-image batch: a list whose
                  only element is the list of (class_id, class_name, score) tuples.
    embeddings -- word-vector lookup supporting `word in embeddings`,
                  `embeddings[word]` and `.vector_size` (the loaded Word2Vec vectors).
    top_n      -- number of leading keywords to consider.

    Keywords missing from the vocabulary are skipped and the mean is taken over the
    keywords that were found; if none are found a zero vector is returned.
    """
    # decoded[0] selects the single image of the batch; element 1 of each tuple is the keyword.
    keywords = [entry[1] for entry in decoded[0][:top_n]]
    vectors = [embeddings[word] for word in keywords if word in embeddings]
    if not vectors:
        return np.zeros(embeddings.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)


if not imgInfo:
    raise ValueError("imgInfo is empty: no image was classified, so there is nothing to embed")

imgInfoDf = pd.DataFrame(imgInfo).T

### One averaged embedding vector per image
avgProbabilities = [_mean_keyword_vector(label, model) for label in imgInfoDf['Label']]
imgInfoDf['Average Probabilities'] = avgProbabilities

is_non_personal = imgInfoDf['User Type'].isin(NON_PERSONAL_TYPES)
is_personal = imgInfoDf['User Type'].isin(PERSONAL_TYPES)
is_labelled = is_non_personal | is_personal

## np_users: non-personal
## p_users: personal
np_users = imgInfoDf.loc[is_non_personal, 'Average Probabilities']
p_users = imgInfoDf[is_personal]

# Build X and y from the same rows in the same order, so every feature vector stays
# paired with its own label; rows with an unrecognised user type are left out of both.
X = np.vstack(imgInfoDf.loc[is_labelled, 'Average Probabilities'].tolist())
y = is_personal[is_labelled].astype(int).tolist()  # 0 = non-personal, 1 = personal

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.30,
)

In [None]:
# Logistic-regression classifier with default hyperparameters (fitted in the next cell).
# NOTE: this rebinds `model`, which until now referred to the Word2Vec vectors; reload
# those before re-running the embedding cell.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

In [None]:
# Fit on the training split, then predict a class for every held-out sample
# (fit() returns the fitted estimator, so the two steps can be chained).
predictions = model.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out set. Only `classification_report`
# is imported, so call it directly; `metrics.classification_report` is a NameError.
print(classification_report(y_test, predictions))