In [1]:
import os
import re
from ast import literal_eval

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import PIL.Image
import tensorflow as tf

In [2]:
# Look-up table read from user-summary.csv; ImageData reads its
# 'user_id' and 'group' columns to find a user's class.
user_look_up = pd.read_csv("user-summary.csv")

Note to self: automate creation of the site look-up table so it no longer has to be built by calling the function for one hard-coded user.

In [3]:
def create_site_look_up(user_id):
    '''
    Turns the commands file into a table of the websites
    that the simulated user visited and the associated
    suffixes in the image file names

    The rows of the commands file are paired up two at a time
    (rows 0-1, rows 2-3, ...). A pair contributes an entry only when
    its first row is a 'GET' action and its second row is a
    'FULL SCREENSHOT' action: the site is the first row's 'url' and the
    suffix is the second row's 'suffix' with its first character dropped.
    A trailing row without a partner is ignored.

    Naming convention works whether you use the
    screenshot parts or the full image that has
    been stitched together from the parts

    ARGS:
    user_id = used to identify the correct commands file; everything
              after the first five characters is used as the number in
              the file name (e.g. 'user_1000' -> 'commands_u1000_s0.tsv')

    RETURNS:
    Pandas dataframe with the columns 'suffix' and 'site', one row per
    distinct suffix (a later pair with the same suffix overwrites the
    earlier site). Empty, but still with both columns, when no pair
    matches.

    NOTE TO SELF:
    assumes everyone has the same path to the commands file
    (relative to the current working directory)
    come back to file paths
    '''
    commands = pd.read_csv(
        f"users/{user_id}/screenshot_0/commands_u{user_id[5:]}_s0.tsv",
        sep = "\t", header = None, index_col = 0,
    ).reset_index(drop = True)
    commands.columns = ['action', 'site']
    # the second column holds Python dict literals; literal_eval parses
    # them without executing arbitrary code
    commands['site'] = commands.site.apply(literal_eval)

    actions = commands['action'].tolist()
    payloads = commands['site'].tolist()

    suffix_to_site = {}
    # stop at len - 1 so that an unpaired last row can never be indexed
    for first in range(0, len(actions) - 1, 2):
        second = first + 1
        if actions[first] == 'GET' and actions[second] == 'FULL SCREENSHOT':
            # [1:] drops the suffix's first character
            # (presumably a leading underscore -- TODO confirm)
            suffix = payloads[second]['suffix'][1:]
            suffix_to_site[suffix] = payloads[first]['url']

    # explicit columns keep the frame well-formed even when nothing matched
    return pd.DataFrame(list(suffix_to_site.items()), columns = ['suffix', 'site'])

In [4]:
# Build the suffix -> site table for one user. This has to run before
# os.chdir("users"), because create_site_look_up opens a path starting with "users/".
site_look_up = create_site_look_up('user_1000')

In [5]:
# Move into the data directory so that the relative image paths built by
# ImageData (which start with the user id) resolve.
# Guarded so that re-running this cell while already inside "users" is a
# no-op instead of a FileNotFoundError.
if os.path.isdir("users"):
    os.chdir("users")
elif os.path.basename(os.getcwd()) != "users":
    raise FileNotFoundError("expected a 'users' directory in " + os.getcwd())

In [6]:
class ImageData():
    def __init__(self, user_id, fname, grayscale = True, user_look_up = None, site_look_up = None):
        '''
        Class for screenshot data
        Mostly a convenience thing

        Loads one screenshot part from
        <user_id>/screenshot_0/data/screenshots/parts/<fname>
        (relative to the current working directory) and attaches
        metadata taken from the two look up tables.

        ARGS:
        user_id = in the format 'user_####' (there can be any number of digits);
                  everything after the first five characters is read as the
                  integer user id
        fname = image file name; must contain '-_<digits>-part'
        grayscale = boolean value for whether the image should be grayscale
                    (converted to mode 'L') or kept in the file's own mode
        user_look_up = Pandas dataframe that associates user id's with the
                       simulated user's protected class (columns 'user_id'
                       and 'group' are read). When omitted, the module-level
                       variable `user_look_up` is used, looked up at the
                       moment the object is created
        site_look_up = Pandas dataframe that associates the suffix used in the
                       image filename (fname) with the website that was visited
                       (columns 'suffix' and 'site' are read). When omitted,
                       the module-level variable `site_look_up` is used,
                       looked up at the moment the object is created

        ATTRIBUTES SET:
        user, path, grayscale = the inputs / the path that was opened
        show = PIL image (already loaded, the file is closed again)
        img = numpy array of the pixels
        shape = shape of img
        user_class = 'group' value of the matching user_look_up row
        site = 'site' value of the matching site_look_up row

        RAISES:
        IndexError if fname has no '-_<digits>-part' piece or if a look up
        table has no matching row
        KeyError if a table is omitted and no module-level variable with
        that name exists

        NOTE TO SELF:
        Check up on file name stuff

        may want to modify so that you can change back and forth between
        RGB and grayscale? revisit this
        '''
        # Resolve the default tables now rather than when the class is
        # defined, so a table that was rebuilt afterwards is picked up.
        if user_look_up is None:
            user_look_up = globals()['user_look_up']
        if site_look_up is None:
            site_look_up = globals()['site_look_up']

        #metadata
        self.user = user_id
        self.path = os.path.join(user_id, "screenshot_0/data/screenshots/parts", fname)
        self.grayscale = grayscale

        #load in image
        # Open the file once and close it again; convert()/copy() both
        # return an in-memory image that stays usable after the file closes.
        with PIL.Image.open(self.path) as source:
            if self.grayscale:
                self.show = source.convert('L')
            else:
                self.show = source.copy()
        self.img = np.asarray(self.show)

        self.shape = self.img.shape

        #metadata from look up tables
        user_number = int(user_id[5:])
        self.user_class = user_look_up[user_look_up.user_id == user_number]['group'].values[0]
        suffix = self._suffix_from_fname(fname)
        self.site = site_look_up[site_look_up.suffix == suffix]['site'].values[0]


    @staticmethod
    def _suffix_from_fname(fname):
        '''
        Returns the digits between '-_' and '-part' in an image file name
        (the first such occurrence), as a string

        Raises IndexError when the file name has no such piece; the
        exception type is the one the original list indexing produced.
        '''
        found = re.search(r"-_(\d+)-part", fname)
        if found is None:
            raise IndexError(f"no '-_<digits>-part' suffix in file name: {fname!r}")
        return found.group(1)


    def resize(self, w, h):
        '''
        Resizes the image
        Mutates the object's .show, .img & .shape attributes
        Returns None

        ARGS:
        w = width of the image's new shape
        h = height of the image's new shape

        Note that upsampling works -- the
        reshaped image can have larger dimensions
        than the original

        Note that .shape is the numpy shape of the pixel array, so it
        starts with (h, w), the reverse of the argument order here

        NOTE TO SELF:
        previously flagged as not working for non RGB images --
        TODO confirm whether that is still the case
        '''
        self.show = self.show.resize((w, h))
        self.img = np.array(self.show)
        self.shape = self.img.shape
        

In [7]:
# Sample inputs for the test cells below. The '-_36-part' piece of the file
# name is what ImageData's regex extracts as the site suffix ('36').
user_id = 'user_1000'
fname = '283131688007233-f8efc08be8aa3e9c2fe11cbeec2e03f6-_36-part-1-815.png'

In [8]:
# Load the sample screenshot part as grayscale (2-D pixel array)
test = ImageData(user_id, fname, grayscale = True)

In [144]:
# String form of the pixel array (numpy abbreviates the middle with '...')
str(test.img)

'[[ 37  37  37 ... 243 243 243]\n [ 37  37  37 ... 243 243 243]\n [ 37  37  37 ... 243 243 243]\n ...\n [255 255 255 ... 243 243 243]\n [255 255 255 ... 243 243 243]\n [255 255 255 ... 243 243 243]]'

In [117]:
# resize takes (width, height) and mutates `test` in place, so cells that
# read `test` give different results before and after this one has run
test.resize(683, 1366)

In [119]:
# numpy shape is (height, width): the reverse of the resize(w, h) argument order
test.shape

(1366, 683)

In [127]:
# Same file without the grayscale conversion (keeps the file's own channels)
test2 = ImageData(user_id, fname, grayscale = False)

In [129]:
# Pixel array with 4 values per pixel (presumably RGBA -- TODO confirm)
test2.img

array([[[  0,  36, 139, 255],
        [  0,  36, 139, 255],
        [  0,  36, 139, 255],
        ...,
        [243, 243, 243, 255],
        [243, 243, 243, 255],
        [243, 243, 243, 255]],

       [[  0,  36, 139, 255],
        [  0,  36, 139, 255],
        [  0,  36, 139, 255],
        ...,
        [243, 243, 243, 255],
        [243, 243, 243, 255],
        [243, 243, 243, 255]],

       [[  0,  36, 139, 255],
        [  0,  36, 139, 255],
        [  0,  36, 139, 255],
        ...,
        [243, 243, 243, 255],
        [243, 243, 243, 255],
        [243, 243, 243, 255]],

       ...,

       [[255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        ...,
        [243, 243, 243, 255],
        [243, 243, 243, 255],
        [243, 243, 243, 255]],

       [[255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        ...,
        [243, 243, 243, 255],
        [243, 243, 243, 255],
        [243, 243, 243, 255]],

       [[255

In [36]:
# NOTE(review): the recorded output is 3-D with 4 channels, so `test` must have
# held a non-grayscale image when this ran -- the execution counts are out of
# order; re-run from a fresh kernel before trusting this output
test.img.shape

(642, 1366, 4)

In [78]:
# PIL's resize takes (width, height); the resulting numpy shape starts with
# (height, width). Does not modify `test` (the resized image is not stored).
np.array(test.show.resize((7, 1000))).shape

(1000, 7, 4)

In [None]:
# not all images are the same size

In [29]:
# Load every file ending in 'png' for user_1000 as a grayscale ImageData.
# os.listdir returns names in arbitrary order, so sort them to make the
# order of `imgs` (and therefore imgs[0]) reproducible between runs.
files = sorted(os.listdir("user_1000/screenshot_0/data/screenshots/parts"))
imgs = [ImageData('user_1000', f) for f in files if f.endswith('png')]

In [30]:
# Pixel-array shape of every loaded image (despite the name, these are shapes, not lengths)
lens = [loaded.img.shape for loaded in imgs]

In [31]:
# Count how many images have each shape
pd.Series(lens).value_counts()

(642, 1366)    210
(683, 1366)    202
dtype: int64

In [32]:
# Pixel array of the first loaded image (first according to the order of `files`)
imgs[0].img

array([[228, 228, 228, ..., 228, 228, 228],
       [228, 228, 228, ..., 228, 228, 228],
       [228, 228, 228, ..., 228, 228, 228],
       ...,
       [228, 228, 228, ..., 228, 228, 228],
       [228, 228, 228, ..., 228, 228, 228],
       [228, 228, 228, ..., 228, 228, 228]], dtype=uint8)