In [None]:
import os
import re
from ast import literal_eval

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

<a href = "https://towardsdatascience.com/exploring-image-processing-techniques-opencv-4860006a243">good primer on CV2</a>

In [None]:
# Per-user summary table. ImageData (defined below) uses it as the default
# user_look_up and reads its 'user_id' and 'group' columns.
user_look_up = pd.read_csv("../../files/user-summary.csv")

**TODO (note to self):** turn the creation of the site look-up table into an automated process.

In [None]:
def create_site_look_up(user_id, base_dir = "users"):
    '''
    Turns the commands file into a table of the websites
    that the simulated user visited and the associated
    suffixes in the image file names
    
    The commands file is read as consecutive pairs of rows
    (rows 0-1, rows 2-3, ...). A pair only contributes an entry 
    when its first row is a 'GET' and its second row is a 
    'FULL SCREENSHOT'. A trailing row without a partner is ignored.
    
    Naming convention works whether you use the 
    screenshot parts or the full image that has 
    been stitched together from the parts
    
    ARGS:
    user_id = in the format 'user_####', used to identify the correct 
              commands file (everything after the first five characters 
              is used as the number in the file name)
    base_dir = optional, directory that holds the per-user folders 
               (defaults to 'users', relative to the working directory)
    
    RETURNS:
    Pandas dataframe with the columns 'suffix' (the screenshot suffix 
    without its first character) and 'site' (the url that was requested). 
    The dataframe is empty, but still has both columns, when no 
    GET / FULL SCREENSHOT pair is found.
    
    NOTE TO SELF: 
    assumes everyone has the same layout below base_dir
    (<user_id>/screenshot_0/commands_u####_s0.tsv)
    
    hypothetically this only needs to be run once, 
    since all users should have the same file structure
    '''
    commands_path = os.path.join(base_dir, user_id, "screenshot_0", f"commands_u{user_id[5:]}_s0.tsv")
    commands = pd.read_csv(commands_path, sep = "\t", header = None, index_col = 0).reset_index(drop = True)
    commands.columns = ['action', 'site']
    # the second column holds Python-literal dicts stored as text
    commands['site'] = commands.site.apply(literal_eval)
    
    actions = commands['action'].tolist()
    details = commands['site'].tolist()
    
    suffix_to_site = {}
    # stopping at len - 1 keeps an odd number of rows from indexing past the end
    for first in range(0, len(actions) - 1, 2):
        if actions[first] == 'GET' and actions[first + 1] == 'FULL SCREENSHOT':
            # [1:] drops the first character of the stored suffix
            suffix = details[first + 1]['suffix'][1:]
            suffix_to_site[suffix] = details[first]['url']
            
    # built from the items so that an empty mapping still yields both columns
    site_look_up = pd.DataFrame(list(suffix_to_site.items()), columns = ['suffix', 'site'])
    
    return site_look_up

In [None]:
# Build the suffix -> site table from user_1000's commands file.
# NOTE(review): the next cell re-assigns site_look_up from sites.csv, so this
# result is replaced when the notebook is run top to bottom.
site_look_up = create_site_look_up('user_1000')

In [None]:
# Load the suffix -> site table from disk. ImageData (defined below) uses it
# as the default site_look_up and reads its 'suffix' and 'site' columns.
# NOTE(review): presumably a saved copy of create_site_look_up's output - confirm.
site_look_up = pd.read_csv("../../files/sites.csv")

In [None]:
class ImageData():
    def __init__(self, fname, colorway = cv2.COLOR_BGR2HLS, path = '', user_look_up = user_look_up, site_look_up = site_look_up):
        '''
        Class for screenshot data
        Mostly a convenience thing
        
        ARGS: 
        fname = image file name in the format 'user_####_##.png': the user id 
                is the second "_"-separated field and the site suffix is the 
                last one (the file extension is ignored)
        colorway = cv2 color conversion code applied to the loaded image
                   (defaults to cv2.COLOR_BGR2HLS)
        path = optional, if images are in another directory than current;
               it is prepended to fname as-is, so include the trailing slash
        user_look_up = Pandas dataframe that associates user id's with the 
                       simulated user's protected class 
                       (needs the columns 'user_id' and 'group')
        site_look_up = Pandas dataframe that associates the suffix used in the 
                       image filename (fname) with the website that was visited
                       (needs the columns 'suffix' and 'site'; the suffixes may 
                       be stored as numbers or as numeric strings)
        
        ATTRIBUTES:
        user = user id taken from fname (string)
        img = the loaded image after the colorway conversion
        shape = shape of img
        user_class = 'group' value of the first user_look_up row for this user
        site = 'site' value of the first site_look_up row for this suffix
        
        RAISES:
        OSError if the image cannot be read
        IndexError if the user id or the suffix is not in its look-up table
        
        NOTE: 
        the look-up defaults are the notebook-level user_look_up / site_look_up 
        objects as they were when this class was defined; re-run this cell 
        after reloading those tables
        '''
        # splitext instead of slicing off four characters, so the extension length does not matter
        name_parts = os.path.splitext(fname)[0].split("_")
        self.user = name_parts[1]
        
        # cv2.imread returns None instead of raising when the file cannot be read
        raw_img = cv2.imread(path + fname)
        if raw_img is None:
            raise OSError(f"could not read image file: {path + fname}")
        self.img = cv2.cvtColor(raw_img, colorway)
        self.shape = self.img.shape
        
        user_rows = user_look_up[user_look_up.user_id == int(self.user)]
        if user_rows.empty:
            raise IndexError(f"user id {self.user} not found in user_look_up")
        self.user_class = user_rows['group'].values[0]
        
        # compare numerically so that both integer suffixes (table read from csv) and
        # string suffixes (table built by create_site_look_up) can match
        suffix = int(name_parts[-1])
        known_suffixes = pd.to_numeric(site_look_up.suffix, errors = 'coerce')
        site_rows = site_look_up[known_suffixes == suffix]
        if site_rows.empty:
            raise IndexError(f"suffix {suffix} not found in site_look_up")
        self.site = site_rows['site'].values[0]
        

    def resize(self, w, h):
        '''
        Resizes the image in place and updates self.shape
        Returns None
        
        ARGS:
        w = width of the image's new shape
        h = height of the image's new shape
        
        '''
        # cv2.resize takes the target size as (width, height)
        self.img = cv2.resize(self.img, (w, h), interpolation = cv2.INTER_AREA)
        self.shape = self.img.shape
        

In [None]:
# Load one sample screenshot (user 703, suffix 12) with the default colorway
# and the default look-up tables.
# NOTE(review): the path is an absolute, machine-specific location - TODO make it configurable.
w = ImageData('user_703_12.png', path = '/Users/haleyjohnson/Desktop/sample_screenshots/')

In [None]:
# Quick visual check of the sample screenshot loaded above.
# matplotlib is imported in the import cell at the top of the notebook.
# NOTE(review): w was created with the default colorway (cv2.COLOR_BGR2HLS) and
# imshow draws 3-channel arrays as RGB, so this is a false-color view of the
# HLS channels - confirm that this is the intended view.
fig, ax = plt.subplots()
ax.imshow(w.img)
ax.set_title(f"user {w.user} ({w.user_class}): {w.site}");