# Data Preprocessing for PetFinder6000

In [1]:
import pandas as pd
import numpy as np
import re

## Users

In [7]:
users = pd.read_csv('../../data/users_6-10-2023.csv', header=0)

In [4]:
# rename headers
cl_users = users.rename(columns={'A_gender': 'gender',
                              'A_primarycolor': 'primary_color',
                              'A_agegroup': 'age_group',
                              'A_energy': 'energy_level',
                              'A_attention': 'attention_need',
                              'A_sweetspicy': 'personality',
                              'A_firstcat': 'is_first_cat',
                              'A_othercats': 'has_other_cats',
                              'A_otherdogs': 'good_with_other_dogs',
                              'A_kids': 'good_with_kids',
                              'A_employment': 'employment',
                              'A_homeownership': 'home_ownership',
                              'A_allergies': 'has_allergies',
                              'A_adoptionfee': 'agree_to_fee',
                              'createdAt': 'created_at',
                              'updatedAt': 'updated_at',
                              })

In [5]:
# clean multi-select columns with No Preference options (age, color)

def clean_multi_select(row):
    arr = row.split(',')
    if (len(arr) > 1) and ('No preference' in arr):
        arr.remove('No preference')
    return [s.lower() for s in arr]

cl_users['age_group'] = cl_users['age_group'].map(lambda choice: clean_multi_select(choice))
cl_users['primary_color'] = cl_users['primary_color'].map(lambda choice: clean_multi_select(choice))

In [6]:
# split columns with list (age, color)
age_groups = cl_users['age_group'].explode().unique().tolist()
split_age_groups = cl_users['age_group'].map(lambda row: ','.join([str(age in row) for age in age_groups]))
new_age_columns = split_age_groups.str.split(',', expand=True)
new_age_columns = new_age_columns.applymap(lambda val: (val == 'True'))
new_age_columns = new_age_columns.astype('bool')

pattern = re.compile(r'\s|/')
# new_age_columns.columns = [f'age_{pattern.sub("_", age).lower()}' for age in age_groups]
cl_users[[f'age_{pattern.sub("_", age).lower()}' for age in age_groups]] = new_age_columns
# cl_users = cl_users.drop('age_group', axis=1)

color_groups = cl_users['primary_color'].explode().unique().tolist()
split_color_groups = cl_users['primary_color'].map(lambda row: ','.join([str(color in row) for color in color_groups]))
new_color_columns = split_color_groups.str.split(',', expand=True)
new_color_columns = new_color_columns.applymap(lambda val: (val == 'True'))
new_color_columns = new_color_columns.astype('bool')

cl_users[[f'primary_color_{pattern.sub("_", color).lower()}' for color in color_groups]] = new_color_columns
# cl_users = cl_users.drop('primary_color', axis=1)

In [7]:
# convert string fields to lower case (gender, energy_level, attention_need, personality, employment, home_ownership)
cl_users['gender'] = cl_users['gender'].map(lambda val: val.lower())
cl_users['energy_level'] = cl_users['energy_level'].map(lambda val: val.lower())
cl_users['attention_need'] = cl_users['attention_need'].map(lambda val: val.lower())
cl_users['personality'] = cl_users['personality'].map(lambda val: val.lower())
cl_users['employment'] = cl_users['employment'].map(lambda val: val.lower())
cl_users['home_ownership'] = cl_users['home_ownership'].map(lambda val: val.lower())

In [8]:
# # convert int booleans to actual booleans (is_first_cat, has_other_cats, good_with_other_dogs, good_with_kids, has_allergies, agree_to_fee)
cl_users['is_first_cat'] = cl_users['is_first_cat'].map(lambda val: (val == 1))
cl_users['has_other_cats'] = cl_users['has_other_cats'].map(lambda val: (val == 1))
cl_users['good_with_other_dogs'] = cl_users['good_with_other_dogs'].map(lambda val: (val == 1))
cl_users['good_with_kids'] = cl_users['good_with_kids'].map(lambda val: (val == 1))
cl_users['has_allergies'] = cl_users['has_allergies'].map(lambda val: (val == 1))
cl_users['agree_to_fee'] = cl_users['agree_to_fee'].map(lambda val: (val == 1))

In [9]:
# convert types
cl_users = cl_users.astype({'gender': 'category',
                          'energy_level': 'category',
                          'attention_need': 'category',
                          'personality': 'category',
                          'is_first_cat': 'bool',
                          'has_other_cats': 'bool',
                          'good_with_other_dogs': 'bool',
                          'good_with_kids': 'bool',
                          'employment': 'category',
                          'home_ownership': 'category',
                          'has_allergies': 'bool',
                          'agree_to_fee': 'bool',
                          })

cl_users['created_at'] = pd.to_datetime(cl_users['created_at'])
cl_users['updated_at'] = pd.to_datetime(cl_users['updated_at'])

In [9]:
# cl_users.columns

In [10]:
# user_pref = pd.DataFrame()
# user_pref['id'] = cl_users['id']

In [11]:
# user_pref_cols = ['gender', 'primary_color', 'age_group',
#        'energy_level', 'attention_need', 'personality', 'is_first_cat',
#        'has_other_cats', 'good_with_other_dogs', 'good_with_kids',
#        'employment', 'home_ownership', 'has_allergies', 'agree_to_fee']
#
# user_pref['preference'] = cl_users.apply(lambda row: '\n'.join([f'{pref}: {row[pref]}' for pref in user_pref_cols]), axis=1)

In [12]:
# user_pref.head(5)

In [10]:
cl_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   id                           107 non-null    object             
 1   username                     107 non-null    object             
 2   gender                       107 non-null    category           
 3   primary_color                107 non-null    object             
 4   age_group                    107 non-null    object             
 5   energy_level                 107 non-null    category           
 6   attention_need               107 non-null    category           
 7   personality                  107 non-null    category           
 8   is_first_cat                 107 non-null    bool               
 9   has_other_cats               107 non-null    bool               
 10  good_with_other_dogs         107 non-null    bool 

In [11]:
cl_users.head()

Unnamed: 0,id,username,gender,primary_color,age_group,energy_level,attention_need,personality,is_first_cat,has_other_cats,...,age_adult,age_juvenile,age_senior,primary_color_no_preference,primary_color_ginger,primary_color_black,primary_color_white,primary_color_calico_tortie,primary_color_others,primary_color_tabby
0,8b1aa3c3-162f-4717-a9f5-d895ec52f2a4,Chai Chai,no preference,[no preference],[no preference],no preference,no preference,anything is nice,True,False,...,False,False,False,True,False,False,False,False,False,False
1,9b6bed1a-6d10-40eb-b320-7d12ff653ff9,bob the builder,no preference,[ginger],[kitten],chill,independent,anything is nice,True,False,...,False,False,False,False,True,False,False,False,False,False
2,bfd0401f-ab32-4de6-af44-745c940b5f64,Robertaa,male,[no preference],[no preference],chill,independent,anything is nice,True,False,...,False,False,False,True,False,False,False,False,False,False
3,c0d37cae-c1fc-4aca-9135-0e1d4f222a99,Raiken01,no preference,[no preference],[no preference],chill,independent,anything is nice,True,False,...,False,False,False,True,False,False,False,False,False,False
4,4e663992-2176-4a8f-a879-b802ef8d88bb,JeanneTan,no preference,[no preference],[adult],chill,no preference,some spice,True,True,...,True,False,False,True,False,False,False,False,False,False


In [12]:
cl_users.to_csv('../../data/auxiliary/users.csv')

## Cats

In [12]:
cats = pd.read_csv('../../data/cats_6-10-2023.csv', header=0)

In [13]:
# rename headers
cl_cats = cats.rename(columns={'C_gender': 'gender',
                              'C_primarycolor': 'primary_color',
                              'C_agegroup': 'age_group',
                              'C_energy': 'energy_level',
                              'C_attention': 'attention_need',
                              'C_sweetspicy': 'personality',
                              'C_firstcat': 'good_first_cat',
                              'C_othercats': 'good_with_other_cats',
                              'C_otherdogs': 'good_with_other_dogs',
                              'C_kids': 'good_with_kids',
                              'C_employment': 'preferred_employment',
                              'C_homeownership': 'preferred_home_ownership',
                              'C_allergies': 'good_with_allergies',
                              'C_adoptionfee': 'require_fee',
                              'createdAt': 'created_at',
                              'updatedAt': 'updated_at',
                              })

In [14]:
# fill attention and personality columns as neutral
cl_cats['attention_need'] = cl_cats['attention_need'].fillna('neutral')
cl_cats['personality'] = cl_cats['personality'].fillna('neutral')

# fill empty description with no description available
cl_cats['description'] = cl_cats['description'].fillna('no description available')

In [15]:
# map integers in categorical columns
def map_categorical(val):
    if val == 1:
        return 'yes'
    elif val == -1:
        return 'no'
    else:
        return 'neutral'

cl_cats['good_first_cat'] = cl_cats['good_first_cat'].map(lambda val: map_categorical(val))
cl_cats['good_with_other_cats'] = cl_cats['good_with_other_cats'].map(lambda val: map_categorical(val))
cl_cats['good_with_other_dogs'] = cl_cats['good_with_other_dogs'].map(lambda val: map_categorical(val))
cl_cats['good_with_kids'] = cl_cats['good_with_kids'].map(lambda val: map_categorical(val))

In [16]:
# convert string fields to lower case (gender, breed, primary_color, age_group, energy_level, attention_need, personality, preferred_employment, preferred_home_ownership, require_fee)
cl_cats['gender'] = cl_cats['gender'].map(lambda val: val.lower())
cl_cats['breed'] = cl_cats['breed'].map(lambda val: val.lower())
cl_cats['primary_color'] = cl_cats['primary_color'].map(lambda val: val.lower())
cl_cats['age_group'] = cl_cats['age_group'].map(lambda val: val.lower())
cl_cats['energy_level'] = cl_cats['energy_level'].map(lambda val: val.lower())
cl_cats['attention_need'] = cl_cats['attention_need'].map(lambda val: val.lower())
cl_cats['personality'] = cl_cats['personality'].map(lambda val: val.lower())
cl_cats['preferred_employment'] = cl_cats['preferred_employment'].map(lambda val: val.lower())
cl_cats['preferred_home_ownership'] = cl_cats['preferred_home_ownership'].map(lambda val: val.lower())
cl_cats['require_fee'] = cl_cats['require_fee'].map(lambda val: val.lower())

cl_cats['description'] = cl_cats['description'].map(lambda val: val.lower())
cl_cats['details'] = cl_cats['details'].map(lambda val: val.lower())

In [17]:
# convert types
cl_cats = cl_cats.astype({'gender': 'category',
                          'breed': 'category',
                          'primary_color': 'category',
                          'age_group': 'category',
                          'energy_level': 'category',
                          'attention_need': 'category',
                          'personality': 'category',
                          'good_first_cat': 'category',
                          'good_with_other_cats': 'category',
                          'good_with_other_dogs': 'category',
                          'good_with_kids': 'category',
                          'preferred_employment': 'category',
                          'preferred_home_ownership': 'category',
                          'good_with_allergies': 'category',
                          'require_fee': 'category',
                          })

In [18]:
cl_cats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404 entries, 0 to 403
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   id                        404 non-null    object  
 1   cws_id                    404 non-null    object  
 2   image_url                 404 non-null    object  
 3   name                      404 non-null    object  
 4   age_months                404 non-null    int64   
 5   gender                    404 non-null    category
 6   breed                     404 non-null    category
 7   primary_color             404 non-null    category
 8   description               404 non-null    object  
 9   details                   404 non-null    object  
 10  playful                   404 non-null    bool    
 11  active                    404 non-null    bool    
 12  curious                   404 non-null    bool    
 13  talkative                 404 non-null    bool    

In [19]:
cl_cats.head()

Unnamed: 0,id,cws_id,image_url,name,age_months,gender,breed,primary_color,description,details,...,good_first_cat,good_with_other_cats,good_with_other_dogs,good_with_kids,preferred_employment,preferred_home_ownership,good_with_allergies,require_fee,created_at,updated_at
0,bd8c1b25-bb4b-4d0d-ac37-3d6799d885f9,1681491353-164,https://petfinder6000images.s3.ap-southeast-1....,Michael Ming,36,male,domestic short hair,tabby,"shy, spicy","feb 2022\none night late 2021, ming’s ex-owner...",...,neutral,neutral,neutral,neutral,student,parents,no allergies,yes,2023-05-15T02:51:33.203Z,2023-05-15T02:51:33.203Z
1,c822a63a-ad76-4302-af0a-ef17f4ae3680,1681491768-348,https://petfinder6000images.s3.ap-southeast-1....,Muffin,18,male,domestic short hair,others,"loving, sweet, likes to be held",cat available for adoption\n\n\n\nmuffin\nmale...,...,neutral,neutral,neutral,neutral,working full time,owned,no allergies,no preference,2023-05-15T02:51:44.549Z,2023-05-15T02:51:44.549Z
2,feee6a5f-d978-4337-9ba5-dfa1c9faeb46,1681491178-86,https://petfinder6000images.s3.ap-southeast-1....,Chubby and Snowy looking for a new home,108,male,domestic short hair,black,loving,name: chubby and snowy\nage: 9 years old\nbree...,...,neutral,yes,neutral,neutral,working part time,rental with landlord’s consent,no allergies,yes,2023-05-15T02:51:25.990Z,2023-05-15T02:51:25.990Z
3,993c537d-c4e3-4343-854a-b36068506d8e,1681491761-345,https://petfinder6000images.s3.ap-southeast-1....,Tomoe,48,female,domestic short hair,tabby,"loving, sweet",tomoe\n\nage: about 4 years old\n\ngender: fem...,...,neutral,no,neutral,neutral,working full time,owned,no preference,yes,2023-05-15T02:51:44.221Z,2023-05-15T02:51:44.221Z
4,eb11adfa-ba0a-4b83-8989-b78c9ecc1a21,1681491518-238,https://petfinder6000images.s3.ap-southeast-1....,Samantha,48,female,domestic short hair,white,"talkative, loving, sweet",she literally fell into fosterer ‘s hands by c...,...,neutral,no,neutral,neutral,working full time,no preference,no allergies,no preference,2023-05-15T02:51:14.022Z,2023-05-15T02:51:14.022Z


In [16]:
cl_cats.to_csv('../../data/auxiliary/cats.csv')

## Cat Profile Images

In [8]:
import cv2
import tensorflow as tf

In [9]:
new_size = 128
dir = "C:/Users/yongr/Non OneDrive/petfinder6000images/"

In [49]:
def vectorise(cws_id):
    file = f'{dir}cropped_{cws_id}.jpg'

    img = cv2.imread(file)
    try:
        resize = cv2.resize(img, (new_size, new_size))
        resize = cv2.cvtColor(resize,cv2.COLOR_BGR2RGB)
        resize = np.array(resize.tolist())/255.0
        return resize
        # img_flat = resize.reshape(-1)
        # return str(np.array2string(img_flat, precision=2, separator=',', suppress_small=True)).encode('utf-8')
    except:
        print(file)

In [86]:
cat_images = cl_cats.loc[:, ['id', 'cws_id', 'updated_at']]
cat_images['img_shape'] = str([new_size, new_size, 3])
cat_images['img_vector'] = cat_images['cws_id'].map(lambda cws_id: vectorise(cws_id))

In [87]:
img_vectors = cat_images['img_vector'].values

In [88]:
# Using a pretrained model to generate features

##############################
# ### Bespoke CNN1
# cnn1 = Sequential()
# cnn1.add(Conv2D(64, (5, 5), input_shape=INPUT_SHAPE, activation='relu'))
# cnn1.add(MaxPooling2D((2, 2)))
# cnn1.add(Conv2D(32, (3, 3), activation='relu'))
# cnn1.add(MaxPooling2D((2, 2)))
# cnn1.add(Dropout(0.2))
# cnn1.add(Flatten())
# cnn1.add(Dense(64, activation='relu'))
# cnn1.add(Dense(128, activation='relu'))

##############################
#### Importing pretrained MobileNetV2
mnetv2_base = tf.keras.applications.mobilenet_v2.MobileNetV2(input_shape=(new_size,new_size,3), include_top=False, weights='imagenet')

# Freezing layers
for layer in mnetv2_base.layers:
    layer.trainable = False

# ##############################
# ## Importing pretrained ResNet50
# # Performance not as good as MobileNet, and takes longer
# resnet50_base = tf.keras.applications.resnet50.ResNet50(input_shape=INPUT_SHAPE, include_top=False, weights= 'imagenet',
#                                                      pooling='max')
# # Freezing layers
# for layer in resnet50_base.layers:
#     layer.trainable = False


# ##############################
# ## Importing pretrained InceptionV3
# # Performance not as good as MobileNet, and takes longer
# inceptionv3_base = tf.keras.applications.inception_v3.InceptionV3(input_shape=INPUT_SHAPE, include_top=False, weights= 'imagenet',
#                                                      pooling='max')
# # Freezing layers
# for layer in inceptionv3_base.layers:
#     layer.trainable = False


### We can add mode layers to pool and reduce features but the performance won't be as good
# base_model = tf.keras.Sequential([
#   mnetv2_base,
#   keras.layers.GlobalAveragePooling2D(),
#   #keras.layers.Dense(102, activation='sigmoid')
# ])


# defining a function to extract features
def img_feature_extraction(img_vectors, pre_model):

    batch_img = []

    # preprocessing and then using pretrained model to extract features
    for image in img_vectors:
        im_toarray = tf.keras.preprocessing.image.img_to_array(image)

        im_toarray = np.expand_dims(image, axis=0)
        im_toarray = tf.keras.applications.mobilenet.preprocess_input(im_toarray)

        batch_img.append(im_toarray)

    batch_img = np.vstack(batch_img)
    features = pre_model.predict(batch_img, batch_size=64)
    features= features.reshape((len(img_vectors), -1))
    return features

In [89]:
# Extract features
features = img_feature_extraction(img_vectors, mnetv2_base)

features.shape



(404, 20480)

In [90]:
# add the feature vectors to dataframe
feature_vectors = pd.Series(features.tolist(), index = cat_images.index)
cat_images = cat_images.merge(feature_vectors.rename('feature_vectors'), left_index=True, right_index=True)

In [91]:
# stringify vectors
cat_images['feature_vectors']=cat_images['feature_vectors'].astype(pd.StringDtype())

In [92]:
cat_images['img_vector']=cat_images['img_vector'].apply(lambda x: x.reshape(-1).tolist())
cat_images['img_vector']=cat_images['img_vector'].astype(pd.StringDtype())

In [93]:
cat_images.head()

Unnamed: 0,id,cws_id,updated_at,img_shape,img_vector,feature_vectors
0,bd8c1b25-bb4b-4d0d-ac37-3d6799d885f9,1681491353-164,2023-05-15T02:51:33.203Z,"[128, 128, 3]","[-0.9955094194540561, -0.9949865436370626, -0....","[0.0, 2.0021913051605225, 0.0, 0.0, 0.0, 0.0, ..."
1,c822a63a-ad76-4302-af0a-ef17f4ae3680,1681491768-348,2023-05-15T02:51:44.549Z,"[128, 128, 3]","[-0.9921568627450981, -0.9921568627450981, -0....","[0.0, 2.053255796432495, 0.0, 0.0, 0.0, 0.0, 0..."
2,feee6a5f-d978-4337-9ba5-dfa1c9faeb46,1681491178-86,2023-05-15T02:51:25.990Z,"[128, 128, 3]","[-0.9922491349480969, -0.9926489811610919, -0....","[0.0, 1.9501960277557373, 0.0, 0.0, 0.0, 0.0, ..."
3,993c537d-c4e3-4343-854a-b36068506d8e,1681491761-345,2023-05-15T02:51:44.221Z,"[128, 128, 3]","[-0.9921568627450981, -0.9921568627450981, -0....","[0.0, 2.084104299545288, 0.0, 0.0, 0.0, 0.0, 0..."
4,eb11adfa-ba0a-4b83-8989-b78c9ecc1a21,1681491518-238,2023-05-15T02:51:14.022Z,"[128, 128, 3]","[-0.9994463667820069, -0.9993233371780085, -0....","[0.0, 2.0694634914398193, 0.0, 0.0, 0.0, 0.0, ..."


In [94]:
cat_images.loc[0, 'img_vector']

'[-0.9955094194540561, -0.9949865436370626, -0.9947404844290657, -0.995601691657055, -0.9950788158400615, -0.9948327566320646, -0.9955094194540561, -0.9949865436370626, -0.9947404844290657, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955094194540561, -0.9949865436370626, -0.9947404844290657, -0.9955094194540561, -0.9949865436370626, -0.9947404844290657, -0.9955094194540561, -0.9949865436370626, -0.9947404844290657, -0.9955709342560554, -0.9950480584390619, -0.994801999

In [95]:
def cast_object_to_string(data_frame):
    """
    Cast all columns of data_frame of type object to type string and return it.
    Parameters:
        data_frame: A pandas Dataframe
    Returns:
        Data frame
    """
    for label in data_frame.columns:
        if data_frame.dtypes[label] == object:
            data_frame[label] = data_frame[label].astype("str").astype("string")
    return data_frame

cat_images = cast_object_to_string(cat_images)

In [22]:
# cat_images = cat_images.reset_index()

In [96]:
cat_images.dtypes

id                 string[python]
cws_id             string[python]
updated_at         string[python]
img_shape          string[python]
img_vector         string[python]
feature_vectors    string[python]
dtype: object

In [97]:
cat_images.head()

Unnamed: 0,id,cws_id,updated_at,img_shape,img_vector,feature_vectors
0,bd8c1b25-bb4b-4d0d-ac37-3d6799d885f9,1681491353-164,2023-05-15T02:51:33.203Z,"[128, 128, 3]","[-0.9955094194540561, -0.9949865436370626, -0....","[0.0, 2.0021913051605225, 0.0, 0.0, 0.0, 0.0, ..."
1,c822a63a-ad76-4302-af0a-ef17f4ae3680,1681491768-348,2023-05-15T02:51:44.549Z,"[128, 128, 3]","[-0.9921568627450981, -0.9921568627450981, -0....","[0.0, 2.053255796432495, 0.0, 0.0, 0.0, 0.0, 0..."
2,feee6a5f-d978-4337-9ba5-dfa1c9faeb46,1681491178-86,2023-05-15T02:51:25.990Z,"[128, 128, 3]","[-0.9922491349480969, -0.9926489811610919, -0....","[0.0, 1.9501960277557373, 0.0, 0.0, 0.0, 0.0, ..."
3,993c537d-c4e3-4343-854a-b36068506d8e,1681491761-345,2023-05-15T02:51:44.221Z,"[128, 128, 3]","[-0.9921568627450981, -0.9921568627450981, -0....","[0.0, 2.084104299545288, 0.0, 0.0, 0.0, 0.0, 0..."
4,eb11adfa-ba0a-4b83-8989-b78c9ecc1a21,1681491518-238,2023-05-15T02:51:14.022Z,"[128, 128, 3]","[-0.9994463667820069, -0.9993233371780085, -0....","[0.0, 2.0694634914398193, 0.0, 0.0, 0.0, 0.0, ..."


In [98]:
cat_images.loc[0, 'img_vector']

'[-0.9955094194540561, -0.9949865436370626, -0.9947404844290657, -0.995601691657055, -0.9950788158400615, -0.9948327566320646, -0.9955094194540561, -0.9949865436370626, -0.9947404844290657, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955401768550558, -0.9950173010380623, -0.9947712418300654, -0.9955094194540561, -0.9949865436370626, -0.9947404844290657, -0.9955094194540561, -0.9949865436370626, -0.9947404844290657, -0.9955094194540561, -0.9949865436370626, -0.9947404844290657, -0.9955709342560554, -0.9950480584390619, -0.994801999

In [99]:
cat_images.to_pickle('../../data/auxiliary/cat_images.pkl')

## Interactions

In [13]:
interactions = pd.read_csv('../../data/interactions_6-10-2023.csv', header=0)

In [14]:
interactions = interactions.drop('rating', axis=1)

In [15]:
# rename headers
cl_interactions = interactions.rename(columns={
                              'createdAt': 'created_at',
                              'updatedAt': 'updated_at',
                              })

# convert types
cl_interactions['created_at'] = pd.to_datetime(cl_interactions['created_at'])
cl_interactions['updated_at'] = pd.to_datetime(cl_interactions['updated_at'])

In [16]:
cl_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4841 entries, 0 to 4840
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   id             4841 non-null   object             
 1   catID          4841 non-null   object             
 2   userID         4841 non-null   object             
 3   like           4841 non-null   bool               
 4   dwell_time_ms  4841 non-null   int64              
 5   click          4841 non-null   bool               
 6   created_at     4841 non-null   datetime64[ns, UTC]
 7   updated_at     4841 non-null   datetime64[ns, UTC]
dtypes: bool(2), datetime64[ns, UTC](2), int64(1), object(3)
memory usage: 236.5+ KB


In [17]:
cl_interactions.head()

Unnamed: 0,id,catID,userID,like,dwell_time_ms,click,created_at,updated_at
0,a02ad399-01b5-45d3-9529-ba54b0e74e62,a9eebe65-702f-4b34-bc2e-42153078e1d9,0cf2ab4c-91a8-46d5-821c-576de66585c5,False,3578,True,2023-05-17 14:31:39.037000+00:00,2023-05-17 14:31:39.037000+00:00
1,d43b865f-c13d-4ee0-a990-ca8d7184bb8d,3d2e4ecc-d9e0-48c9-b851-ebe8fdd9ede3,c5ab4a57-839d-40c9-ad03-70072da2d078,True,718,False,2023-05-22 15:35:17.324000+00:00,2023-05-22 15:35:17.324000+00:00
2,64fceb4a-9fbf-48e4-9bf0-478d292caf4b,20106676-a088-44e3-b23e-286b6e37f1d5,c5ab4a57-839d-40c9-ad03-70072da2d078,True,783,False,2023-05-22 15:35:39.907000+00:00,2023-05-22 15:35:39.907000+00:00
3,27d6292a-2584-4aaf-b016-c504c484eb54,2b87337b-1179-42c8-b423-c0ec34bb6833,081f358f-9624-468a-b6ef-f9b9fad2b3b3,True,1574,False,2023-05-17 08:44:22.617000+00:00,2023-05-17 08:44:22.617000+00:00
4,5004d90f-1b34-4c4c-8400-3fba142ec079,54197c51-1b76-4854-9349-e2e2d55bd71d,b2fe504c-8b80-4f50-ab07-12b6e7ff8cd2,False,869,True,2023-06-09 11:30:34.791000+00:00,2023-06-09 11:30:34.791000+00:00


## Train/Test Split

In [18]:
from sklearn.model_selection import train_test_split

test_size = 0.2
validation_size = 0.5 # of test size
random_state = 2023

Leave some users out

In [25]:
users = cl_interactions['userID'].unique()

train_users, test_users = train_test_split(users, test_size=test_size, shuffle=True, random_state=random_state)
validation_users, test_users = train_test_split(test_users, test_size=validation_size, shuffle=True, random_state=random_state)

In [26]:
train_set = cl_interactions[cl_interactions['userID'].isin(train_users)]
validation_set = cl_interactions[cl_interactions['userID'].isin(validation_users)]
test_set = cl_interactions[cl_interactions['userID'].isin(test_users)]

In [27]:
print(f'Total number of users: {cl_interactions["userID"].nunique()}')
print(f'Number of users in training: {train_set["userID"].nunique()}')
print(f'Number of users in validation: {validation_set["userID"].nunique()}')
print(f'Number of users in test: {test_set["userID"].nunique()}')

Total number of users: 104
Number of users in training: 83
Number of users in validation: 10
Number of users in test: 11


In [28]:
train_set.to_csv('../../data/output/lsuo_train.csv')
validation_set.to_csv('../../data/output/lsuo_validation.csv')
test_set.to_csv('../../data/output/lsuo_test.csv')

Stratified Split

In [29]:
train_set = cl_interactions.groupby('userID').sample(frac=1-test_size, random_state=random_state)
test_set = cl_interactions.drop(train_set.index)

validation_set = test_set.groupby('userID').sample(frac=validation_size, random_state=random_state)
test_set = test_set.drop(validation_set.index)

In [30]:
print(f'Total number of users: {cl_interactions["userID"].nunique()}')
print(f'Number of users in training: {train_set["userID"].nunique()}')
print(f'Number of users in validation: {validation_set["userID"].nunique()}')
print(f'Number of users in test: {test_set["userID"].nunique()}')

Total number of users: 104
Number of users in training: 104
Number of users in validation: 96
Number of users in test: 100


In [31]:
train_set.to_csv('../../data/output/strat_train.csv')
validation_set.to_csv('../../data/output/strat_validation.csv')
test_set.to_csv('../../data/output/strat_test.csv')

# Test Loading of Data

In [100]:
cat_images_test = pd.read_pickle('../../data/auxiliary/cat_images.pkl')

In [101]:
cat_images_test.head()

Unnamed: 0,id,cws_id,updated_at,img_shape,img_vector,feature_vectors
0,bd8c1b25-bb4b-4d0d-ac37-3d6799d885f9,1681491353-164,2023-05-15T02:51:33.203Z,"[128, 128, 3]","[-0.9955094194540561, -0.9949865436370626, -0....","[0.0, 2.0021913051605225, 0.0, 0.0, 0.0, 0.0, ..."
1,c822a63a-ad76-4302-af0a-ef17f4ae3680,1681491768-348,2023-05-15T02:51:44.549Z,"[128, 128, 3]","[-0.9921568627450981, -0.9921568627450981, -0....","[0.0, 2.053255796432495, 0.0, 0.0, 0.0, 0.0, 0..."
2,feee6a5f-d978-4337-9ba5-dfa1c9faeb46,1681491178-86,2023-05-15T02:51:25.990Z,"[128, 128, 3]","[-0.9922491349480969, -0.9926489811610919, -0....","[0.0, 1.9501960277557373, 0.0, 0.0, 0.0, 0.0, ..."
3,993c537d-c4e3-4343-854a-b36068506d8e,1681491761-345,2023-05-15T02:51:44.221Z,"[128, 128, 3]","[-0.9921568627450981, -0.9921568627450981, -0....","[0.0, 2.084104299545288, 0.0, 0.0, 0.0, 0.0, 0..."
4,eb11adfa-ba0a-4b83-8989-b78c9ecc1a21,1681491518-238,2023-05-15T02:51:14.022Z,"[128, 128, 3]","[-0.9994463667820069, -0.9993233371780085, -0....","[0.0, 2.0694634914398193, 0.0, 0.0, 0.0, 0.0, ..."


In [102]:
cat_images_test.loc[1, ['feature_vectors']].values

<StringArray>
['[0.0, 2.053255796432495, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13576221466064453, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08492588996887207, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.4036872386932373, 1.5392704010009766, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6088454723358154, 0.2833085060119629, 0.8066463470458984, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04968523979187012, 1.331214427947998, 0.5192303657531738, 1.853182315826416, 0.0, 0.0, 0.0, 0.0, 0.241105318069458, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.48163318634033203, 0.0, 0.0, 0.1857280731201172, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.04752516746521, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1201174259185791, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0498645305633545, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0394806861877441, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0419387817382812, 