In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Set working directory.
# The project folder can be overridden through the W207_PROJECT_DIR environment
# variable so the notebook also runs on other machines; when the variable is
# unset, the original location is used.
PROJECT_DIR = os.environ.get(
    'W207_PROJECT_DIR',
    'C:/Users/user/Documents/Berkeley_MIDS/W207_Machine_Learning/w207_final_project',
)
os.chdir(PROJECT_DIR)

In [3]:
# Load first npz file as an example and list the arrays it contains.
# The archive is intentionally left open: np.load returns a lazily-read NpzFile,
# and 'arr_0' / 'arr_1' are still read from it further down. (The former bare
# `data_1.close` only referenced the method without calling it, so it never
# closed anything.)
data_1 = np.load('et_w207_project_npz_files_5000_tmp_tmp5ocrhjnn.npz')
data_1.files

['arr_0', 'arr_1']

In [4]:
# Peek at the first five entries of arr_0
data_1['arr_0'][0:5]

array(['ozbpir-9ey6js-ggdqwo', '378ykanq', 'p366md-big834-7o23k6',
       '3zruwvl2', 'q18iae-3vnh74-79npmy'], dtype='<U20')

In [5]:
# Peek at the first five rows of arr_1
data_1['arr_1'][0:5]

array([[-103.939   , -116.779   , -123.68    , ..., -103.939   ,
        -116.779   , -123.68    ],
       [ -46.939003,  -74.779   , -118.68    , ...,  -57.939003,
         -84.779   , -118.68    ],
       [ 151.061   ,  138.22101 ,  131.32    , ...,  151.061   ,
         138.22101 ,  131.32    ],
       [  -3.939003,  -66.779   ,  -96.68    , ...,   -3.939003,
         -66.779   ,  -96.68    ],
       [-103.939   , -116.779   , -123.68    , ..., -103.939   ,
        -116.779   , -123.68    ]], dtype=float32)

In [6]:
# Load the remaining data files and list the arrays each one contains.
# The archives are kept open on purpose: np.load returns lazily-read NpzFile
# objects whose arrays are still accessed further down. (The former bare
# `.close` expressions never called the method, so they were no-ops.)
data_2 = np.load('et_w207_project_npz_files_5000_tmp_tmp092sag67.npz')
print(data_2.files)

data_3 = np.load('et_w207_project_npz_files_5000_tmp_tmpf31_pn8p.npz')
print(data_3.files)

data_4 = np.load('et_w207_project_npz_files_5000_tmp_tmpq5b2g4n2.npz')
print(data_4.files)

['arr_0', 'arr_1']
['arr_0', 'arr_1']
['arr_0', 'arr_1']


In [7]:
# Report the shape of both arrays stored in the first file, one per line
print(*(data_1[key].shape for key in ('arr_0', 'arr_1')), sep='\n')

(878,)
(878, 150528)


In [8]:
# Report the shape of both arrays stored in the second file, one per line
print(*(data_2[key].shape for key in ('arr_0', 'arr_1')), sep='\n')

(893,)
(893, 150528)


In [9]:
# Report the shape of both arrays stored in the third file, one per line
print(*(data_3[key].shape for key in ('arr_0', 'arr_1')), sep='\n')

(638,)
(638, 150528)


In [10]:
# Report the shape of both arrays stored in the fourth file, one per line
print(*(data_4[key].shape for key in ('arr_0', 'arr_1')), sep='\n')

(1353,)
(1353, 150528)


In [11]:
# Join the arr_0 entries (attachment keys) of all four files into one array
data_arr0_all = np.concatenate(
    [archive['arr_0'] for archive in (data_1, data_2, data_3, data_4)]
)
data_arr0_all.shape

(3762,)

In [12]:
# Join the arr_1 rows (pixel vectors) of all four files, in the same file order
data_arr1_all = np.concatenate(
    [archive['arr_1'] for archive in (data_1, data_2, data_3, data_4)]
)
data_arr1_all.shape

(3762, 150528)

In [13]:
# Build a one-column DataFrame of pixel vectors indexed by attachment key.
#
# Each arr0 entry (attachment key) is mapped to a single-element list holding
# the matching arr1 row (pixel array) divided by 255.
# NOTE(review): the input rows contain negative values, so the scaled result is
# not confined to [0, 1] - confirm the intended normalisation.
# A key that occurs more than once keeps only its last pixel vector.
data_dict = {
    key: [data_arr1_all[row] / 255]
    for row, key in enumerate(data_arr0_all)
}

# The dict yields one column per key; transpose so that every key becomes a row
df_pixel = pd.DataFrame(data_dict)
df_pixel_t = df_pixel.T
df_pixel_t.index.name = 'attachment_key'
df_pixel_t.columns = ['pixel_array']
df_pixel_t

Unnamed: 0_level_0,pixel_array
attachment_key,Unnamed: 1_level_1
ozbpir-9ey6js-ggdqwo,"[-0.40760392, -0.45795685, -0.4850196, -0.4076..."
378ykanq,"[-0.18407452, -0.29325098, -0.46541175, -0.097..."
p366md-big834-7o23k6,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
3zruwvl2,"[-0.015447071, -0.26187843, -0.37913725, -0.01..."
q18iae-3vnh74-79npmy,"[-0.40760392, -0.45795685, -0.4850196, -0.4076..."
...,...
pktyig-481wyg-6gyulf,"[-0.3605451, -0.41089803, -0.38305882, -0.3644..."
zu7y3rha,"[-0.007603933, -0.05795686, -0.08501961, -0.00..."
pxcwxf-8tnpyg-8vhb9r,"[0.3022, 0.18125883, 0.12674509, 0.31004313, 0..."
p4hakf-fsskmw-1224up,"[-0.40368235, -0.45403528, -0.48109803, -0.387..."


In [14]:
# Load the per-image metadata table from its CSV file and preview the first rows
df = pd.read_csv('school_project_data_set-query.csv')
df.head(5)

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,filename,logo
0,q36z0t-gf0ta0-91uq4d,5.256976,3351.0,2530.0,1.324506,WVC021819TH-5756.jpg,0
1,p0td2i-8fv5ew-2c7suq,0.688739,3420.0,1365.0,2.505495,Lastricato_CastelloDelTrebbio.jpg,0
2,q6p0vh-6wed5k-3c8m2s,11.636952,6240.0,4160.0,1.5,WVC2Port2020-9684.jpg,0
3,pg1c3o-fqyk3c-72hr82,4.161787,3257.0,4885.0,0.666735,EB4A1971.jpg,0
4,q52mox-a2mcsg-4ir06w,4.081567,3738.0,5600.0,0.6675,934879096.jpg,0


In [15]:
# Merge the metadata with the pixel vectors by attachment key.
# `attachment_key` is a column of df and the index name of df_pixel_t; the
# default inner join keeps only keys that appear in both frames.
# The result is stored (previously it was only displayed and then discarded)
# so that it can be reused for modelling.
df_merged = df.merge(df_pixel_t, on='attachment_key')
df_merged

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,filename,logo,pixel_array
0,q4zhny-2mxzu8-fz6s49,2.423621,4500.0,4500.0,1.000000,Ska-HazyIPA-can-White.jpg,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
1,pgeoak-9l272o-5dah6f,0.014292,137.0,327.0,0.418960,Screen Shot 2018-10-10 at 7.07.12 PM.png,0,"[0.5492588, 0.5224353, 0.49145097, 0.5492588, ..."
2,pgeoak-9l272o-5dah6f,0.014292,137.0,327.0,0.418960,Screen Shot 2018-10-10 at 7.07.12 PM.png,1,"[0.5492588, 0.5224353, 0.49145097, 0.5492588, ..."
3,vd92yz5a,0.109244,716.0,1428.0,0.501401,a pha_logo.jpg,1,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
4,pu19n0-ajc888-actmf5,0.027520,1251.0,1250.0,1.000800,AXON - Icon Family - July 2019-03.png,1,"[-0.40760392, -0.45795685, -0.4850196, -0.4076..."
...,...,...,...,...,...,...,...,...
4422,px1yxs-mfwvk-az49zm,0.045961,333.0,500.0,0.666000,zoom_se_rts400_574634_a_15a.jpg,0,"[0.36886665, 0.3694941, 0.39733332, 0.36886665..."
4423,q6mxyb-ferzig-bai2df,0.220249,352.0,500.0,0.704000,zoom_se_etsec150_576329_sod_01b_US.jpg,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
4424,olffz6-6619ps-53fw6l,0.006069,162.0,500.0,0.324000,web-logo-run-gum-energy-gum.png,1,"[-0.40760392, -0.45795685, -0.4850196, -0.4076..."
4425,px1yxs-mfwvk-174b2s,0.071568,540.0,500.0,1.080000,zoom_fr_sys_498889_z_01a.jpg,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."


In [None]:
# STILL WORKING
# Split into train and test data
# Development data?
#
# X and y were previously undefined, so this cell raised NameError on a fresh
# kernel. They are now derived from the metadata / pixel merge.
# NOTE(review): assumes the pixel vectors are the features and the binary
# `logo` column is the target - confirm before relying on this.
# TODO: some attachment keys appear more than once in df with differing `logo`
# values; decide how to handle them so the same image cannot land in both splits.
model_data = df.merge(df_pixel_t, on='attachment_key')
X = np.stack(model_data['pixel_array'].to_numpy())  # 2-D: one row per sample
y = model_data['logo'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                    random_state=1, stratify=y)

In [None]:
# Standardize