In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import tensorflow as tf


In [3]:
# Set working directory
os.chdir('/Users/alyssaaugsburger/Documents/Berkeley_MIDS/W207/Final_Project')

In [4]:
# Load first npz file as an example and get files
data_1 = np.load('et_w207_project_npz_files_5000_tmp_tmp5ocrhjnn.npz')
data_1.close
data_1.files

['arr_0', 'arr_1']

In [5]:
# Show file arr_0 example
data_1['arr_0'][:5]

array(['ozbpir-9ey6js-ggdqwo', '378ykanq', 'p366md-big834-7o23k6',
       '3zruwvl2', 'q18iae-3vnh74-79npmy'], dtype='<U20')

In [6]:
# Show file arr_1 example
data_1['arr_1'][:5]

array([[-103.939   , -116.779   , -123.68    , ..., -103.939   ,
        -116.779   , -123.68    ],
       [ -46.939003,  -74.779   , -118.68    , ...,  -57.939003,
         -84.779   , -118.68    ],
       [ 151.061   ,  138.22101 ,  131.32    , ...,  151.061   ,
         138.22101 ,  131.32    ],
       [  -3.939003,  -66.779   ,  -96.68    , ...,   -3.939003,
         -66.779   ,  -96.68    ],
       [-103.939   , -116.779   , -123.68    , ..., -103.939   ,
        -116.779   , -123.68    ]], dtype=float32)

In [7]:
# Load all data files
data_2 = np.load('et_w207_project_npz_files_5000_tmp_tmp092sag67.npz')
data_2.close
print(data_2.files)

data_3 = np.load('et_w207_project_npz_files_5000_tmp_tmpf31_pn8p.npz')
data_3.close
print(data_3.files)

data_4 = np.load('et_w207_project_npz_files_5000_tmp_tmpq5b2g4n2.npz')
data_4.close
print(data_4.files)

['arr_0', 'arr_1']
['arr_0', 'arr_1']
['arr_0', 'arr_1']


In [8]:
# Get size of each file
print(data_1['arr_0'].shape)
print(data_1['arr_1'].shape)

(878,)
(878, 150528)


In [9]:
print(data_2['arr_0'].shape)
print(data_2['arr_1'].shape)

(893,)
(893, 150528)


In [10]:
print(data_3['arr_0'].shape)
print(data_3['arr_1'].shape)

(638,)
(638, 150528)


In [11]:
print(data_4['arr_0'].shape)
print(data_4['arr_1'].shape)

(1353,)
(1353, 150528)


In [12]:
# Concatenate image vectors from all files
data_arr0_all = np.concatenate((data_1['arr_0'], data_2['arr_0'], data_3['arr_0'], data_4['arr_0']))
data_arr0_all.shape

(3762,)

In [13]:
data_arr1_all = np.concatenate((data_1['arr_1'], data_2['arr_1'], data_3['arr_1'], data_4['arr_1']))
data_arr1_all.shape

(3762, 150528)

In [14]:
# Store pixel vectors as DF with attachment key

# Create key value pairs with arr0 (attachment key) and standardized arr1 (pixel array)
# Standardize pixel values between 0 and 1 by dividing by 255
data_dict = {}
for i in range(data_arr0_all.shape[0]):
    data_dict[data_arr0_all[i]] = [data_arr1_all[i]/255]

df_pixel = pd.DataFrame(data=data_dict)
df_pixel_t = df_pixel.transpose()
df_pixel_t.index.names = ['attachment_key']
df_pixel_t.columns = ['pixel_array']
df_pixel_t

Unnamed: 0_level_0,pixel_array
attachment_key,Unnamed: 1_level_1
ozbpir-9ey6js-ggdqwo,"[-0.40760392, -0.45795685, -0.4850196, -0.4076..."
378ykanq,"[-0.18407452, -0.29325098, -0.46541175, -0.097..."
p366md-big834-7o23k6,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
3zruwvl2,"[-0.015447071, -0.26187843, -0.37913725, -0.01..."
q18iae-3vnh74-79npmy,"[-0.40760392, -0.45795685, -0.4850196, -0.4076..."
q0nyct-7525ow-3ur3ii,"[0.5767098, 0.5263569, 0.4992941, 0.49043527, ..."
7d8atetr,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
ppfy71-bfbnm8-emjouu,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
pe7vgl-2vda1s-fo18uc,"[-0.40760392, -0.45795685, -0.4850196, -0.4076..."
q7clmn-75pmlk-9pzb8r,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."


In [15]:
# Load excel data
df = pd.read_csv('school_project_data_set-query.csv')
df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,filename,logo
0,001lq61k,0.12252,514.0,720.0,0.713889,campus ministry on the beach.jpg,0
1,002dlirq,3.256944,4072.0,3868.0,1.052741,BIG ALS FISH FLASH UV VP PL.jpg,0
2,003qefgm,0.002522,80.0,200.0,0.4,netix_email_studio.png,1
3,0083okjl,0.116802,600.0,600.0,1.0,productshot2.jpg,0
4,00a957mh,0.140714,816.0,2382.0,0.342569,divine_medical_billing_inc_master.png,1


In [16]:
# Merge dataframes by attachment key
combined_df = df.merge(df_pixel_t, on='attachment_key')

In [17]:
combined_df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,filename,logo,pixel_array
0,00xjny6u,0.011849,520.0,506.0,1.027668,Esterdale Theatre - Logo.png,1,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
1,04o31jop,0.01261,269.0,396.0,0.679293,BLUE_LOGO.png,1,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
2,0840h7ox,0.399114,518.0,920.0,0.563044,ZEN_LB-5.jpg,0,"[0.106121555, 0.05576863, 0.052235294, 0.09043..."
3,0awudx2h,0.011226,200.0,200.0,1.0,article central 200.jpg,1,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
4,0bb8y85h,0.263813,885.0,800.0,1.10625,1116-100_WGTank_white.jpg,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."


In [104]:
combined_df = combined_df[['attachment_key', 'size_mb', 'height', 'width', 'h_to_w',  'pixel_array', 'filename','logo']]
combined_df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,pixel_array,filename,logo
0,00xjny6u,0.011849,520.0,506.0,1.027668,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",Esterdale Theatre - Logo.png,1
1,04o31jop,0.01261,269.0,396.0,0.679293,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",BLUE_LOGO.png,1
2,0840h7ox,0.399114,518.0,920.0,0.563044,"[0.106121555, 0.05576863, 0.052235294, 0.09043...",ZEN_LB-5.jpg,0
3,0awudx2h,0.011226,200.0,200.0,1.0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",article central 200.jpg,1
4,0bb8y85h,0.263813,885.0,800.0,1.10625,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",1116-100_WGTank_white.jpg,0


In [162]:
file_types = []
for item in combined_df['filename']:
    file_types.append(item[-3:].lower())

print(set(file_types))

#File types should be png, jpg or other
#Need to one hot encode these



{'ack', 'hot', 'lue', 'ite', '73e', 'png', 'jpg', 'pig', 'jpe', '0mm', 'peg', 'age'}


In [163]:
filetype_encoding = []
for item in combined_df['filename']:
    if item[-3:].lower() == 'jpg':
        filetype_encoding.append('jpg')
    elif item[-3:].lower() == 'png':
        filetype_encoding.append('png')
    else:
        filetype_encoding.append('other')
        

In [164]:
combined_df['filetype'] = filetype_encoding
combined_df = combined_df[['attachment_key', 'size_mb', 'height', 'width', 'h_to_w', 'filetype', 'pixel_array', 'filename','logo']]
combined_df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,filetype,pixel_array,filename,logo
0,00xjny6u,0.011849,520.0,506.0,1.027668,png,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",Esterdale Theatre - Logo.png,1
1,04o31jop,0.01261,269.0,396.0,0.679293,png,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",BLUE_LOGO.png,1
2,0840h7ox,0.399114,518.0,920.0,0.563044,jpg,"[0.106121555, 0.05576863, 0.052235294, 0.09043...",ZEN_LB-5.jpg,0
3,0awudx2h,0.011226,200.0,200.0,1.0,jpg,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",article central 200.jpg,1
4,0bb8y85h,0.263813,885.0,800.0,1.10625,jpg,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",1116-100_WGTank_white.jpg,0


In [165]:
# get dummies
filetype_dummies = pd.get_dummies(combined_df.filetype)

# add to initial df
combined_df = pd.concat((combined_df, filetype_dummies), axis=1)

# print head of df
combined_df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,filetype,pixel_array,filename,logo,jpg,other,png
0,00xjny6u,0.011849,520.0,506.0,1.027668,png,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",Esterdale Theatre - Logo.png,1,0,0,1
1,04o31jop,0.01261,269.0,396.0,0.679293,png,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",BLUE_LOGO.png,1,0,0,1
2,0840h7ox,0.399114,518.0,920.0,0.563044,jpg,"[0.106121555, 0.05576863, 0.052235294, 0.09043...",ZEN_LB-5.jpg,0,1,0,0
3,0awudx2h,0.011226,200.0,200.0,1.0,jpg,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",article central 200.jpg,1,1,0,0
4,0bb8y85h,0.263813,885.0,800.0,1.10625,jpg,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",1116-100_WGTank_white.jpg,0,1,0,0


In [166]:
combined_df = combined_df[['attachment_key', 'height', 'width', 'h_to_w', 'size_mb','jpg', 'png', 'other', 'pixel_array', 'filetype','filename','logo']]
combined_df.head()

Unnamed: 0,attachment_key,height,width,h_to_w,size_mb,jpg,png,other,pixel_array,filetype,filename,logo
0,00xjny6u,520.0,506.0,1.027668,0.011849,0,1,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",png,Esterdale Theatre - Logo.png,1
1,04o31jop,269.0,396.0,0.679293,0.01261,0,1,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",png,BLUE_LOGO.png,1
2,0840h7ox,518.0,920.0,0.563044,0.399114,1,0,0,"[0.106121555, 0.05576863, 0.052235294, 0.09043...",jpg,ZEN_LB-5.jpg,0
3,0awudx2h,200.0,200.0,1.0,0.011226,1,0,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",jpg,article central 200.jpg,1
4,0bb8y85h,885.0,800.0,1.10625,0.263813,1,0,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",jpg,1116-100_WGTank_white.jpg,0


In [229]:
#Drop NA values
combined_df = combined_df.dropna()

In [231]:
Y = np.array(combined_df['logo'])
print(Y.shape)
print(Y[0])

(3754,)
1


In [299]:
#Features: h_to_w, size_mb
#X = combined_df.iloc[:3755, 3:5].values

#Features: pixel array
#X = np.array(combined_df['pixel_array'].tolist())

#Features: pixel, h_to_w, size_mb
# p1 = combined_df.iloc[:3755, 3:5].values
# p2 = np.array(combined_df['pixel_array'].tolist())
# X = np.concatenate((p1,p2), axis=1)

#Features: pixel, h_to_w, size_mb, file type
p1 = combined_df.iloc[:3755, 3:8].values
p2 = np.array(combined_df['pixel_array'].tolist())
X = np.concatenate((p1,p2), axis=1)

In [278]:
# x_test, y_test = X[2000:], Y[2000:]
# x_train, y_train = X[:1000], Y[:1000]

In [301]:
print(x_train.shape)
print(y_train.shape)

(2252, 150533)
(2252,)


In [302]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.4,random_state=1, stratify=Y)

In [281]:
# #Try with standardizing data first
# # standardize the features (learn mean and variance from train data. Use these values to standardize 
# #both the train and test data!)
# sc = StandardScaler()

# #X
# sc.fit(x_train)
# x_train_std = sc.transform(x_train)
# x_test_std = sc.transform(x_test)

In [292]:
# # Try PCA

# # create an instance of the PCA class
# pca = PCA(n_components=5)

# # fit the data using the original X_train_std data; tranform X_train_std data
# x_train_pca = pca.fit_transform(x_train_std)

# # transform the X_test_std data
# x_test_pca = pca.transform(x_test_std)

In [303]:
#this specifies what kind of layers are going to be present in the neural net
#Begins the linear arrangment of layers
model = tf.keras.models.Sequential([
    
  #tf.keras.layers.Flatten(input_shape=(150528, 1)),
    
#Creates a fully-connected layer. A layer is fully connected to the layer that preceds it.
#128 nodes in the layer
  tf.keras.layers.Dense(50, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(2, activation='softmax') 
])

In [304]:
model.summary()

ValueError: This model has not yet been built. Build the model first by calling `build()` or calling `fit()` with some data, or specify an `input_shape` argument in the first layer(s) for automatic build.

In [305]:
model.compile(optimizer='SGD',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


#Other optimizers to try - SGD

In [306]:
#Epoch = number of times the model sees all of the training
model.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbb8b3e2d30>

In [308]:
model.evaluate(x_test, y_test)



[0.49699220061302185, 0.782956063747406]