In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA


In [4]:
# Set working directory
os.chdir('/Users/alyssaaugsburger/Documents/Berkeley_MIDS/W207/Final_Project')

In [5]:
# Load first npz file as an example and get files
data_1 = np.load('et_w207_project_npz_files_5000_tmp_tmp5ocrhjnn.npz')
data_1.close
data_1.files

['arr_0', 'arr_1']

In [4]:
# Show file arr_0 example
data_1['arr_0'][:5]

array(['ozbpir-9ey6js-ggdqwo', '378ykanq', 'p366md-big834-7o23k6',
       '3zruwvl2', 'q18iae-3vnh74-79npmy'], dtype='<U20')

In [5]:
# Show file arr_1 example
data_1['arr_1'][:5]

array([[-103.939   , -116.779   , -123.68    , ..., -103.939   ,
        -116.779   , -123.68    ],
       [ -46.939003,  -74.779   , -118.68    , ...,  -57.939003,
         -84.779   , -118.68    ],
       [ 151.061   ,  138.22101 ,  131.32    , ...,  151.061   ,
         138.22101 ,  131.32    ],
       [  -3.939003,  -66.779   ,  -96.68    , ...,   -3.939003,
         -66.779   ,  -96.68    ],
       [-103.939   , -116.779   , -123.68    , ..., -103.939   ,
        -116.779   , -123.68    ]], dtype=float32)

In [6]:
# Load all data files
data_2 = np.load('et_w207_project_npz_files_5000_tmp_tmp092sag67.npz')
data_2.close
print(data_2.files)

data_3 = np.load('et_w207_project_npz_files_5000_tmp_tmpf31_pn8p.npz')
data_3.close
print(data_3.files)

data_4 = np.load('et_w207_project_npz_files_5000_tmp_tmpq5b2g4n2.npz')
data_4.close
print(data_4.files)

['arr_0', 'arr_1']
['arr_0', 'arr_1']
['arr_0', 'arr_1']


In [7]:
# Get size of each file
print(data_1['arr_0'].shape)
print(data_1['arr_1'].shape)

(878,)
(878, 150528)


In [8]:
print(data_2['arr_0'].shape)
print(data_2['arr_1'].shape)

(893,)
(893, 150528)


In [9]:
print(data_3['arr_0'].shape)
print(data_3['arr_1'].shape)

(638,)
(638, 150528)


In [10]:
print(data_4['arr_0'].shape)
print(data_4['arr_1'].shape)

(1353,)
(1353, 150528)


In [7]:
# Concatenate image vectors from all files
data_arr0_all = np.concatenate((data_1['arr_0'], data_2['arr_0'], data_3['arr_0'], data_4['arr_0']))
data_arr0_all.shape

(3762,)

In [8]:
data_arr1_all = np.concatenate((data_1['arr_1'], data_2['arr_1'], data_3['arr_1'], data_4['arr_1']))
data_arr1_all.shape

(3762, 150528)

In [9]:
# Store pixel vectors as DF with attachment key

# Create key value pairs with arr0 (attachment key) and standardized arr1 (pixel array)
# Standardize pixel values between 0 and 1 by dividing by 255
data_dict = {}
for i in range(data_arr0_all.shape[0]):
    data_dict[data_arr0_all[i]] = [data_arr1_all[i]/255]

df_pixel = pd.DataFrame(data=data_dict)
df_pixel_t = df_pixel.transpose()
df_pixel_t.index.names = ['attachment_key']
df_pixel_t.columns = ['pixel_array']
df_pixel_t

Unnamed: 0_level_0,pixel_array
attachment_key,Unnamed: 1_level_1
ozbpir-9ey6js-ggdqwo,"[-0.40760392, -0.45795685, -0.4850196, -0.4076..."
378ykanq,"[-0.18407452, -0.29325098, -0.46541175, -0.097..."
p366md-big834-7o23k6,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
3zruwvl2,"[-0.015447071, -0.26187843, -0.37913725, -0.01..."
q18iae-3vnh74-79npmy,"[-0.40760392, -0.45795685, -0.4850196, -0.4076..."
q0nyct-7525ow-3ur3ii,"[0.5767098, 0.5263569, 0.4992941, 0.49043527, ..."
7d8atetr,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
ppfy71-bfbnm8-emjouu,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
pe7vgl-2vda1s-fo18uc,"[-0.40760392, -0.45795685, -0.4850196, -0.4076..."
q7clmn-75pmlk-9pzb8r,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."


In [10]:
# Load excel data
df = pd.read_csv('school_project_data_set-query.csv')
df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,filename,logo
0,001lq61k,0.12252,514.0,720.0,0.713889,campus ministry on the beach.jpg,0
1,002dlirq,3.256944,4072.0,3868.0,1.052741,BIG ALS FISH FLASH UV VP PL.jpg,0
2,003qefgm,0.002522,80.0,200.0,0.4,netix_email_studio.png,1
3,0083okjl,0.116802,600.0,600.0,1.0,productshot2.jpg,0
4,00a957mh,0.140714,816.0,2382.0,0.342569,divine_medical_billing_inc_master.png,1


In [11]:
# Merge dataframes by attachment key
combined_df = df.merge(df_pixel_t, on='attachment_key')

In [12]:
combined_df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,filename,logo,pixel_array
0,00xjny6u,0.011849,520.0,506.0,1.027668,Esterdale Theatre - Logo.png,1,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
1,04o31jop,0.01261,269.0,396.0,0.679293,BLUE_LOGO.png,1,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
2,0840h7ox,0.399114,518.0,920.0,0.563044,ZEN_LB-5.jpg,0,"[0.106121555, 0.05576863, 0.052235294, 0.09043..."
3,0awudx2h,0.011226,200.0,200.0,1.0,article central 200.jpg,1,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."
4,0bb8y85h,0.263813,885.0,800.0,1.10625,1116-100_WGTank_white.jpg,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,..."


In [13]:
combined_df = combined_df[['attachment_key', 'size_mb', 'height', 'width', 'h_to_w',  'pixel_array', 'filename','logo']]
combined_df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,pixel_array,filename,logo
0,00xjny6u,0.011849,520.0,506.0,1.027668,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",Esterdale Theatre - Logo.png,1
1,04o31jop,0.01261,269.0,396.0,0.679293,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",BLUE_LOGO.png,1
2,0840h7ox,0.399114,518.0,920.0,0.563044,"[0.106121555, 0.05576863, 0.052235294, 0.09043...",ZEN_LB-5.jpg,0
3,0awudx2h,0.011226,200.0,200.0,1.0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",article central 200.jpg,1
4,0bb8y85h,0.263813,885.0,800.0,1.10625,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",1116-100_WGTank_white.jpg,0


In [14]:
#Feature is just the pixel array
X = np.array(combined_df['pixel_array'].tolist())
print(X.shape)

(3755, 150528)


In [15]:
#Features are size, height, width, h_to_w
X2 = combined_df.iloc[:3755, 1:5].values
print(X2[0])
print(X2.shape)

[1.184900e-02 5.200000e+02 5.060000e+02 1.027668e+00]
(3755, 4)


In [16]:
#Features are size, height, width, h_to_w, pixel_array
X3 = np.concatenate((X2,X), axis=1)
print(X3.shape)
print(X3[0][0])

(3755, 150532)
0.011849


In [18]:
file_types = []
for item in combined_df['filename']:
    file_types.append(item[-3:].lower())

print(set(file_types))

#File types should be png, jpg or other
#Need to one hot encode these



{'peg', 'ack', 'jpg', 'hot', 'png', 'lue', 'ite', 'jpe', '0mm', 'age', '73e', 'pig'}


In [19]:
filetype_encoding = []
for item in combined_df['filename']:
    if item[-3:].lower() == 'jpg':
        filetype_encoding.append('jpg')
    elif item[-3:].lower() == 'png':
        filetype_encoding.append('png')
    else:
        filetype_encoding.append('other')
        

In [20]:
combined_df['filetype'] = filetype_encoding
combined_df = combined_df[['attachment_key', 'size_mb', 'height', 'width', 'h_to_w', 'filetype', 'pixel_array', 'filename','logo']]
combined_df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,filetype,pixel_array,filename,logo
0,00xjny6u,0.011849,520.0,506.0,1.027668,png,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",Esterdale Theatre - Logo.png,1
1,04o31jop,0.01261,269.0,396.0,0.679293,png,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",BLUE_LOGO.png,1
2,0840h7ox,0.399114,518.0,920.0,0.563044,jpg,"[0.106121555, 0.05576863, 0.052235294, 0.09043...",ZEN_LB-5.jpg,0
3,0awudx2h,0.011226,200.0,200.0,1.0,jpg,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",article central 200.jpg,1
4,0bb8y85h,0.263813,885.0,800.0,1.10625,jpg,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",1116-100_WGTank_white.jpg,0


In [21]:
# get dummies
filetype_dummies = pd.get_dummies(combined_df.filetype)

# add to initial df
combined_df = pd.concat((combined_df, filetype_dummies), axis=1)

# print head of df
combined_df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,filetype,pixel_array,filename,logo,jpg,other,png
0,00xjny6u,0.011849,520.0,506.0,1.027668,png,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",Esterdale Theatre - Logo.png,1,0,0,1
1,04o31jop,0.01261,269.0,396.0,0.679293,png,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",BLUE_LOGO.png,1,0,0,1
2,0840h7ox,0.399114,518.0,920.0,0.563044,jpg,"[0.106121555, 0.05576863, 0.052235294, 0.09043...",ZEN_LB-5.jpg,0,1,0,0
3,0awudx2h,0.011226,200.0,200.0,1.0,jpg,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",article central 200.jpg,1,1,0,0
4,0bb8y85h,0.263813,885.0,800.0,1.10625,jpg,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",1116-100_WGTank_white.jpg,0,1,0,0


In [22]:
combined_df = combined_df[['attachment_key', 'size_mb', 'height', 'width', 'h_to_w', 'jpg', 'png', 'other', 'pixel_array', 'filetype','filename','logo']]
combined_df.head()

Unnamed: 0,attachment_key,size_mb,height,width,h_to_w,jpg,png,other,pixel_array,filetype,filename,logo
0,00xjny6u,0.011849,520.0,506.0,1.027668,0,1,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",png,Esterdale Theatre - Logo.png,1
1,04o31jop,0.01261,269.0,396.0,0.679293,0,1,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",png,BLUE_LOGO.png,1
2,0840h7ox,0.399114,518.0,920.0,0.563044,1,0,0,"[0.106121555, 0.05576863, 0.052235294, 0.09043...",jpg,ZEN_LB-5.jpg,0
3,0awudx2h,0.011226,200.0,200.0,1.0,1,0,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",jpg,article central 200.jpg,1
4,0bb8y85h,0.263813,885.0,800.0,1.10625,1,0,0,"[0.5923961, 0.54204315, 0.51498044, 0.5923961,...",jpg,1116-100_WGTank_white.jpg,0


In [23]:
#Features are size, height, width, h_to_w, filetype
X4 = combined_df.iloc[:3755, 1:8].values
print(X4[0])
print(X4.shape)

[1.184900e-02 5.200000e+02 5.060000e+02 1.027668e+00 0.000000e+00
 1.000000e+00 0.000000e+00]
(3755, 7)


In [24]:
#Features are size, height, width, h_to_w, filetype, pixel array
X5 = np.concatenate((X4,X), axis=1)
print(X5.shape)
print(X5[0][0])

(3755, 150535)
0.011849


In [25]:
Y = np.array(combined_df['logo'])
print(Y.shape)

(3755,)


In [26]:
# STILL WORKING
# Split into train and test data
# Development data?
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,random_state=1, stratify=y)

In [27]:
# # Set some variables to hold test and training data.
test_data, test_labels = X[2000:], Y[2000:]
# dev_data, dev_labels = X[60000:61000], Y[60000:61000]
train_data, train_labels = X[:2000], Y[:2000]
mini_train_data, mini_train_labels = X[:100], Y[:100]


#Try with more features - size, height, width, h_to_w
mini_train_data2, mini_train_labels2 = X2[:100], Y[:100]
test_data2, test_labels2 = X2[2000:], Y[2000:]

#Try with more features - size, height, width, h_to_w, pixel array
mini_train_data3, mini_train_labels3 = X3[:100], Y[:100]
test_data3, test_labels3 = X3[2000:], Y[2000:]

#Try with more features - size, height, width, h_to_w, filetype
mini_train_data4, mini_train_labels4 = X4[:100], Y[:100]
test_data4, test_labels4 = X4[2000:], Y[2000:]

#Try with more features - size, height, width, h_to_w, filetype, pixel array
mini_train_data5, mini_train_labels5 = X5[:100], Y[:100]
test_data5, test_labels5 = X5[2000:], Y[2000:]


In [28]:
# Standardize

In [29]:
#Try a K Nearest Neighbors model just to see if everything is working correctly 
#Only using pixel data as a feature
#Train the data
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train_data, train_labels)

#Predict
dev_pred = knn.predict(test_data)
    
print('Misclassified examples: %d' % (test_labels != dev_pred).sum())
    
error = (test_labels != dev_pred).sum()/len(dev_pred)
print('Misclassification error: %.3f' % error)
print('Accuracy: %.3f'% (1-error))
    

KeyboardInterrupt: 

In [53]:
#Logistic regression with only pixel array as a feature
lr = LogisticRegression(penalty='l2', C = .3, solver='liblinear', multi_class='auto')
lr.fit(mini_train_data, mini_train_labels)
        
#Predict
pred_lr = lr.predict(test_data)

print('Misclassified examples: %d' % (test_labels != pred_lr).sum())
    
error = (test_labels != pred_lr).sum()/len(pred_lr)
print('Misclassification error: %.3f' % error)
print('Accuracy: %.3f'% (1-error))

Misclassified examples: 652
Misclassification error: 0.372
Accuracy: 0.628


In [83]:
#Features are size, height, width, h_to_w
lr2 = LogisticRegression(penalty='l2', C = 2, solver='liblinear', multi_class='auto')
lr2.fit(mini_train_data2, mini_train_labels2)
        
#Predict
pred_lr2 = lr2.predict(test_data2)

print('Misclassified examples: %d' % (test_labels2 != pred_lr2).sum())
    
error2 = (test_labels2 != pred_lr2).sum()/len(pred_lr2)
print('Misclassification error: %.3f' % error2)
print('Accuracy: %.3f'% (1-error2))

Misclassified examples: 421
Misclassification error: 0.240
Accuracy: 0.760


In [84]:
#Features are size, height, width, h_to_w, pixel array
lr3 = LogisticRegression(penalty='l2', C = 2, solver='liblinear', multi_class='auto')
lr3.fit(mini_train_data3, mini_train_labels3)
        
#Predict
pred_lr3 = lr3.predict(test_data3)

print('Misclassified examples: %d' % (test_labels3 != pred_lr3).sum())
    
error3 = (test_labels3 != pred_lr3).sum()/len(pred_lr3)
print('Misclassification error: %.3f' % error3)
print('Accuracy: %.3f'% (1-error3))

Misclassified examples: 602
Misclassification error: 0.343
Accuracy: 0.657


In [126]:
#Features are size, height, width, h_to_w, filetype
lr4 = LogisticRegression(penalty='l2', C = 2, solver='liblinear', multi_class='auto')
lr4.fit(mini_train_data4, mini_train_labels4)
        
#Predict
pred_lr4 = lr4.predict(test_data4)

print('Misclassified examples: %d' % (test_labels4 != pred_lr4).sum())
    
error4 = (test_labels4 != pred_lr4).sum()/len(pred_lr4)
print('Misclassification error: %.3f' % error4)
print('Accuracy: %.3f'% (1-error4))

Misclassified examples: 368
Misclassification error: 0.210
Accuracy: 0.790


In [130]:
#Features are size, height, width, h_to_w, filetype, pixel array
lr5 = LogisticRegression(penalty='l2', C = 2, solver='liblinear', multi_class='auto')
lr5.fit(mini_train_data5, mini_train_labels5)
        
#Predict
pred_lr5 = lr5.predict(test_data5)

print('Misclassified examples: %d' % (test_labels5 != pred_lr5).sum())
    
error5 = (test_labels5 != pred_lr5).sum()/len(pred_lr5)
print('Misclassification error: %.3f' % error5)
print('Accuracy: %.3f'% (1-error5))

Misclassified examples: 602
Misclassification error: 0.343
Accuracy: 0.657


In [74]:
# create an instance of the PCA class
pca = PCA(n_components=10)

#mini_train_data_transformed = mini_train_data.reshape(1,-1)
#mini_train_labels_transformed = mini_train_labels.reshape(1,-1)

# fit the data using the original X_train_std data; tranform X_train_std data
X_train_pca_l = pca.fit_transform(mini_train_data5)

# transform the X_test_std data
X_test_pca_l = pca.transform(test_data5)

# #Logistic regression with only pixel array as a feature
lr6 = LogisticRegression(penalty='l2', C = .3, solver='liblinear', multi_class='auto')
lr6.fit(X_train_pca_l, mini_train_labels5)
        
# #Predict
pred_lr6 = lr6.predict(X_test_pca_l)

print('Misclassified examples: %d' % (test_labels5 != pred_lr6).sum())
    
error6 = (test_labels5 != pred_lr6).sum()/len(pred_lr6)
print('Misclassification error: %.3f' % error6)
print('Accuracy: %.3f'% (1-error6))


Misclassified examples: 414
Misclassification error: 0.236
Accuracy: 0.764


###### 