In [1]:
#Detecting Labels,logo
import argparse
import io
import os
import pandas as pd
import types
import numpy as np
import time
from google.cloud import vision
from google.oauth2 import service_account

# Image labels detection

In [2]:
def detect_labels_image(images_path, creds_path):
    """Detects labels in the file."""
    credentials = service_account.Credentials.from_service_account_file(creds_path)
    scoped_credentials = credentials.with_scopes(
        ['https://www.googleapis.com/auth/cloud-platform'])
    client = vision.ImageAnnotatorClient(credentials=credentials)
    #input the maximum
    max=6
    filenames = [filename for filename in os.listdir(images_path) if 'DS_Store' not in filename]    
    index=0
    ind=0
    ite_num=len(filenames)//max
    structured=[]
    for ite in range(ite_num+1):
        if ite!=ite_num:
            file_name=[file for file in filenames[index:index+max]]
            index=index+max
        else: 
            file_name=[file for file in filenames[max*ite_num:]]
        
        
        requests = []
        for filename in file_name: 
            with io.open(images_path+filename, 'rb') as image_file:
                content = image_file.read()
                requests.append({
                    "image":{"content":content},
                    "features":[
                        {"type":"LABEL_DETECTION"}
                    ]})
        resp = client.batch_annotate_images(requests)
        response = resp.responses
        for num in range(len(response)):
            label=response[num].label_annotations
            for la in label:
                structured.append([filenames[ind],la.description,la.score])
            ind+=1


    return structured

# Image faces detection

In [652]:
def detect_faces_image(images_path, creds_path):
    """Detects labels in the file."""
    credentials = service_account.Credentials.from_service_account_file(creds_path)
    scoped_credentials = credentials.with_scopes(
        ['https://www.googleapis.com/auth/cloud-platform'])
    client = vision.ImageAnnotatorClient(credentials=credentials)
    #input the maximum
    max=6
    filenames = [filename for filename in os.listdir(images_path) if 'DS_Store' not in filename]    
    index=0
    ind=0
    id=0
    ite_num=len(filenames)//max
    structured=[]
    for ite in range(ite_num+1):
        if ite!=ite_num:
            file_name=[file for file in filenames[index:index+max]]
            index=index+max
        else: 
            file_name=[file for file in filenames[max*ite_num:]]
        
        
        requests = []
        for filename in file_name: 
            with io.open(images_path+filename, 'rb') as image_file:
                content = image_file.read()
                requests.append({
                    "image":{"content":content},
                    "features":[
                        {"type":"FACE_DETECTION"}
                    ]})
        resp = client.batch_annotate_images(requests)
        response = resp.responses
        
        for i in range(len(response)):
            if not response[i].face_annotations:
                structured.append([filenames[ind],0])
                ind+=1
            else:
                re=response[i].face_annotations
                count=0
                for l in re:
                    count+=1
                structured.append([filenames[ind],count])
                ind+=1
               
    return(structured)



In [651]:
start_time = time.time()
stories_image=detect_faces_image('/Users/gary.zhou/Desktop/test/Stories_images/','/Users/gary.zhou/Documents/cloud_vision/service_account.json')
print("--- %s seconds ---" % (time.time() - start_time))

--- 450.16543889045715 seconds ---


In [653]:
start_time = time.time()
In_feed_image=detect_faces_image('/Users/gary.zhou/Desktop/test/In_feed_images/','/Users/gary.zhou/Documents/cloud_vision/service_account.json')
print("--- %s seconds ---" % (time.time() - start_time))

--- 785.5318791866302 seconds ---


In [654]:
df_stories_image=pd.DataFrame(stories_image,columns=['filename','count'])
df_stories_image['source']='Stories'
df_stories_image['file type']='image'

In [655]:
df_In_feed_image=pd.DataFrame(In_feed_image,columns=['filename','count'])
df_In_feed_image['source']='In_feed'
df_In_feed_image['file type']='image'

# Video labels detection

In [4]:
def detect_labels_video(images_path, creds_path):
    credentials = service_account.Credentials.from_service_account_file(creds_path)
    scoped_credentials = credentials.with_scopes(['https://www.googleapis.com/auth/cloud-platform'])
    client = vision.ImageAnnotatorClient(credentials=credentials)
    
    image_name=[filename for filename in os.listdir(images_path) if filename.endswith('.jpg')]
    video_name=[filename for filename in os.listdir(images_path) if filename.endswith('.mp4')]
    
    filenames =image_name   
    max=8
    id=0
    ind=0
    index=0
    ind=0
    video_index=0
    ite_num=len(filenames)//max
    structured=[]
    
    for ite in range(ite_num+1):
        if ite!=ite_num:
            file_name=[file for file in filenames[index:index+max]]
            index=index+max
        else: 
            file_name=[file for file in filenames[max*ite_num:]]

        requests=[]
        for filename in file_name: 
            with io.open(images_path+filename, 'rb') as image_file:
                content = image_file.read()
                requests.append({
                    "image":{"content":content},
                    "features":[
                        {"type":"LABEL_DETECTION"}
                    ]})
        resp = client.batch_annotate_images(requests)
        response = resp.responses
        
        for i in range(len(response)):
            if image_name[ind][:-8]!=video_name[video_index]:
                video_index+=1
            label=response[i].label_annotations
            id=id+1
            for la in label:
                structured.append([video_index,video_name[video_index],la.description,la.score])
            ind+=1
                

                
    #combine group images to single row           
    df_image=pd.DataFrame(structured,columns=['video_index','filename','object','probability'])
    return(df_image)    
    

In [749]:
dftest=detect_labels_video('/Users/gary.zhou/Desktop/test/test/', '/Users/gary.zhou/Documents/cloud_vision/service_account.json')
        

# Video faces detection

In [12]:
def detect_faces_video(images_path, creds_path):
    credentials = service_account.Credentials.from_service_account_file(creds_path)
    scoped_credentials = credentials.with_scopes(['https://www.googleapis.com/auth/cloud-platform'])
    client = vision.ImageAnnotatorClient(credentials=credentials)
    
    image_name=[filename for filename in os.listdir(images_path) if filename.endswith('.jpg')]
    video_name=[filename for filename in os.listdir(images_path) if filename.endswith('.mp4')]
    
    filenames =image_name   
    max=8
    index=0
    ind=0
    video_index=0
    ite_num=len(filenames)//max
    structured=[]
    
    for ite in range(ite_num+1):
        if ite!=ite_num:
            file_name=[file for file in filenames[index:index+max]]
            index=index+max
        else: 
            file_name=[file for file in filenames[max*ite_num:]]

        requests=[]
        for filename in file_name: 
            with io.open(images_path+filename, 'rb') as image_file:
                content = image_file.read()
                requests.append({
                    "image":{"content":content},
                    "features":[
                        {"type":"FACE_DETECTION"}
                    ]})
        resp = client.batch_annotate_images(requests)
        response = resp.responses
        
        for i in range(len(response)):
            if image_name[ind][:-8]!=video_name[video_index]:
                video_index+=1
                
            if not response[i].face_annotations:
                structured.append([image_name[ind],0,video_index])
                ind+=1
            else:
                re=response[i].face_annotations
                count=0
                for l in re:
                    count+=1
                structured.append([image_name[ind],count,video_index])
                ind+=1
                
    #combine group images to single row           
    df_image=pd.DataFrame(structured,columns=['name','count','video_index'])
    df_video=pd.DataFrame(video_name,columns=['name'])
    agg=df_image[df_image['count']>0].groupby('video_index')['count'].mean()
    df_agg=pd.Series.to_frame(agg)
    df=pd.concat([df_video,df_agg],axis=1,join='outer') 
    
    return(df.fillna(0).round())    
    

In [13]:
start_time = time.time()
df_stories_video=detect_faces_video('/Users/gary.zhou/Desktop/test/Stories_videos/', '/Users/gary.zhou/Documents/cloud_vision/service_account.json')
print("--- %s seconds ---" % (time.time() - start_time))   

--- 339.67152094841003 seconds ---


In [19]:
start_time = time.time()
df_In_feed_videos=detect_faces_video('/Users/gary.zhou/Desktop/test/In_feed_videos/', '/Users/gary.zhou/Documents/cloud_vision/service_account.json')
print("--- %s seconds ---" % (time.time() - start_time))   

--- 727.8709168434143 seconds ---


In [21]:
df_stories_video
df_stories_video['source']='Stories'
df_stories_video['file type']='videos'

In [22]:
df_In_feed_videos
df_In_feed_videos['source']='In_feed'
df_In_feed_videos['file type']='videos'

# Label_Data

In [5]:
start_time = time.time()
in_feed_images=detect_labels_image('/Users/gary.zhou/Desktop/test/In_feed_images/', '/Users/gary.zhou/Documents/cloud_vision/service_account.json')
print("--- %s seconds ---" % (time.time() - start_time))



--- 3420.898010969162 seconds ---


In [7]:
start_time = time.time()
stories_images=detect_labels_image('/Users/gary.zhou/Desktop/test/Stories_images/', '/Users/gary.zhou/Documents/cloud_vision/service_account.json')
print("--- %s seconds ---" % (time.time() - start_time))

--- 906.4998128414154 seconds ---


In [8]:
in_feed_images_label=pd.DataFrame(in_feed_images,columns=['filename','object','prob'])
stories_images_label=pd.DataFrame(stories_images,columns=['filename','object','prob'])

In [10]:

start_time = time.time()
in_feed_videos_label=detect_labels_video('/Users/gary.zhou/Desktop/test/In_feed_videos/', '/Users/gary.zhou/Documents/cloud_vision/service_account.json')
print("--- %s seconds ---" % (time.time() - start_time))

--- 2043.4189639091492 seconds ---


In [11]:
start_time = time.time()
stories_videos_label=detect_labels_video('/Users/gary.zhou/Desktop/test/Stories_videos/', '/Users/gary.zhou/Documents/cloud_vision/service_account.json')
print("--- %s seconds ---" % (time.time() - start_time))

--- 820.2081847190857 seconds ---


In [753]:
in_feed_videos_label

Unnamed: 0,video_index,filename,object,probability
0,0,18577604_1892331500978391_409035823229435904_n...,photo caption,0.649242
1,0,18577604_1892331500978391_409035823229435904_n...,soldier,0.527210
2,0,18577604_1892331500978391_409035823229435904_n...,photo caption,0.693162
3,0,18577604_1892331500978391_409035823229435904_n...,soldier,0.543474
4,0,18577604_1892331500978391_409035823229435904_n...,tree,0.883579
5,0,18577604_1892331500978391_409035823229435904_n...,grass,0.648155
6,0,18577604_1892331500978391_409035823229435904_n...,photo caption,0.532787
7,0,18577604_1892331500978391_409035823229435904_n...,plant,0.522594
8,0,18577604_1892331500978391_409035823229435904_n...,tree,0.883182
9,0,18577604_1892331500978391_409035823229435904_n...,grass,0.726598


# Analysis(Face detection)

In [243]:
output5 = detect_labels_image('/Users/gary.zhou/Desktop/test/In_feed_videos/', '/Users/gary.zhou/Documents/cloud_vision/service_account.json')




In [244]:
df5=pd.DataFrame(output5,columns=['filename','object','probability'])
#df.head(100)


In [659]:
df_video=pd.concat([df_In_feed_videos,df_stories_video])
df_image=pd.concat([df_In_feed_image,df_stories_image])
df_video.rename(columns={"name":"filename"},inplace=True)
df_adidas=pd.concat([df_video,df_image])
df_adidas=df_adidas.reset_index()
df_adidas.drop(columns={'index'},inplace=True)

df_adidas_faces=df_adidas[df_adidas['count']>0]
df_adidas_faces.groupby(['source','file type']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
source,file type,Unnamed: 2_level_1
In_feed,image,2404.0
In_feed,videos,410.0
Stories,image,1246.0
Stories,videos,168.0


In [657]:
df_adidas_faces

Unnamed: 0,filename,count,source,file type
0,18577604_1892331500978391_409035823229435904_n...,1.0,In_feed,videos
3,20709060_1373290559442300_4163585733851900849_...,1.0,In_feed,videos
17,26190818_180139079404609_8534508039045120000_n...,6.0,In_feed,videos
32,26823437_880174705493555_6771469134208172032_n...,1.0,In_feed,videos
37,26879457_2012264632120224_2718545514743201792_...,2.0,In_feed,videos
42,27061180_1405322946263630_8348568485210619904_...,1.0,In_feed,videos
52,27079384_1818168901814480_5529524652751192064_...,1.0,In_feed,videos
53,27295759_545101369197762_1955576524870516736_n...,3.0,In_feed,videos
57,27305213_1809560672677867_372440618808377344_n...,4.0,In_feed,videos
60,27308242_146405982709778_666194576305815552_n.mp4,2.0,In_feed,videos


# Analysis (label)

In [12]:
image_label=pd.concat([in_feed_images_label,stories_images_label])

In [13]:
image_label

Unnamed: 0,filename,object,prob
0,24175506_169512276977734_5210213780121714688_n...,t shirt,0.959756
1,24175506_169512276977734_5210213780121714688_n...,footwear,0.911549
2,24175506_169512276977734_5210213780121714688_n...,sportswear,0.885956
3,24175506_169512276977734_5210213780121714688_n...,shoulder,0.797446
4,24175506_169512276977734_5210213780121714688_n...,shoe,0.791966
5,24175506_169512276977734_5210213780121714688_n...,sleeve,0.765466
6,24175506_169512276977734_5210213780121714688_n...,outerwear,0.752519
7,24175506_169512276977734_5210213780121714688_n...,product,0.692231
8,24175506_169512276977734_5210213780121714688_n...,recreation,0.662025
9,24175506_169512276977734_5210213780121714688_n...,material,0.603908


In [14]:
def combine_images(df):
    duplicate=[]
    video_list=[]
    video_index=0
    Object=''
    for i in range(len(df)):
        Object=df['object'].loc[i]
        if df['video_index'].loc[i]!=video_index:
            video_index+=1
            duplicate=[]
        if (df['probability'].loc[i]>0.6) & (Object not in duplicate):
            duplicate.append(df['object'].loc[i])
            video_list.append([video_index,df['filename'].loc[i],df['object'].loc[i],df['probability'].loc[i]])
            
    df_final=pd.DataFrame(video_list,columns=['id','filename','object','prob'])
    return df_final
        

In [15]:
df_01=combine_images(in_feed_videos_label)


In [16]:
df_02=combine_images(stories_videos_label)

In [17]:
video_label=pd.concat([df_01,df_02])

In [19]:
video_label=video_label.drop(columns=['id'])

In [69]:
df_f=pd.concat([image_label,video_label])

In [70]:
df_f

Unnamed: 0,filename,object,prob
0,24175506_169512276977734_5210213780121714688_n...,t shirt,0.959756
1,24175506_169512276977734_5210213780121714688_n...,footwear,0.911549
2,24175506_169512276977734_5210213780121714688_n...,sportswear,0.885956
3,24175506_169512276977734_5210213780121714688_n...,shoulder,0.797446
4,24175506_169512276977734_5210213780121714688_n...,shoe,0.791966
5,24175506_169512276977734_5210213780121714688_n...,sleeve,0.765466
6,24175506_169512276977734_5210213780121714688_n...,outerwear,0.752519
7,24175506_169512276977734_5210213780121714688_n...,product,0.692231
8,24175506_169512276977734_5210213780121714688_n...,recreation,0.662025
9,24175506_169512276977734_5210213780121714688_n...,material,0.603908


In [71]:
df_f=df_f.reset_index()

In [72]:
df_f=df_f.drop(columns=['index'])

In [73]:
df_f

Unnamed: 0,filename,object,prob
0,24175506_169512276977734_5210213780121714688_n...,t shirt,0.959756
1,24175506_169512276977734_5210213780121714688_n...,footwear,0.911549
2,24175506_169512276977734_5210213780121714688_n...,sportswear,0.885956
3,24175506_169512276977734_5210213780121714688_n...,shoulder,0.797446
4,24175506_169512276977734_5210213780121714688_n...,shoe,0.791966
5,24175506_169512276977734_5210213780121714688_n...,sleeve,0.765466
6,24175506_169512276977734_5210213780121714688_n...,outerwear,0.752519
7,24175506_169512276977734_5210213780121714688_n...,product,0.692231
8,24175506_169512276977734_5210213780121714688_n...,recreation,0.662025
9,24175506_169512276977734_5210213780121714688_n...,material,0.603908


In [74]:
def search_bucket(df):
    shoes=['shoe','shoes','footwear','outdoor shoe','sneakers','walking shoe','sportswear','athletic shoe','running shoe','tennis shoe','boot']
    apparel=['apparel','outerwear','t shirt','sleeve','jeans','jacket','clothing','trousers','apparel']
    color=['color','blue','black','purple','red','yellow','black and white','white','pink','green','electric blue','violet','orange','magenta']
    concert=['concert','rock concert']
    photograph=['snapshot','photography','monochrome photography']
    new_object=[]
    for row in df.itertuples():
        if row.object in shoes:
            new_object.append('shoes')
            continue
        if row.object in apparel:
            new_object.append('apparel')
            continue
        if row.object in color:
            new_object.append('color')
            continue
        if row.object in concert:
            new_object.append('concert')
            continue
        if row.object in photograph:
            new_object.append('photograph')
            continue
        new_object.append(row.object)
    df['object']=pd.Series(new_object)
    return df
    
    
    
    

In [75]:
df_bucket=search_bucket(df_f)

In [904]:
df_anal=df_bucket.drop_duplicates(['filename','object'])
aaa=df_anal[df_anal.prob>0.70].groupby('object').count().sort_values('filename',ascending=False)
aaa=aaa.reset_index()
aaa[aaa['object']=='shoes']

Unnamed: 0,object,filename,prob
1,shoes,1810,1810


In [77]:
df_ana=df_bucket

In [795]:
#result=df_ana[df_ana.prob>0.70].groupby('object').count().sort_values('filename',ascending=False)


In [906]:
#df_ana[df_ana.prob>0.70].groupby('object').count().sort_values('filename',ascending=False)


In [78]:
video_ind=0
ind_row=[]
video=df_ana['filename'].loc[0]
for row in df_ana.itertuples():
    if row.filename!=video:
        video=row.filename
        video_ind+=1
    ind_row.append(video_ind)

    


In [80]:
df_ana

Unnamed: 0,filename,object,prob
0,24175506_169512276977734_5210213780121714688_n...,apparel,0.959756
1,24175506_169512276977734_5210213780121714688_n...,shoes,0.911549
2,24175506_169512276977734_5210213780121714688_n...,shoes,0.885956
3,24175506_169512276977734_5210213780121714688_n...,shoulder,0.797446
4,24175506_169512276977734_5210213780121714688_n...,shoes,0.791966
5,24175506_169512276977734_5210213780121714688_n...,apparel,0.765466
6,24175506_169512276977734_5210213780121714688_n...,apparel,0.752519
7,24175506_169512276977734_5210213780121714688_n...,product,0.692231
8,24175506_169512276977734_5210213780121714688_n...,recreation,0.662025
9,24175506_169512276977734_5210213780121714688_n...,material,0.603908


In [81]:
se2=pd.Series(ind_row)
df_ana['ind']=se2
df_ori=df_ana.drop_duplicates(['filename','object'])

In [83]:
ana=df_ori[df_ori.prob>0.75].groupby('object').count().sort_values('filename',ascending=False)



In [86]:
df_ori.to_csv('label_ori.csv')

In [82]:
df_ori=df_ori[df_ori.prob>0.75]
df_ori


Unnamed: 0,filename,object,prob,ind
0,24175506_169512276977734_5210213780121714688_n...,apparel,0.959756,0
1,24175506_169512276977734_5210213780121714688_n...,shoes,0.911549,0
3,24175506_169512276977734_5210213780121714688_n...,shoulder,0.797446,0
10,25036659_1527256927323801_6735676178138398720_...,color,0.949924,1
11,25036659_1527256927323801_6735676178138398720_...,text,0.947646,1
13,25036659_1527256927323801_6735676178138398720_...,font,0.884107,1
20,25037016_253548621851555_3636738028602392576_n...,music artist,0.864889,2
27,25038595_1618907854862436_2293914648082120704_...,landmark,0.921739,3
28,25038595_1618907854862436_2293914648082120704_...,building,0.910864,3
29,25038595_1618907854862436_2293914648082120704_...,night,0.909465,3


Unnamed: 0,filename,object,prob,ind,new_object,final_object
10,25036659_1527256927323801_6735676178138398720_...,color,0.949924,1,color,color
11,25036659_1527256927323801_6735676178138398720_...,text,0.947646,1,text,text
13,25036659_1527256927323801_6735676178138398720_...,font,0.884107,1,font,font


Unnamed: 0,filename,object,prob,ind,new_object,final_object
0,24175506_169512276977734_5210213780121714688_n...,apparel,0.959756,0,apparel,apparel
1,24175506_169512276977734_5210213780121714688_n...,shoes,0.911549,0,shoes,shoes
3,24175506_169512276977734_5210213780121714688_n...,shoulder,0.797446,0,shoulder,shoulder
10,25036659_1527256927323801_6735676178138398720_...,color,0.949924,1,color,color
11,25036659_1527256927323801_6735676178138398720_...,text,0.947646,1,text,text
13,25036659_1527256927323801_6735676178138398720_...,font,0.884107,1,font,font
20,25037016_253548621851555_3636738028602392576_n...,music artist,0.864889,2,music artist,music artist
27,25038595_1618907854862436_2293914648082120704_...,landmark,0.921739,3,landmark,landmark
28,25038595_1618907854862436_2293914648082120704_...,building,0.910864,3,building,building
29,25038595_1618907854862436_2293914648082120704_...,night,0.909465,3,night,night


In [1244]:
df_ori['last']=0
df_ori['last'].loc[91138]=1
df_ori

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,filename,object,prob,ind,new_object,final_object,last
0,24175506_169512276977734_5210213780121714688_n...,apparel,0.959756,0,apparel,apparel,0
1,24175506_169512276977734_5210213780121714688_n...,shoes,0.911549,0,shoes,shoes,0
3,24175506_169512276977734_5210213780121714688_n...,shoulder,0.797446,0,shoulder,shoulder,0
10,25036659_1527256927323801_6735676178138398720_...,color,0.949924,1,color,color,0
11,25036659_1527256927323801_6735676178138398720_...,text,0.947646,1,text,text,0
13,25036659_1527256927323801_6735676178138398720_...,font,0.884107,1,font,font,0
20,25037016_253548621851555_3636738028602392576_n...,music artist,0.864889,2,music artist,music artist,0
27,25038595_1618907854862436_2293914648082120704_...,landmark,0.921739,3,landmark,landmark,0
28,25038595_1618907854862436_2293914648082120704_...,building,0.910864,3,building,building,0
29,25038595_1618907854862436_2293914648082120704_...,night,0.909465,3,night,night,0


# Label matrix

In [1234]:
matrix_object=ana[ana.filename>199].reset_index()['object'].tolist()

In [1259]:
def matrix_generate(matrix_list,df):
    df_final=pd.DataFrame()
    for obj in matrix_object:
        column=[]
        for obj_occur in matrix_object:
            num=concur(obj,obj_occur,df)
            column.append(num)
        df_final[obj]=column
    return df_final

csv_print=matrix_generate(matrix_object,df_ori)

        
        
        
    

In [1260]:
csv_print.to_csv('label_matrix.csv')

# Concert

In [1159]:
con=['light','entertainment','stage','performance','event','crowd','lighting','audience','performing arts','singing','performance art','guitarist','music artist','music','musician','disco','nightclub','pop music','drums','festival','drummer','public event','singer']

In [1160]:
conc=[row.ind for row in df_ana.itertuples() if (row.object=='concert')&(row.prob>0.55)]
len(conc)



2109

In [1161]:
file_index=0
word_list=[]
for row in df_ana.itertuples():
    if row.ind!=file_index:
        file_index+=1
    if (row.object in con)&(file_index in conc):
        word_list.append('concert')
    else:
        word_list.append(row.object)

In [1162]:
se3=pd.Series(word_list)
df_ana['new_object']=se3

In [1163]:
new_bucket=['light','entertainment','stage','performance','event','crowd','lighting','auidence','performing arts','performance art']

In [1185]:
new_con=[row.ind for row in df_ana.itertuples() if row.new_object in ['concert','auditorium','singer','singing','music artist','drums','musical theatre','musical instrument','performing arts','performance art']]

In [1186]:
file_index=0
word_list2=[]
for row in df_ana.itertuples():
    if row.ind!=file_index:
        file_index+=1
    if (row.new_object in new_bucket)&(file_index in new_con):
        word_list2.append('concert')
    else:
        word_list2.append(row.new_object)
        

In [1187]:
se4=pd.Series(word_list2)
df_ana['final_object']=se4

In [1188]:
df=df_ana.drop_duplicates(['filename','final_object'])

In [1192]:
len(df[df.prob>0.75])

33839

In [1198]:
final_label=df[df.prob>0.6].groupby('final_object').count().sort_values('filename',ascending=False)
final_label.head(100)

Unnamed: 0_level_0,filename,object,prob,ind,new_object
final_object,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
color,3925,3925,3925,3925,3925
product,2370,2370,2370,2370,2370
shoes,1957,1957,1957,1957,1957
concert,1538,1538,1538,1538,1538
fun,1443,1443,1443,1443,1443
font,1130,1130,1130,1130,1130
darkness,932,932,932,932,932
structure,895,895,895,895,895
photograph,893,893,893,893,893
night,848,848,848,848,848


In [1191]:
final_label.to_csv('label_final.csv')

In [1178]:
pwd

'/Users/gary.zhou/Documents/cloud_vision'

In [1204]:
len(df[df.final_object=='selfie'])

84

In [1194]:
#df[df.ind==2254]

In [1205]:
#df[(df.final_object=='selfie')&(df.prob>.75)]

In [1196]:
#for i in con:
#    print(pd.Series.to_string(re[re['new_object']==i]['index'])[2:])


# Others

In [14]:
keywords=['t shirt','footwear','sportswear','shoe','outdoor shoe','sneakers']
#df[df['object']=='landmark']
#df[df['filename']==46]
        

In [15]:
filenames = [filename for filename in os.listdir('/Users/gary.zhou/desktop/test/Core_images/') if 'DS_Store' not in filename]

In [16]:
n1=1
jpg='.jpg'
for name in filenames:
    os.renames(old=name,new=str(n1)+jpg)
    n1=n1+1

FileNotFoundError: [Errno 2] No such file or directory: '24175506_169512276977734_5210213780121714688_n.jpg' -> '1.jpg'

In [245]:
cd desktop/test/Stores_images/


/Users/gary.zhou/Desktop/test/Stores_images


In [31]:
count=0
num_images=set(df2['filename'])
for i in num_images:
    index_min=df2[df2['filename']==i].index[0]
    index_max=df2[df2['filename']==i].index[-1]
    for ind in range(index_min,index_max):
        if (df2['object'][ind] in keywords and df2['probability'][ind]>0.70):
            count=count+1
            break
    
    
    
    

In [32]:
count

862

In [20]:
result1=1036/3820
print("Core post: {0:.2f}%".format(result1*100))

Core post: 27.12%


In [30]:
result2=count/len(num_images)
print("storiest: {0:.2f}%".format(result2*100))

storiest: 34.66%


In [267]:
df

Unnamed: 0,filename,object,probability
0,1,product,0.834238
1,1,sport venue,0.793123
2,1,structure,0.776179
3,1,product,0.679219
4,1,play,0.661621
5,1,recreation,0.515665
6,1,fast food,0.500111
7,2,red,0.965575
8,2,finger,0.807831
9,2,t shirt,0.714279


In [329]:
import re

In [332]:
word=in_feed_images_selfie['object'].unique()

In [362]:
in_feed_images_selfie


Unnamed: 0,id,object,prob
0,1,t shirt,0.959756
1,1,footwear,0.911549
2,1,sportswear,0.885956
3,1,shoulder,0.797446
4,1,shoe,0.791966
5,1,sleeve,0.765466
6,1,outerwear,0.752519
7,1,product,0.692231
8,1,recreation,0.662025
9,1,material,0.603908


In [344]:
b=' '.join(a)


In [360]:
re.findall('photo',b)

['photo',
 'photo',
 'photo',
 'photo',
 'photo',
 'photo',
 'photo',
 'photo',
 'photo',
 'photo',
 'photo',
 'photo',
 'photo']

In [363]:
a

['t shirt',
 'footwear',
 'sportswear',
 'shoulder',
 'shoe',
 'sleeve',
 'outerwear',
 'product',
 'recreation',
 'material',
 'black',
 'text',
 'black and white',
 'font',
 'monochrome photography',
 'monochrome',
 'logo',
 'area',
 'night',
 'darkness',
 'music artist',
 'performance',
 'gold medal',
 'competition event',
 'singing',
 'fun',
 'landmark',
 'building',
 'metropolitan area',
 'architecture',
 'metropolis',
 'urban area',
 'city',
 'downtown',
 'lighting',
 'atmosphere',
 'automotive design',
 'computer wallpaper',
 'brand',
 'graphics',
 'advertising',
 'symbol',
 'arm',
 'human',
 'joint',
 'human behavior',
 'photography',
 'space',
 'clothing',
 'child',
 'boy',
 'male',
 'jeans',
 'uniform',
 'toddler',
 'sky',
 'sneakers',
 'personal protective equipment',
 'athletic shoe',
 'outdoor shoe',
 'cool',
 'line',
 'design',
 'pattern',
 'graphic design',
 'infrastructure',
 'road',
 'street',
 'play',
 'human body',
 'art',
 'costume',
 'performing arts',
 'flyer',
 '

In [None]:
ana.head(10)