# Part 1: Data preparation
Julianne Freeman<br>
Insight Data Science Fellow 2018B

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext version_information
%version_information numpy, scipy, matplotlib, pandas

Software,Version
Python,3.6.3 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
IPython,5.1.0
OS,Darwin 16.1.0 x86_64 i386 64bit
numpy,1.14.4
scipy,0.18.1
matplotlib,2.0.0
pandas,0.19.2
Sat Jun 30 17:37:37 2018 PDT,Sat Jun 30 17:37:37 2018 PDT


In [1]:
from collections import Counter
from datetime import datetime
import os
import random
import shutil
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

#never print matching warnings
warnings.filterwarnings('ignore') 

#set to display all output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#import seaborn to draw pretty graphs
import seaborn as sns
sns.set_style('whitegrid')

#to load images
from PIL import Image

#download videos
from pytube import YouTube

#train test split
from sklearn.cross_validation import train_test_split

# 1. Prepare training and validation images

### Obtain images

Webscraped images from Google using Firefox extension "Google Image Downloader"

### Compile images

Compile the downloaded images, which were saved in folders on the desktop. The images have already been hand-sorted into categories but need to be compiled into a single location to make training easier and replicable (i.e., I have documentation of which images were or were not used in my model).

In [None]:
def find_imgs(name, savename, savepath,
              orig_dir='/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_orig/'):
    """Collect every hand-sorted ``.jpg`` of one category into a single folder.

    Walks ``orig_dir/<download folder>/<category folder>/`` and, for every
    category folder whose name contains ``name``, re-saves each ``.jpg`` into
    ``savepath`` as ``<savename>_<NNNN>.jpg`` (running counter, zero-padded to
    at least four digits). The working directory is left untouched.

    Parameters
    ----------
    name : str
        Substring a category folder name must contain to be included.
    savename : str
        Prefix of the output file names.
    savepath : str
        Directory the renamed copies are written to (created if missing).
    orig_dir : str, optional
        Root folder that holds the download folders.

    Returns
    -------
    int
        Number of images written.
    """
    os.makedirs(savepath, exist_ok=True)

    count = 0
    # List orig_dir itself (not whatever the current working directory is) and
    # sort every listing so the numbering is reproducible between runs.
    for currdir in sorted(os.listdir(orig_dir)):
        currpath = os.path.join(orig_dir, currdir)
        if not os.path.isdir(currpath):
            # skips .DS_Store and any other stray file
            continue
        for folder in sorted(os.listdir(currpath)):
            folderpath = os.path.join(currpath, folder)
            if name not in folder or not os.path.isdir(folderpath):
                continue
            print(folder)
            for file in sorted(os.listdir(folderpath)):
                if file.split(".")[-1] != "jpg":
                    continue
                count += 1
                # %04d pads 1..999 and also covers count == 1000, which the
                # previous if/elif chain skipped (reusing the name of image 999).
                newfile = os.path.join(savepath, '%s_%04d.jpg' % (savename, count))
                with Image.open(os.path.join(folderpath, file)) as im:
                    try:
                        im.save(newfile, "JPEG")
                    except OSError:
                        # the image could not be written as-is (presumably a
                        # mode JPEG does not support); retry as RGB
                        im.convert('RGB').save(newfile, "JPEG")
    return count

In [None]:
# Gather every hand-sorted "warrior 2" image into one folder.
name = 'warrior_2_GOOD'
savename = 'warrior2_img'
savepath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_pract5/imgs/warrior2'

count = find_imgs(name=name, savename=savename, savepath=savepath)
print('\nfound %d imgs' % count)

In [None]:
# Gather every hand-sorted "triangle pose" image into one folder.
name = 'triangle_pose_GOOD'
savename = 'trianglepose_img'
savepath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_pract5/imgs/trianglepose'

count = find_imgs(name=name, savename=savename, savepath=savepath)
print('\nfound %d imgs' % count)

In [None]:
# Gather every hand-sorted "reverse warrior" image into one folder.
name = 'reverse_warrior_GOOD'
savename = 'reversewarrior_img'
savepath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_pract5/imgs/reversewarrior'

count = find_imgs(name=name, savename=savename, savepath=savepath)
print('\nfound %d imgs' % count)

In [None]:
# Gather every image from folders tagged "OTHER" into one folder.
name = 'OTHER'
savename = 'other_img'
savepath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_pract5/imgs/other'

count = find_imgs(name=name, savename=savename, savepath=savepath)
print('\nfound %d imgs' % count)

### Select 500 random images per category

To keep the groups of images balanced, I selected only 500 images per category, so that each category contains 1000 images after flipping. The images were flipped after pre-processing in OpenPose to prevent any one category from being biased toward left- or right-facing images.

In [None]:
def select_images(ipath, opath, num, seed=None):
    """Copy a random sample of ``num`` files from ``ipath`` into ``opath``.

    ``.DS_Store`` and sub-directories are removed from the candidate list
    *before* sampling, so exactly ``num`` files are copied. Files are copied
    byte-for-byte (no JPEG re-encoding) and keep their names. The working
    directory is left untouched.

    Parameters
    ----------
    ipath : str
        Folder to sample from.
    opath : str
        Folder the sample is copied into (created if missing).
    num : int
        Number of files to select.
    seed : int, optional
        Seed for a private random generator, for a reproducible sample.
        When omitted, the module-level ``random`` state is used.

    Raises
    ------
    ValueError
        If ``ipath`` holds fewer than ``num`` candidate files.
    """
    # sorted so that a given seed always picks the same files
    files = sorted(
        f for f in os.listdir(ipath)
        if f != '.DS_Store' and os.path.isfile(os.path.join(ipath, f))
    )
    if num > len(files):
        raise ValueError('requested %d images but only %d available in %s'
                         % (num, len(files), ipath))

    rng = random if seed is None else random.Random(seed)
    os.makedirs(opath, exist_ok=True)
    for file in rng.sample(files, num):
        shutil.copyfile(os.path.join(ipath, file), os.path.join(opath, file))

In [None]:
# other
# NOTE(review): the text above says 500 images per category, but 100 are
# sampled here -- confirm which number is intended.
ipath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/imgs/other'
opath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/imgs_random/other'
select_images(ipath=ipath, opath=opath, num=100)

In [None]:
# reverse warrior: copy a random sample of 100 images
ipath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/imgs/reversewarrior'
opath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/imgs_random/reversewarrior'
select_images(ipath=ipath, opath=opath, num=100)

In [None]:
# triangle pose: copy a random sample of 100 images
ipath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/imgs/trianglepose'
opath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/imgs_random/trianglepose'
select_images(ipath=ipath, opath=opath, num=100)

In [None]:
# warrior2: copy a random sample of 100 images
ipath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/imgs/warrior2'
opath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/imgs_random/warrior2'
select_images(ipath=ipath, opath=opath, num=100)

### OpenPose

Ran the images through OpenPose. Blending was disabled so as to extract only the keypoint skeletons instead of the full image. Because the images were processed on a CPU, I had to use a static net resolution.

The following command must be run from the terminal while in the openpose_master directory:

./build/examples/openpose/openpose.bin --image_dir <'load_directory'> --write_images <'save_directory'> --write_images_format jpg --net_resolution 656x368 --disable_blending


### Flip images

The images were flipped to prevent any one category from being biased toward left- or right-facing images. Flipping occurred after OpenPose processing so as to limit the number of images that had to be processed in OpenPose and speed up the pre-processing pipeline.

In [None]:
def imgs_flip(file, size, ipath, opath):
    """Save an image and its left-right mirrored twin into ``opath``.

    The image ``ipath/file`` is written to ``opath`` twice: once unchanged
    under its own name and once mirrored as ``<stem>_flipped.jpg``. Paths are
    joined explicitly instead of changing the working directory, so relative
    ``ipath``/``opath`` are both resolved against the caller's directory.

    Parameters
    ----------
    file : str
        File name of the image inside ``ipath``.
    size : object
        Unused; kept so existing calls keep working.
    ipath : str
        Folder containing the source image.
    opath : str
        Existing folder that receives both output images.
    """
    newfile = os.path.splitext(file)[0] + '_flipped.jpg'

    # the context manager closes the source file once both copies are written
    with Image.open(os.path.join(ipath, file)) as im:
        newim = im.transpose(Image.FLIP_LEFT_RIGHT)

        # save new and old image
        im.save(os.path.join(opath, file), "JPEG")
        newim.save(os.path.join(opath, newfile), "JPEG")

In [None]:
ipath ='/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_pract5/imgs_random_openpose_noblend'
opath ='/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_pract5/imgs_random_openpose_noblend_flipped'

# For every sub-folder of ipath, write each .jpg plus a mirrored
# '<stem>_flipped.jpg' into the sub-folder of the same name under opath.
for folder in sorted(os.listdir(ipath)):

    origpath = os.path.join(ipath, folder)
    savepath = os.path.join(opath, folder)

    # skips .DS_Store and any other stray file
    if not os.path.isdir(origpath):
        continue

    print(folder)

    # the output folder may not exist yet on a fresh run
    os.makedirs(savepath, exist_ok=True)

    for file in sorted(os.listdir(origpath)):
        if file.split(".")[-1] == "jpg":
            # reuse the helper instead of repeating its body inline;
            # its `size` argument is unused
            imgs_flip(file, None, origpath, savepath)

### Split data into train (80%) and validation (20%)

In [None]:
#define paths
ipath     ='/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_pract5/imgs_random_openpose_noblend_flipped'
train_path='/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_pract5/model/train'
valid_path='/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_pract5/model/validation'

# fixed seed so the same images land in train/validation on every run
split_seed = 42

for folder in sorted(os.listdir(ipath)):
    origpath = os.path.join(ipath, folder)
    if not os.path.isdir(origpath):
        # skips .DS_Store and any other stray file
        continue
    print(folder)
    train_savepath = os.path.join(train_path, folder)
    valid_savepath = os.path.join(valid_path, folder)

    #find all imgs; .DS_Store can appear anywhere in the listing, not only first
    files = sorted(f for f in os.listdir(origpath) if f != '.DS_Store')
    if not files:
        print('  no images found, skipping')
        continue

    # NOTE(review): an image and its '_flipped' twin are split independently,
    # so a mirrored copy of a training image can end up in validation --
    # confirm this is acceptable for the model evaluation.
    x_train, x_test = train_test_split(files, test_size=0.2, random_state=split_seed)
    print('  %d images: %d train / %d validation' % (len(files), len(x_train), len(x_test)))

    # plain byte copies: no need to decode and re-encode the JPEGs
    for subset, dest in ((x_train, train_savepath), (x_test, valid_savepath)):
        os.makedirs(dest, exist_ok=True)
        for file in subset:
            shutil.copyfile(os.path.join(origpath, file), os.path.join(dest, file))


# 2. Create test set for cross-validation using images collected separately from YouTube

### Download videos for image directory

In [None]:
from pytube import YouTube
import os

def downloadYouTube(videourl, path):
    """Download a progressive MP4 stream of a YouTube video into ``path``.

    Of the video's progressive ``mp4`` streams, the first one after ordering
    by resolution in descending order is downloaded.

    Parameters
    ----------
    videourl : str
        URL of the YouTube video.
    path : str
        Target directory (created if missing).

    Raises
    ------
    ValueError
        If the video offers no progressive MP4 stream.
    """
    stream = (
        YouTube(videourl).streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    # first() yields None when the filter matched nothing; fail with a clear
    # message instead of an AttributeError on None.download
    if stream is None:
        raise ValueError('no progressive mp4 stream available for %s' % videourl)

    os.makedirs(path, exist_ok=True)
    stream.download(path)

# folder that receives the downloaded video
savedir = '/Users/juliannefreeman/Documents/Videos'
downloadYouTube(videourl='https://www.youtube.com/watch?v=9B2Bpk135iA', path=savedir)

### Split into frames

In [None]:
import cv2

# Save every `frame_step`-th frame of the video as frame<N>.jpg in savedir.
video_file = 'How To Do Warrior 2 Yoga For Beginners.mp4'
frame_step = 20

vidcap = cv2.VideoCapture(os.path.join(savedir, video_file))
count = 0   # index of the frame currently held in `image`
saved = 0
success, image = vidcap.read()
while success:
    # `image` is only written after a successful read, so the first frame is
    # no longer skipped and the failed read at the end of the video is never
    # passed to imwrite
    if count % frame_step == 0:
        cv2.imwrite(os.path.join(savedir, 'frame%d.jpg' % count), image)
        saved += 1
    count += 1
    success, image = vidcap.read()
vidcap.release()

print('read %d frames, saved %d' % (count, saved))

### Hand organized images from each video into subfolders

### Compile all images into folders

In [20]:
def find_testings(pose, savename, savepath,
                  path='/Users/juliannefreeman/Documents/Insight/Project/imgs/imgs_pract5/model_accuracy/videos/'):
    """Collect the ``.jpg`` frames of one pose from every video folder.

    Walks ``path/<video folder>/<pose folder>/`` and, for every pose folder
    whose name contains ``pose``, re-saves each ``.jpg`` into ``savepath`` as
    ``<savename>_<NNNN>_<video folder>.jpg`` (running counter, zero-padded to
    at least four digits). The working directory is left untouched.

    Parameters
    ----------
    pose : str
        Substring a pose folder name must contain to be included.
    savename : str
        Prefix of the output file names.
    savepath : str
        Directory the renamed copies are written to (created if missing).
    path : str, optional
        Root folder that holds one sub-folder per video.

    Returns
    -------
    int
        Number of images written.
    """
    os.makedirs(savepath, exist_ok=True)

    count = 0
    # sorted listings make the numbering reproducible between runs
    for currdir in sorted(os.listdir(path)):  # video folders
        currpath = os.path.join(path, currdir)
        if not os.path.isdir(currpath):
            # skips .DS_Store and any other stray file
            continue
        for folder in sorted(os.listdir(currpath)):  # pose folders
            folderpath = os.path.join(currpath, folder)
            if pose not in folder or not os.path.isdir(folderpath):
                continue
            for file in sorted(os.listdir(folderpath)):  # imgs
                if file.split(".")[-1] != "jpg":
                    continue
                count += 1
                # %04d pads 1..999 and also covers count == 1000, which the
                # previous if/elif chain skipped (reusing the name of image 999).
                newfile = os.path.join(
                    savepath, '%s_%04d_%s.jpg' % (savename, count, currdir))
                with Image.open(os.path.join(folderpath, file)) as im:
                    try:
                        im.save(newfile, "JPEG")
                    except OSError:
                        # the image could not be written as-is (presumably a
                        # mode JPEG does not support); retry as RGB
                        im.convert('RGB').save(newfile, "JPEG")
    return count

In [8]:
# Gather the hand-organized 'other' frames from all video folders.
pose = 'other'
savename = pose
savepath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final/' + pose + '/'

find_testings(pose=pose, savename=savename, savepath=savepath)

2227

In [19]:
# Gather the hand-organized 'reversewarrior' frames from all video folders.
pose = 'reversewarrior'
savename = pose
savepath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final/' + pose + '/'

find_testings(pose=pose, savename=savename, savepath=savepath)

49

In [10]:
# Gather the hand-organized 'trianglepose' frames from all video folders.
pose = 'trianglepose'
savename = pose
savepath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final/' + pose + '/'

find_testings(pose=pose, savename=savename, savepath=savepath)

140

In [11]:
# Gather the hand-organized 'warrior2' frames from all video folders.
pose = 'warrior2'
savename = pose
savepath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final/' + pose + '/'

find_testings(pose=pose, savename=savename, savepath=savepath)

163

### flip images
I decided to flip the images BEFORE running OpenPose because I have fewer than 100 images in some categories and need to flip the images in order to augment the sample size.

In [21]:
def imgs_flip(file, size, ipath, opath):
    """Save an image and its left-right mirrored twin into ``opath``.

    The image ``ipath/file`` is written to ``opath`` twice: once unchanged
    under its own name and once mirrored as ``<stem>_flipped.jpg``. Paths are
    joined explicitly instead of changing the working directory, so relative
    ``ipath``/``opath`` are both resolved against the caller's directory.

    Parameters
    ----------
    file : str
        File name of the image inside ``ipath``.
    size : object
        Unused; kept so existing calls keep working.
    ipath : str
        Folder containing the source image.
    opath : str
        Existing folder that receives both output images.
    """
    newfile = os.path.splitext(file)[0] + '_flipped.jpg'

    # the context manager closes the source file once both copies are written
    with Image.open(os.path.join(ipath, file)) as im:
        newim = im.transpose(Image.FLIP_LEFT_RIGHT)

        # save new and old image
        im.save(os.path.join(opath, file), "JPEG")
        newim.save(os.path.join(opath, newfile), "JPEG")

In [22]:
ipath ='/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final/'
opath ='/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final_flip/'
size=(800,600)

# For every pose folder under ipath, write each .jpg plus a mirrored
# '<stem>_flipped.jpg' into the folder of the same name under opath.
for folder in sorted(os.listdir(ipath)): #poses

    origpath = os.path.join(ipath, folder)
    savepath = os.path.join(opath, folder)

    # skips .DS_Store and any other stray file
    if not os.path.isdir(origpath):
        continue

    print(folder)

    # the output folder may not exist yet on a fresh run
    os.makedirs(savepath, exist_ok=True)

    for file in sorted(os.listdir(origpath)):
        if file.split(".")[-1] == "jpg":
            # reuse the helper instead of repeating its body inline;
            # `size` is passed through but not used by imgs_flip
            imgs_flip(file, size, origpath, savepath)

other
reversewarrior
trianglepose
warrior2


### sample 100 images per category

In [14]:
def random_sample(ipath, opath, sample_size, seed=None):
    """Copy a random sample of ``sample_size`` files from ``ipath`` to ``opath``.

    ``.DS_Store`` and sub-directories are removed from the candidate list
    before sampling, so exactly ``sample_size`` files are copied. Files are
    copied byte-for-byte (no JPEG re-encoding) and keep their names. The
    working directory is left untouched.

    Parameters
    ----------
    ipath : str
        Folder to sample from.
    opath : str
        Folder the sample is copied into (created if missing).
    sample_size : int
        Number of files to select.
    seed : int, optional
        Seed for a private random generator, for a reproducible sample.
        When omitted, the module-level ``random`` state is used.

    Raises
    ------
    ValueError
        If ``ipath`` holds fewer than ``sample_size`` candidate files.
    """
    # sorted so that a given seed always picks the same files
    files = sorted(
        f for f in os.listdir(ipath)
        if f != '.DS_Store' and os.path.isfile(os.path.join(ipath, f))
    )
    if sample_size > len(files):
        raise ValueError('requested %d images but only %d available in %s'
                         % (sample_size, len(files), ipath))

    rng = random if seed is None else random.Random(seed)
    os.makedirs(opath, exist_ok=True)
    for file in rng.sample(files, sample_size):
        shutil.copyfile(os.path.join(ipath, file), os.path.join(opath, file))

In [27]:
# Draw the test sample for the 'other' category.
pose = 'other'
sample_size = 100

ipath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final_flip/' + pose
opath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final_flip_sample/' + pose

random_sample(ipath=ipath, opath=opath, sample_size=sample_size)

In [28]:
# Draw the test sample for the 'warrior2' category.
pose = 'warrior2'
sample_size = 100

ipath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final_flip/' + pose
opath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final_flip_sample/' + pose

random_sample(ipath=ipath, opath=opath, sample_size=sample_size)

In [29]:
# Draw the test sample for the 'reversewarrior' category.
pose = 'reversewarrior'
sample_size = 100

ipath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final_flip/' + pose
opath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final_flip_sample/' + pose

random_sample(ipath=ipath, opath=opath, sample_size=sample_size)

In [31]:
# Draw the test sample for the 'trianglepose' category.
pose = 'trianglepose'
sample_size = 100

ipath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final_flip/' + pose
opath = '/Users/juliannefreeman/Documents/Insight/Project/imgs/final/model_accuracy/final_flip_sample/' + pose

random_sample(ipath=ipath, opath=opath, sample_size=sample_size)

### OpenPose

The following command must be run from the terminal while in the openpose_master directory:

./build/examples/openpose/openpose.bin --image_dir <'load_directory'> --write_images <'save_directory'> --write_images_format jpg --net_resolution 656x368 --disable_blending
