#RxId : CSV Data Cleaning

Data Source : https://pillbox.nlm.nih.gov/developers.html
https://dev.socrata.com/foundry/datadiscovery.nlm.nih.gov/crzr-uvwg


Issue :  Two CSV files were downloaded from the above site.  

    Pillbox.NO.ID.csv  has useful meds data but no image_id  field to link it to a filename in the image library.

    Pillbox.NO.IMAGE ID.csv  lacks useful meds data but HAS image_id  field to link it to a filename in the image library.

Solution:   Clean CSV files individually and merge into single CSV that will be used to load an AWS RDS database


### Load CSVs into dataframes

In [5]:
import pandas as pd
pd.options.display.max_columns = None

In [2]:
url1="https://raw.githubusercontent.com/labs12-rxid/DS/master/CSV/Pillbox.NO.ID.csv"
df1=pd.read_csv(url1)

url2="https://raw.githubusercontent.com/labs12-rxid/DS/master/CSV/Pillbox.IMAGE%20ID.csv"
df2=pd.read_csv(url2)


print(df1.shape)
df2.shape

  interactivity=interactivity, compiler=compiler, result=result)


(62898, 72)


(62898, 24)

In [3]:
df1.columns

Index(['ID', 'Enabled?', 'created at', 'updated at', 'spp', 'setid', 'splsize',
       'pillbox_size', 'splshape', 'splshape_text', 'pillbox_shape_text',
       'splscore', 'pillbox_score', 'splimprint', 'pillbox_imprint',
       'splcolor', 'splcolor_text', 'pillbox_color_text', 'spl_strength',
       'spl_ingredients', 'spl_inactive_ing', 'source', 'rxtty', 'rxstring',
       'rxcui', 'product_code', 'part_num', 'part_medicine_name', 'ndc9',
       'ndc_labeler_code', 'ndc_product_code', 'medicine_name',
       'marketing_act_code', 'effective_time', 'file_name',
       'equal_product_code', 'dosage_form', 'document_type',
       'dea_schedule_code', 'dea_schedule_name', 'author_type', 'author',
       'approval_code', 'image_source', 'splimage', 'has_image', 'epc_match',
       'version_number', 'laberer_code', 'application_number', 'updated',
       'stale', 'new', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55',
       'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59',
      

###  Rename df2.id to df2.ID  to match df1

In [4]:
df2.rename(columns={'id':'ID'}, inplace=True)
df2.columns

Index(['ID', 'spl_id', 'SETID', 'spp', 'NDC9', 'PRODUCT_CODE',
       'EQUAL_PRODUCT_CODE', 'author', 'SPLIMPRINT', 'SPLCOLOR', 'SPLSHAPE',
       'SPLSIZE', 'SPLSCORE', 'DEA_SCHEDULE_CODE', 'INGREDIENTS',
       'SPL_INACTIVE_ING', 'RXCUI', 'RXTTY', 'RXSTRING', 'image_id',
       'IMAGE_SOURCE', 'HAS_IMAGE', 'FROM_SIS', 'NO_RXCUI'],
      dtype='object')

### Drop Useless/Duplicated Columns from df1 & df2

In [5]:
drop_col_1=['created at', 'updated at', 'ndc9', 'author',
            'rxstring', 'has_image',
       'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55',
       'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59',
       'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63',
       'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67',
       'Unnamed: 68', 'Unnamed: 69', 'Unnamed: 70', 'Unnamed: 71']
df1.drop(columns=drop_col_1, inplace=True)
df1.columns

Index(['ID', 'Enabled?', 'spp', 'setid', 'splsize', 'pillbox_size', 'splshape',
       'splshape_text', 'pillbox_shape_text', 'splscore', 'pillbox_score',
       'splimprint', 'pillbox_imprint', 'splcolor', 'splcolor_text',
       'pillbox_color_text', 'spl_strength', 'spl_ingredients',
       'spl_inactive_ing', 'source', 'rxtty', 'rxcui', 'product_code',
       'part_num', 'part_medicine_name', 'ndc_labeler_code',
       'ndc_product_code', 'medicine_name', 'marketing_act_code',
       'effective_time', 'file_name', 'equal_product_code', 'dosage_form',
       'document_type', 'dea_schedule_code', 'dea_schedule_name',
       'author_type', 'approval_code', 'image_source', 'splimage', 'epc_match',
       'version_number', 'laberer_code', 'application_number', 'updated',
       'stale', 'new'],
      dtype='object')

In [6]:
drop_col_2=['SETID', 'spp', 'INGREDIENTS','SPL_INACTIVE_ING','SPLSIZE', 'SPLSCORE',
           'SPLIMPRINT', 'SPLCOLOR',	'SPLSHAPE', 'RXCUI',	'RXTTY', 'IMAGE_SOURCE']
df2.drop(columns=drop_col_2, inplace=True)
df2.columns

Index(['ID', 'spl_id', 'NDC9', 'PRODUCT_CODE', 'EQUAL_PRODUCT_CODE', 'author',
       'DEA_SCHEDULE_CODE', 'RXSTRING', 'image_id', 'HAS_IMAGE', 'FROM_SIS',
       'NO_RXCUI'],
      dtype='object')

### compare same  ID acrross Frames

In [105]:
df1.query('ID==3143')

Unnamed: 0,ID,Enabled?,spp,setid,splsize,pillbox_size,splshape,splshape_text,pillbox_shape_text,splscore,pillbox_score,splimprint,pillbox_imprint,splcolor,splcolor_text,pillbox_color_text,spl_strength,spl_ingredients,spl_inactive_ing,source,rxtty,rxcui,product_code,part_num,part_medicine_name,ndc_labeler_code,ndc_product_code,medicine_name,marketing_act_code,effective_time,file_name,equal_product_code,dosage_form,document_type,dea_schedule_code,dea_schedule_name,author_type,approval_code,image_source,splimage,epc_match,version_number,laberer_code,application_number,updated,stale,new
25440,3143,True,e4682a6e-9624-48b0-978f-95da00cfb78f-0093-7372-0,e4682a6e-9624-48b0-978f-95da00cfb78f,19.0,,C48336,CAPSULE,,1.0,,TEVA;7372,,C48328;C48325,PINK;WHITE,,AMLODIPINE BESYLATE 5 mg;BENAZEPRIL HYDROCHLOR...,AMLODIPINE BESYLATE[AMLODIPINE];BENAZEPRIL HYD...,FERROSOFERRIC OXIDE;ANHYDROUS DIBASIC CALCIUM ...,CORN;CROSPOVIDONE (15 MPA.S AT 5%);GELATIN,UNSPECIFIED;LACTOSE MONOHYDRATE;MAGNESIUM STE...,SCD,Amlodipine 5 MG / Benazepril hydrochloride 20 ...,898356,0093-7372,,937372,93,7372,Amlodipine and Benazepril Hydrochloride,active,20170823,a804a97e-d1e0-411b-b075-557ac09684c0.xml,,C25158,34391-3,,LABELER,Teva Pharmaceuticals USA,Inc.,SPL,93737201,True,,16,,


In [106]:
df2.query('ID==3143')

Unnamed: 0,ID,spl_id,NDC9,PRODUCT_CODE,EQUAL_PRODUCT_CODE,author,DEA_SCHEDULE_CODE,RXSTRING,image_id,HAS_IMAGE,FROM_SIS,NO_RXCUI
62894,3143,,937372,0093-7372,,"Teva Pharmaceuticals USA, Inc.",,Amlodipine 5 MG / Benazepril hydrochloride 20 ...,93737201,1,,


###  Combine dataframes

In [0]:
df_comb=pd.merge(df1,df2,how='left', on=['ID'])

In [111]:
df_comb.query('ID==3143')

Unnamed: 0,ID,Enabled?,spp,setid,splsize,pillbox_size,splshape,splshape_text,pillbox_shape_text,splscore,pillbox_score,splimprint,pillbox_imprint,splcolor,splcolor_text,pillbox_color_text,spl_strength,spl_ingredients,spl_inactive_ing,source,rxtty,rxcui,product_code,part_num,part_medicine_name,ndc_labeler_code,ndc_product_code,medicine_name,marketing_act_code,effective_time,file_name,equal_product_code,dosage_form,document_type,dea_schedule_code,dea_schedule_name,author_type,approval_code,image_source,splimage,epc_match,version_number,laberer_code,application_number,updated,stale,new,spl_id,NDC9,PRODUCT_CODE,EQUAL_PRODUCT_CODE,author,DEA_SCHEDULE_CODE,RXSTRING,image_id,HAS_IMAGE,FROM_SIS,NO_RXCUI
25440,3143,True,e4682a6e-9624-48b0-978f-95da00cfb78f-0093-7372-0,e4682a6e-9624-48b0-978f-95da00cfb78f,19.0,,C48336,CAPSULE,,1.0,,TEVA;7372,,C48328;C48325,PINK;WHITE,,AMLODIPINE BESYLATE 5 mg;BENAZEPRIL HYDROCHLOR...,AMLODIPINE BESYLATE[AMLODIPINE];BENAZEPRIL HYD...,FERROSOFERRIC OXIDE;ANHYDROUS DIBASIC CALCIUM ...,CORN;CROSPOVIDONE (15 MPA.S AT 5%);GELATIN,UNSPECIFIED;LACTOSE MONOHYDRATE;MAGNESIUM STE...,SCD,Amlodipine 5 MG / Benazepril hydrochloride 20 ...,898356,0093-7372,,937372,93,7372,Amlodipine and Benazepril Hydrochloride,active,20170823,a804a97e-d1e0-411b-b075-557ac09684c0.xml,,C25158,34391-3,,LABELER,Teva Pharmaceuticals USA,Inc.,SPL,93737201,True,,16,,,,937372,0093-7372,,"Teva Pharmaceuticals USA, Inc.",,Amlodipine 5 MG / Benazepril hydrochloride 20 ...,93737201,1,,


### Write out combined CSV

In [0]:
# df_comb.to_csv('Pills.Final.csv', index=False ) #header=['id','status_group'])

# USING FINAL CSV FILE

In [2]:
df = pd.read_csv('Pills.Final.csv')
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ID,Enabled?,spp,setid,splsize,pillbox_size,splshape,splshape_text,pillbox_shape_text,splscore,pillbox_score,splimprint,pillbox_imprint,splcolor,splcolor_text,pillbox_color_text,spl_strength,spl_ingredients,spl_inactive_ing,source,rxtty,rxcui,product_code,part_num,part_medicine_name,ndc_labeler_code,ndc_product_code,medicine_name,marketing_act_code,effective_time,file_name,equal_product_code,dosage_form,document_type,dea_schedule_code,dea_schedule_name,author_type,approval_code,image_source,splimage,epc_match,version_number,laberer_code,application_number,spl_id,NDC9,PRODUCT_CODE,EQUAL_PRODUCT_CODE,author,DEA_SCHEDULE_CODE,RXSTRING,image_id,HAS_IMAGE,FROM_SIS,NO_RXCUI
0,20466,True,9e8e429c-1960-4acd-8bc5-5002d4c6a6ac-21695-995-2,9e8e429c-1960-4acd-8bc5-5002d4c6a6ac,3.0,,C48348,ROUND,,1.0,,dp;519,,C48329,GREEN,,,,D&C YELLOW NO. 10;FD&C BLUE NO. 1;FD&C YELLOW ...,MICROCRYSTALLINE;STARCH,CORN;,BPCK,{21 (Ethinyl Estradiol 0.02 MG / Levonorgestre...,750268,21695-995,Inert,216950995.0,21695,995,Aviane,active,20111004,9e8e429c-1960-4acd-8bc5-5002d4c6a6ac.xml,0555-9045,C42998,34391-3,,LABELER,Rebel Distributors Corp,C73584,,False,1,1.0,,216950995,21695-995,0555-9045,Rebel Distributors Corp,,{21 (Ethinyl Estradiol 0.02 MG / Levonorgestre...,,0,1.0,
1,42363,True,782c1bb9-75c4-4258-97b6-fd776ca970f5-55154-3432-0,782c1bb9-75c4-4258-97b6-fd776ca970f5,15.0,,C48345,OVAL,,1.0,,GGD6,,C48325,WHITE,,AZITHROMYCIN MONOHYDRATE 250 mg;,AZITHROMYCIN MONOHYDRATE[AZITHROMYCIN ANHYDROUS];,SILICON DIOXIDE;LECITHIN,SOYBEAN;MAGNESIUM STEARATE;MICROCRYSTALLINE C...,UNSPECIFIED;STARCH,HRX,SCD,Azithromycin 250 MG Oral Tablet,308460,0,,551543432,55154,3432,Azithromycin,active,20180110,b0acdcad-0d6d-4edc-a31f-7b1e3ab65ffc.xml,0781-1496,C42931,34391-3,,LABELER,Cardinal Health,,,FALSE,1.0,,551543432,55154-3432,0781-1496,Cardinal Health,,Azithromycin 250 MG Oral Tablet,,0,1.0,
2,48946,True,a39bb616-9e84-4142-8450-824f88528906-51285-120-2,a39bb616-9e84-4142-8450-824f88528906,6.0,,C48348,ROUND,,1.0,,dp;331,,C48329,GREEN,,,,D&C YELLOW NO. 10;FD&C BLUE NO. 1;FD&C YELLOW ...,CORN;,HRX,{21 (Desogestrel 0.15 MG / Ethinyl Estradiol 0...,762002,51285-120,2,512850120,51285.0,120,Mircette,active,20170519,c4c9e9dc-7e2a-459a-93e5-e3347fb9b1ed.xml,,C42998,,,,Teva Women's Health,Inc.,C73584,,False,,3.0,,512850120,51285-120,,"Teva Women's Health, Inc.",,{21 (Desogestrel 0.15 MG / Ethinyl Estradiol 0...,,0,,
3,43308,True,d08cc0ab-4cb5-4290-8d46-5d8b66e8472e-65084-453-0,d08cc0ab-4cb5-4290-8d46-5d8b66e8472e,14.0,,C48345,OVAL,,2.0,,NORCO;539,,C48325,WHITE,,HYDROCODONE BITARTRATE 10 mg;ACETAMINOPHEN 325...,HYDROCODONE BITARTRATE[HYDROCODONE];ACETAMINOP...,CROSCARMELLOSE SODIUM;CROSPOVIDONE (15 MPA.S A...,UNSPECIFIED;STARCH,CORN;STEARIC ACID;,SCD,Acetaminophen 325 MG / Hydrocodone Bitartrate ...,856999,65084-453,,650840453.0,65084,453,Hydrocodone Bitatrate and Acetaminophen,active,20170614,1aee5ab4-6f3d-44f4-952b-0a2aa7915679.xml,52544-161,C42998,34391-3,,LABELER,Mckesson Rxpak Inc,C73584,,False,1,3.0,,650840453,65084-453,52544-161,Mckesson Rxpak Inc,,Acetaminophen 325 MG / Hydrocodone Bitartrate ...,,0,1.0,
4,18502,True,fa1ec93e-35b0-43c9-81ee-69f7e8abea87-65841-748-0,fa1ec93e-35b0-43c9-81ee-69f7e8abea87,19.0,,C48336,CAPSULE,,1.0,,ZA;60;160;mg,,C48333;C48333,BLUE;BLUE,,PROPRANOLOL HYDROCHLORIDE 160 mg;,PROPRANOLOL HYDROCHLORIDE[PROPRANOLOL];,ETHYLCELLULOSES;FD&C BLUE NO. 1;FERROSOFERRIC ...,MICROCRYSTALLINE;,HRX,24 HR Propranolol Hydrochloride 160 MG Extende...,856481,65841-748,0,658410748,65841.0,748,PROPRANOLOL HYDROCHLORIDE,active,20140430,e55102eb-608c-481d-af93-1caf96b37013.xml,,C42916,,,,Cadila Healthcare Limited,C73584,,False,,1,,,658410748,65841-748,,Cadila Healthcare Limited,,24 HR Propranolol Hydrochloride 160 MG Extende...,,0,,


In [49]:
# DATAFRAME with 'image_id' of ROUND observations
df_round = df['image_id'][df['splshape'] == 'C48348'][df['HAS_IMAGE'] == 1]
df_round.shape

(4132,)

In [108]:
# DATAFRAME with 'image_id' of OVAL observations
df_oval = df['image_id'][df['splshape'] == 'C48345'][df['HAS_IMAGE'] == 1]
df_oval.shape

(2320,)

In [50]:
# DATAFRAME with 'image_id' of CAPSULE observations
df_capsule = df['image_id'][df['splshape'] == 'C48336'][df['HAS_IMAGE'] == 1]
df_capsule.shape

(1958,)

In [51]:
# Quick test on CAPSULE files
df_captest = df[['image_id', 'splshape', 'splshape_text']][df['splshape'] == 'C48336'][df['HAS_IMAGE'] == 1]

  


In [52]:
df_captest.shape

(1958, 3)

In [45]:
df_captest['splshape_text'].value_counts()

CAPSULE    1958
Name: splshape_text, dtype: int64

In [109]:
# Quick test on OVAL files
df_ovaltest = df[['image_id', 'splshape', 'splshape_text']][df['splshape'] == 'C48345'][df['HAS_IMAGE'] == 1]

  """Entry point for launching an IPython kernel.


In [110]:
df_ovaltest.shape

(2320, 3)

In [111]:
df_ovaltest['splshape_text'].value_counts()

OVAL    2320
Name: splshape_text, dtype: int64

In [54]:
len(df_capsule)

1958

### Copying ROUND image files to 'round_images' folder

In [37]:
# import os
# import shutil
# src = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\pillbox_images'
# dest = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\round_images'
# src_files = os.listdir(src)
# for file_name in df_round:
#     full_file_name = os.path.join(src, file_name + '.jpg')
#     if (os.path.isfile(full_file_name)):
#         shutil.copy(full_file_name, dest)

In [32]:
pwd

'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\CSV'

### Copying OVAL image files to 'oval_images' folder

In [123]:
# import os
# import shutil
# src = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\pillbox_images'
# dest = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\oval_images'
# src_files = os.listdir(src)
# for file_name in df_oval:
#     full_file_name = os.path.join(src, file_name + '.jpg')
#     if (os.path.isfile(full_file_name)):
#         shutil.copy(full_file_name, dest)

### Copying CAPSULE image files to 'capsule_images' folder

In [65]:
# import os
# import shutil
# src = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\pillbox_images'
# dest = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\capsule_images'
# src_files = os.listdir(src)
# for file_name in df_capsule:
#     full_file_name = os.path.join(src, file_name + '.jpg')
#     if (os.path.isfile(full_file_name)):
#         shutil.copy(full_file_name, dest)

# GETTING IMAGES INTO `TRAIN` & `TEST` FOLDER

### Moving CAPSULE images to Training & Test folders

In [None]:
# import os
# import shutil
# src = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\capsule_images'
# dest = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\data\\train\\capsule'
# src_files = os.listdir(src)
# ctr = 1

# for file_name in df_capsule:
#     if ctr <= 1025:
#         full_file_name = os.path.join(src, file_name + '.jpg')
#         if (os.path.isfile(full_file_name)):
#             shutil.move(full_file_name, dest)
#             ctr += 1

In [91]:
df_oval.head()

32                             675440060
92                00093-4740-01_F02D781B
132    00093-7569-56_NLMIMAGE10_2F4397AC
182                            003780734
207                            584680124
Name: image_id, dtype: object

### Moving CAPSULE images to Test folder

In [124]:
def img_traintest(shape, df):
    """
    Function to get images into Train and Test folders. 
    1025 for Train and 416 for Test.
    
    shape: Takes in a string for the shape.
    df: Takes in Pandas DataFrame/Series with images file names only.
    
    """

    import os
    import shutil
    src = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\' + shape + '_images'
    dest1 = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\data\\train\\' + shape
    dest2 = 'C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\data\\test\\' + shape
    src_files = os.listdir(src)
    ctr1 = 1
    ctr2 = 1
#     print(src)
#     print(dest1)
#     print(dest2)
#     print(f'df = {df.shape}')
    
    for file_name in df:
#         print(f'file_name:\n{file_name}')
        if ctr1 <= 1025:
            full_file_name = os.path.join(src, file_name + '.jpg')
            # print(f'full_file_name:\n{full_file_name}')
            if (os.path.isfile(full_file_name)):
                shutil.move(full_file_name, dest1)
                ctr1 += 1
                
        if ctr2 <= 416:
            full_file_name = os.path.join(src, file_name + '.jpg')
            if (os.path.isfile(full_file_name)):
                shutil.move(full_file_name, dest2)
                ctr2 += 1

In [69]:
# img_traintest("capsule", df_capsule)

In [81]:
# img_traintest("round", df_round)

In [125]:
# img_traintest('oval', df_oval)

# Splitting images in half vertically

#### ROUND "Train" Folder

In [148]:
def image_splitter(shape, train_or_test):
    '''
    Function to split images in train or test folder according based on shape.
    Function takes two strings, one for shape (i.e., round, capsule, oval) and another for either "train" or "test".
    '''
    
    import os
    import imageio
    
    # partial path for creating new directory and saving files
    partial_path = "C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\data\\"
    
    # making new folder for image halves
    new_folder = partial_path + train_or_test + "\\" + shape + "_h\\"

    os.makedirs(new_folder)
    
    # path to loop through
    folder_path = partial_path + train_or_test + "\\" + shape
    # print(folder_path)
    
    # image counter for image halves maded and saved
    img_ctr = 0
    
    # looping through all images in folder and processing
    for file in os.listdir(folder_path):
        file_name = os.path.join(folder_path, file)
        # print(file_name)
        img = imageio.imread(file_name)
        height, width = img.shape[:2]

        # Cut the image in half
        width_cutoff = width // 2
        s1 = img[:, :width_cutoff]
        s2 = img[:, width_cutoff:]
        
        # Save each half
        filepath_h1 = new_folder + "h1_" + file
        filepath_h2 = new_folder + "h2_" + file
        
        imageio.imwrite(filepath_h1, s1)
        imageio.imwrite(filepath_h2, s2)
        img_ctr +=2
        
    print('Total images (halves) saved:', img_ctr)

In [143]:
# image_splitter('round', 'train')

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\000024117.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00006-0221-31_DA15ED0F.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00007-4641-13_FD18FEA7.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\000084188.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\000090032.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\000090114.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\000090121.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\000094544.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\trai

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00093-0834-01_NLMIMAGE10_2D4296C4.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00093-1023-01_NLMIMAGE10_EB35F58F.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00093-2047-56_EA29752B.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00093-2048-56_632B3189.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00093-2063-01_NLMIMAGE10_6F38B795.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00093-2203-01_CE27E77F.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00093-2204-01_8426C266.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00093-2931-01_4A2B2559.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00169-0081-81_CD1C66E3.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00169-0084-81_F11C78E3.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00172-2089-60_9C26CE66.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00172-2662-46_592DAC9D.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00172-2908-60_SPLIMAGE30_BC26DE56.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00172-4097-60.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00172-5728-60_SPLIMAGE30_9E27CF4E.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00172375760.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00378-3495-01_15238AEC.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00378-3547-52_NLMIMAGE10_8046C026.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00378-3850-01_NLMIMAGE10_D53BEAFF.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00378-4024-01_FE157F5B.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00378-4595-77_NLMIMAGE10_1E450F38.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00378-5123-01_D223E93F.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00378-5124-01_3C219E7C.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00378-6043-28_NLMIMAGE10_3E419F4C.jpg
C:\Users\Gutierrez\Documents\DataScience\lam

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\004300171.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00456-0459-01_760EBB35.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\004561525.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\004563330.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00527-1341-10_3D081EA0.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00527-1349-10_53082981.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\00527-1537-30_2215915C.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005271313.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005271318.jpg
C:\Users\Gutierrez\Documents\DataScienc

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005915522.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005915540.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005915543.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005915553.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005915557.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005915660.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005915884.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005970190.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\005970191.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\0060

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\009041086.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\009041258.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\009042015.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\009045068.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\009045643.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\009045758.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\009045850.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\009045891.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\009045892.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\0090

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\2bfe798f-7e4c-62fa-e054-00144ff8d46c.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\2e92d053-1618-143b-e054-00144ff8d46c.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\31722-0209-30_670EB385.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\31722-0256-30_NLMIMAGE10_1B478DDC.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\31722-0278-10_241D925C.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\31722-0519-01_8C07C65E.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\31722-0520-10_9007C80E.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\317220200.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\433530911.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\433530912.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\433530913.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\433530914.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\43386-0161-06_NLMIMAGE10_0B3C05C0.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\433860330.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\43547-0248-50_10050818.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\43547-0272-10_NLMIMAGE10_5941ACCD.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\43547-0336-10_NLMIMAGE10_F139F8FF.jpg
C:\Use

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\510790474.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\510790565.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\510790575.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\510790684.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\510790697.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\510790734.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\510790811.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\510790847.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\510790865.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\5107

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\55253-0600-30_NLMIMAGE10_C635637B.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\555ce88a-3195-3480-e054-00144ff88e88.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\55b2af63-818f-5c82-e054-00144ff88e88.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\57664-0104-88_0B0405B0.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\57664-0397-51_CB08E5C7.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\57664-0477-58_8405C24E.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\576640107.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\576640275.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\65162-0361-10_0823842C.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\65162-0512-10_151C8AE4.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\65162-0514-50_BB1D5DAA.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\65162-0521-11_NLMIMAGE10_0A3E8554.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\65162-0745-10_NLMIMAGE10_0841847C.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\651620188.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\651620554.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\651620571.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\651620572.jpg
C:\U

C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\762820302.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\762820334.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\762820417.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\762820418.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\762820422.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\76439-0141-11_NLMIMAGE10_8440C236.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\764390217.jpg
C:\Users\Gutierrez\Documents\DataScience\lambda_school\labs12-rxid\DS\data\train\round\764390309.jpg


In [None]:
# image_splitter('round', 'test')

# Splitting images in half horizontally

In [1]:
def image_hsplitter(shape, train_or_test):
    '''
    Function to split images in train or test folder according based on shape.
    Function takes two strings, one for shape (i.e., round, capsule, oval) and another for either "train" or "test".
    '''
    
    import os
    import imageio
    
    # partial path for creating new directory and saving files
    partial_path = "C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\data\\"
    
    # making new folder for image halves
    new_folder = partial_path + train_or_test + "\\" + shape + "_h\\"

    os.makedirs(new_folder)
    
    # path to loop through
    folder_path = partial_path + train_or_test + "\\" + shape
    # print(folder_path)
    
    # image counter for image halves maded and saved
    img_ctr = 0
    
    # looping through all images in folder and processing
    for file in os.listdir(folder_path):
        file_name = os.path.join(folder_path, file)
        # print(file_name)
        img = imageio.imread(file_name)
        height, width = img.shape[:2]

        # Cut the image in half horizontally
        height_cutoff = height // 2
        s1 = img[:height_cutoff, :]
        s2 = img[height_cutoff:, :]
        
        # Save each half
        filepath_h1 = new_folder + "h1_" + file
        filepath_h2 = new_folder + "h2_" + file
        
        imageio.imwrite(filepath_h1, s1)
        imageio.imwrite(filepath_h2, s2)
        img_ctr +=2
        
    print('Total images (halves) saved:', img_ctr)

#### CAPSULE "Train" Folder

In [8]:
image_hsplitter('capsule', 'train')

Total images (halves) saved: 1536


#### CAPSULE "Test" Folder

In [9]:
image_hsplitter('capsule', 'test')

Total images (halves) saved: 612


# Renaming files for Neural Network

In [2]:
# Need images labeled like "cat.1" or "dog.3" and so on

# You could then use it in your example like this:
# rename(r'c:\temp\xx', r'*.doc', r'new(%s)')
# The above example will convert all *.doc files in c:\temp\xx dir to new(%s).doc, 
# where %s is the previous base name of the file (without extension).

import glob, os

def rename_files(dir, pattern, titlePattern):
    ctr = 1
    for pathAndFilename in glob.iglob(os.path.join(dir, pattern)):
        title, ext = os.path.splitext(os.path.basename(pathAndFilename))
        os.rename(pathAndFilename, 
                  os.path.join(dir, titlePattern % ctr + ext))
        ctr += 1

#### Train images

In [20]:
# rename_files(r"C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\data\\train\\round_h",
#             r"*.jpg",
#             r"rnd.%s")

In [21]:
# rename_files(r"C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\data\\train\\capsule_h",
#             r"*.jpg",
#             r"cap.%s")

#### Test images

In [3]:
# rename_files(r"C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\data\\test\\round_h",
#             r"*.jpg",
#             r"rnd.%s")

In [4]:
# rename_files(r"C:\\Users\\Gutierrez\\Documents\\DataScience\\lambda_school\\labs12-rxid\\DS\\data\\test\\capsule_h",
#             r"*.jpg",
#             r"cap.%s")

# CHECKING FOR `COLOR` & `SHAPE`

In [14]:
df[df['splcolor_text'] == 'C48329']

Unnamed: 0,ID,Enabled?,spp,setid,splsize,pillbox_size,splshape,splshape_text,pillbox_shape_text,splscore,pillbox_score,splimprint,pillbox_imprint,splcolor,splcolor_text,pillbox_color_text,spl_strength,spl_ingredients,spl_inactive_ing,source,rxtty,rxcui,product_code,part_num,part_medicine_name,ndc_labeler_code,ndc_product_code,medicine_name,marketing_act_code,effective_time,file_name,equal_product_code,dosage_form,document_type,dea_schedule_code,dea_schedule_name,author_type,approval_code,image_source,splimage,epc_match,version_number,laberer_code,application_number,spl_id,NDC9,PRODUCT_CODE,EQUAL_PRODUCT_CODE,author,DEA_SCHEDULE_CODE,RXSTRING,image_id,HAS_IMAGE,FROM_SIS,NO_RXCUI
46503,48054,True,6ae13cb4-0316-40d1-9216-c7d5556aaed3-10631-490-0,6ae13cb4-0316-40d1-9216-c7d5556aaed3,18.0,,C48345,OVAL,,1.0,,pp,ST,,C48329,GREEN,,,,,HRX,Verapamil hydrochloride 240 MG Extended Releas...,897714,10631-490,0,106310490,10631,490,Isoptin SR,,20080311,6ae13cb4-0316-40d1-9216-c7d5556aaed3.xml,,C42897,34391-3,,,Ranbaxy Laboratories Inc.,,,False,,1,,,106310490,10631-490,,Ranbaxy Laboratories Inc.,,Verapamil hydrochloride 240 MG Extended Releas...,,0,,


In [32]:
df[df['product_code'] == '897714']

Unnamed: 0,ID,Enabled?,spp,setid,splsize,pillbox_size,splshape,splshape_text,pillbox_shape_text,splscore,pillbox_score,splimprint,pillbox_imprint,splcolor,splcolor_text,pillbox_color_text,spl_strength,spl_ingredients,spl_inactive_ing,source,rxtty,rxcui,product_code,part_num,part_medicine_name,ndc_labeler_code,ndc_product_code,medicine_name,marketing_act_code,effective_time,file_name,equal_product_code,dosage_form,document_type,dea_schedule_code,dea_schedule_name,author_type,approval_code,image_source,splimage,epc_match,version_number,laberer_code,application_number,spl_id,NDC9,PRODUCT_CODE,EQUAL_PRODUCT_CODE,author,DEA_SCHEDULE_CODE,RXSTRING,image_id,HAS_IMAGE,FROM_SIS,NO_RXCUI
46503,48054,True,6ae13cb4-0316-40d1-9216-c7d5556aaed3-10631-490-0,6ae13cb4-0316-40d1-9216-c7d5556aaed3,18.0,,C48345,OVAL,,1.0,,pp,ST,,C48329,GREEN,,,,,HRX,Verapamil hydrochloride 240 MG Extended Releas...,897714,10631-490,0,106310490,10631,490,Isoptin SR,,20080311,6ae13cb4-0316-40d1-9216-c7d5556aaed3.xml,,C42897,34391-3,,,Ranbaxy Laboratories Inc.,,,False,,1,,,106310490,10631-490,,Ranbaxy Laboratories Inc.,,Verapamil hydrochloride 240 MG Extended Releas...,,0,,


In [27]:
df[['spl_ingredients','splimprint', 'splcolor_text', 'splcolor', 'splshape_text', 'splshape', 'rxcui']][df['rxcui'].str.contains('Verapamil', na=False)]

Unnamed: 0,spl_ingredients,splimprint,splcolor_text,splcolor,splshape_text,splshape,rxcui
1279,VERAPAMIL HYDROCHLORIDE[VERAPAMIL];,M;411,BLUE,C48333,OVAL,C48345,Verapamil hydrochloride 240 MG Extended Releas...
2404,VERAPAMIL HYDROCHLORIDE[VERAPAMIL];,292,BROWN,C48332,OVAL,C48345,Verapamil hydrochloride 120 MG Extended Releas...
2469,VERAPAMIL HYDROCHLORIDE[VERAPAMIL];,G74,BROWN,C48332,OVAL,C48345,Verapamil hydrochloride 240 MG Extended Releas...
3654,VERAPAMIL HYDROCHLORIDE[VERAPAMIL];,7300,WHITE,C48325,OVAL,C48345,Verapamil hydrochloride 240 MG Extended Releas...
4653,VERAPAMIL HYDROCHLORIDE[VERAPAMIL];,117,YELLOW,C48330,OVAL,C48345,Verapamil hydrochloride 180 MG Extended Releas...
5643,verapamil hydrochloride[verapamil];,SCHWARZ;2489;VERELAN;180;mg,GRAY;YELLOW,C48324;C48330,CAPSULE,C48336,24 HR Verapamil hydrochloride 180 MG Extended ...
5710,VERAPAMIL HYDROCHLORIDE[VERAPAMIL];,HP;27,WHITE,C48325,ROUND,C48348,Verapamil hydrochloride 120 MG Oral Tablet
6482,VERAPAMIL HYDROCHLORIDE[VERAPAMIL];,HP;59,WHITE,C48325,ROUND,C48348,Verapamil hydrochloride 40 MG Oral Tablet
6651,VERAPAMIL HYDROCHLORIDE[VERAPAMIL];,P;SC,PURPLE,C48327,OVAL,C48345,Verapamil hydrochloride 120 MG Extended Releas...
6732,VERAPAMIL HYDROCHLORIDE[VERAPAMIL];,HP;59,WHITE,C48325,ROUND,C48348,Verapamil hydrochloride 40 MG Oral Tablet


In [30]:
df[['splcolor_text', 'splcolor']][df['splcolor']=='C48329']

Unnamed: 0,splcolor_text,splcolor
0,GREEN,C48329
2,GREEN,C48329
8,GREEN,C48329
9,GREEN,C48329
16,GREEN,C48329
54,GREEN,C48329
70,GREEN,C48329
73,GREEN,C48329
74,GREEN,C48329
87,GREEN,C48329


In [10]:
df[df[('HAS_IMAGE' == 1)]]

KeyError: False

In [25]:
df[(df['HAS_IMAGE'] == 1)].splshape_text.value_counts()

ROUND                 4132
OVAL                  2320
CAPSULE               1958
RECTANGLE               56
TRIANGLE                49
DIAMOND                 37
FREEFORM                21
SQUARE                  20
TRAPEZOID               19
HEXAGON (6 SIDED)       17
OCTAGON (8 SIDED)       12
PENTAGON (5 SIDED)      11
TEAR                     6
BULLET                   4
DOUBLE CIRCLE            4
SEMI-CIRCLE              1
Name: splshape_text, dtype: int64

In [38]:
df[(df['HAS_IMAGE'] == 1)].splshape_text.value_counts().sum()

8667