# Read CSV and Create Tables

Import all necessary packages.

In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
# Load the ZB CSV into a DataFrame and preview the first five rows
ZB_CSV_PATH = "./ZBcsvAfterBB/ZB_newcol_updated.csv"
ZBcsv = pd.read_csv(ZB_CSV_PATH)
ZBcsv.head()

Unnamed: 0,FILE_NAME,DATE_ANNOTATED,MKGROUP,TOTAL_NUM_MK,IDABLE_NUM_MK,MKIDS,TOYS,UNKNOWN1_GUESS,UNKNOWN2_GUESS,COMMENTS
0,20220308_SD02_2946.JPG,2022-06-27,Zombies,3,3,"[7124, 110E, 151J]",,,,
1,20220308_SD02_2954.JPG,2022-06-27,Zombies,3,3,"[7124, 110E, 151J]",,,,
2,20220308_SD02_2955.JPG,2022-06-27,Zombies,3,3,"[7124, 110E, 151J]",,,,
3,20220308_SD02_2956.JPG,2022-06-27,Zombies,3,3,"[7124, 110E, 151J]",,,,
4,20220308_SD02_2958.JPG,2022-06-27,Zombies,1,1,[67G],,,,
5,20220308_SD02_2959.JPG,2022-06-27,Zombies,1,1,[67G],,,,
6,20220308_SD02_2960.JPG,2022-06-27,Zombies,1,1,[67G],,,,


In [3]:
# Remove the columns this notebook treats as unnecessary
unneeded_columns = ['DATE_ANNOTATED', 'TOTAL_NUM_MK', 'IDABLE_NUM_MK']
ZBcsv = ZBcsv.drop(columns=unneeded_columns)

In [4]:
# Lower-case the headers, rename the monkey columns, sort by file name and
# turn the resulting row number into a jpg_id column
ZBcsv = (
    ZBcsv
    .rename(columns=str.lower)
    .rename(columns={'mkids': 'monkey_ids', 'mkgroup': 'monkey_group'})
    .sort_values(by=['file_name'])
    .reset_index(drop=True)               # fresh 0..n-1 index in sorted order
    .reset_index()                        # expose that index as an 'index' column
    .rename(columns={'index': 'jpg_id'})
)
ZBcsv.tail()

Unnamed: 0,jpg_id,file_name,monkey_group,monkey_ids,toys,unknown1_guess,unknown2_guess,comments
1601,1601,20220804_SD05_5934.JPG,Zombies,[143H],,,,
1602,1602,20220804_SD05_5935.JPG,Zombies,"[143H, Unknown1]",,NN,,
1603,1603,20220804_SD05_5937.JPG,Zombies,[143H],,,,
1604,1604,20220804_SD05_5938.JPG,Zombies,[69X],,,,
1605,1605,20220804_SD05_5939.JPG,Zombies,[143H],,,,


In [5]:
# Add a cage_structure column holding the first 'triangle' or 'Swing'
# found in each row's toys text (NaN when neither occurs)
cage_pattern = "(triangle|Swing)"
ZBcsv["cage_structure"] = ZBcsv['toys'].str.extract(cage_pattern, expand=False)
print(ZBcsv.iloc[330:340, :])

     jpg_id               file_name monkey_group        monkey_ids       toys  \
330     330  20220607_SD04_6320.JPG      Zombies             [67G]        NaN   
331     331  20220607_SD04_6321.JPG      Zombies             [67G]        NaN   
332     332  20220607_SD04_6322.JPG      Zombies             [67G]        NaN   
333     333  20220607_SD04_6325.JPG      Zombies             [67G]        NaN   
334     334  20220607_SD04_6326.JPG      Zombies             [67G]        NaN   
335     335  20220607_SD04_6327.JPG      Zombies             [67G]        NaN   
336     336  20220607_SD04_6328.JPG      Zombies             [67G]        NaN   
337     337  20220607_SD04_6329.JPG      Zombies             [67G]  Toy-Swing   
338     338  20220607_SD04_6330.JPG      Zombies       [7124, 67G]  Toy-Swing   
339     339  20220607_SD04_6335.JPG      Zombies  [7124, 94B, 98K]        NaN   

    unknown1_guess unknown2_guess comments cage_structure  
330            NaN            NaN      NaN      

In [20]:
# Clean the toys column: remove the 'Toy-' prefix, then blank out entries
# whose whole value is exactly 'triangle' or 'Swing'
toys = (
    ZBcsv['toys']
    .str.replace('Toy-', '')
    .replace('triangle', np.nan)
    .replace('Swing', np.nan)
)
ZBcsv['toys'] = toys

In [23]:
# Build the toys table: keep rows that have a toys value, split that value
# on commas and emit one row per resulting piece
toys_no_nan = ZBcsv.loc[ZBcsv['toys'].notna()]
toys_table = (
    toys_no_nan[['jpg_id', 'file_name', 'toys']]
    .assign(toys=lambda frame: frame['toys'].str.split(','))
    .explode('toys')
)
toys_table

Unnamed: 0,jpg_id,file_name,toys
53,53,20220415_SD01_4202.JPG,Lucky Clover Chew
53,53,20220415_SD01_4202.JPG,Unknown
66,66,20220421_SD02_4865.JPG,Kong
67,67,20220421_SD02_4866.JPG,Kong
72,72,20220421_SD02_4880.JPG,Kong
...,...,...,...
1591,1591,20220804_SD05_5914.JPG,Ziggs
1595,1595,20220804_SD05_5923.JPG,Ziggs
1596,1596,20220804_SD05_5924.JPG,Ziggs
1597,1597,20220804_SD05_5926.JPG,Ziggs


In [26]:
# Build the cage table: keep rows that have a cage_structure value, split it
# on commas, emit one row per piece and spell 'triangle' as 'Triangle'
cage_no_nan = ZBcsv.loc[ZBcsv['cage_structure'].notna()]
cage_table = (
    cage_no_nan[['jpg_id', 'file_name', 'cage_structure']]
    .assign(cage_structure=lambda frame: frame['cage_structure'].str.split(','))
    .explode('cage_structure')
    .assign(
        cage_structure=lambda frame: frame['cage_structure'].replace('triangle', 'Triangle')
    )
)
cage_table

Unnamed: 0,jpg_id,file_name,cage_structure
52,52,20220415_SD01_4198.JPG,Triangle
78,78,20220421_SD02_4888.JPG,Swing
79,79,20220421_SD02_4889.JPG,Swing
80,80,20220421_SD02_4890.JPG,Swing
124,124,20220426_SD02_5338.JPG,Swing
...,...,...,...
1512,1512,20220804_SD05_5607.JPG,Triangle
1513,1513,20220804_SD05_5611.JPG,Triangle
1544,1544,20220804_SD05_5802.JPG,Triangle
1545,1545,20220804_SD05_5803.JPG,Triangle


# Update Original Table

The original table should have the columns FILE_NAME, DATE_ANNOTATED, MKGROUP, TOTAL_NUM_MK, IDABLE_NUM_MK, and MKIDS. Update the TOTAL_NUM_MK and IDABLE_NUM_MK columns based on the MKIDS values.

In [None]:
# Extract and delete brackets in MKIDS
# Take an explicit copy of the first seven columns so the assignments made to
# Mk modify Mk itself rather than a slice of ZBcsv (this avoids pandas'
# SettingWithCopyWarning).
Mk = ZBcsv.iloc[:, 0:7].copy()
# .str.strip removes leading/trailing '[' and ']' characters and passes
# missing values through, whereas calling x.strip(...) via apply raises on NaN.
Mk['monkey_ids'] = Mk['monkey_ids'].str.strip('[]')
# Show a preview instead of printing the whole frame
Mk.head(10)

In [None]:
# Update Total_Num_mk column
def _count_monkey_ids(ids):
    """Return the number of non-empty comma-separated entries in ``ids``.

    An empty or whitespace-only string counts as 0 (a plain
    ``len(ids.split(','))`` would report 1 for it), and a non-string value
    such as NaN also counts as 0 instead of raising.
    """
    if not isinstance(ids, str):
        return 0
    return sum(1 for token in ids.split(',') if token.strip())


Mk['total_num_mk'] = Mk['monkey_ids'].map(_count_monkey_ids)
# Show a preview instead of printing the whole frame
Mk.head(10)

In [None]:
# Update IDable_Num_Mk column
# Count the 'Unknown1' / 'Unknown2' placeholders in each row's id string;
# a missing monkey_ids value yields NaN from str.count and is treated as 0.
Mk['UNKNOWN_COUNT'] = (
    Mk['monkey_ids'].str.count('Unknown1|Unknown2').fillna(0).astype('int')
)
# Identifiable monkeys = all listed monkeys minus the unknown placeholders
Mk['idable_num_mk'] = Mk['total_num_mk'] - Mk['UNKNOWN_COUNT']
display(Mk.tail(10))

# Pick the output columns by name. total_num_mk and idable_num_mk are appended
# at the end of Mk, so a positional slice of the leading columns would leave
# them out of the exported table.
monkey_table = Mk[['jpg_id', 'file_name', 'monkey_group', 'monkey_ids',
                   'total_num_mk', 'idable_num_mk']].copy()
display(monkey_table.head(10))

# Create Unknown Table

In [None]:
# Select the columns needed for the unknown table by name. Positional
# indices are fragile here: ZBcsv has had columns dropped and added since it
# was loaded, so fixed positions no longer line up with the monkey_ids,
# unknown1_guess and unknown2_guess columns that the following cells read.
unknown = ZBcsv[['jpg_id', 'file_name', 'monkey_ids',
                 'unknown1_guess', 'unknown2_guess']].copy()
# Drop the surrounding '[' / ']' characters from the id list text
unknown['monkey_ids'] = unknown['monkey_ids'].str.strip('[]')
unknown.tail(5)

In [None]:
# Combine both guess columns into a single "guess1, guess2" string per row;
# astype(str) turns a missing guess into the text 'nan'
first_guess = unknown['unknown1_guess'].astype(str)
second_guess = unknown['unknown2_guess'].astype(str)
unknown['unknown_guess'] = first_guess.str.cat(second_guess, sep=', ')
unknown.tail()

In [None]:
# Drop the rows whose combined guess string contains 'nan, nan'
# (the text produced when both guesses are missing)
both_missing = unknown['unknown_guess'].str.contains('nan, nan')
all_unknowns_df = unknown[~both_missing]
all_unknowns_df

In [None]:
# Build the unknown table from the rows that carry at least one guess
unknown_table = all_unknowns_df[['jpg_id', 'file_name', 'unknown_guess']].copy()
# Remove only the 'nan' placeholder of a missing guess together with its
# separator. str.strip(', nan') is not used because it treats its argument as
# a set of characters and would also trim any leading/trailing 'n', 'a', ','
# or ' ' belonging to a real guess.
unknown_table['unknown_guess'] = unknown_table['unknown_guess'].str.replace(
    r'^nan, |, nan$', '', regex=True
)
unknown_table.head(20)

In [None]:
# Write each table to its CSV file, in the same order as listed
for table, csv_name in (
    (toys_table, 'ZB_toys.csv'),
    (cage_table, 'ZB_cage.csv'),
    (unknown_table, 'ZB_unknown.csv'),
    (monkey_table, 'processed_ZBoriginal.csv'),
):
    table.to_csv(csv_name)