# Read CSV and Create Tables

Import all necessary packages.

In [1]:
import pandas as pd
import numpy as np
import csv
import os

In [2]:
# Gather every "*ver3.csv" annotation export in the working directory.
# The listing is sorted so the load order does not depend on the filesystem.
csv_files = sorted(name for name in os.listdir(".") if name.endswith("ver3.csv"))
if not csv_files:
    raise FileNotFoundError('No files ending in "ver3.csv" found in the current directory')

# Read each file once and concatenate in a single call; growing a DataFrame
# inside the loop re-copies all previously loaded rows on every iteration.
df = pd.concat([pd.read_csv(name) for name in csv_files], ignore_index=True)

# Drop the annotation-date and monkey-count columns; the cells below never use them.
df = df.drop(['DATE_ANNOTATED','TOTAL_NUM_MK','IDABLE_NUM_MK'], axis=1)
print(df.shape)

# Row position becomes jpg_id in a later cell, so sort with a stable algorithm:
# rows sharing a FILE_NAME then keep their (deterministic) load order.
df = df.sort_values(by=['FILE_NAME'], kind='stable')
df = df.reset_index(drop=True)
df.head(10)

(6751, 7)


Unnamed: 0,FILE_NAME,MKGROUP,MKIDS,TOYS,UNKNOWN_GUESS,COMMENTS,CAGE
0,20220308_SD02_2923.JPG,Stranger Things,[DF2I],,,,
1,20220308_SD02_2924.JPG,Stranger Things,[DF2I],,,,
2,20220308_SD02_2946.JPG,Zombies,"[7124, 110E, 151J]",,,,
3,20220308_SD02_2954.JPG,Zombies,"[7124, 110E, 151J]",,,,
4,20220308_SD02_2955.JPG,Zombies,"[7124, 110E, 151J]",,,,
5,20220308_SD02_2956.JPG,Zombies,"[7124, 110E, 151J]",,,,
6,20220308_SD02_2958.JPG,Zombies,[67G],,,,
7,20220308_SD02_2959.JPG,Zombies,[67G],,,,
8,20220308_SD02_2960.JPG,Zombies,[67G],,,,
9,20220308_SD02_2961.JPG,Zombies,[67G],,,,


In [3]:
# Lower-case every header, give the monkey columns descriptive names, and
# expose the current row position as an explicit jpg_id column.
column_aliases = {'mkids': 'monkey_names', 'mkgroup': 'monkey_group'}
df.columns = df.columns.str.lower()
df = (
    df.rename(columns=column_aliases)
      .reset_index()
      .rename(columns={'index': 'jpg_id'})
)
df.head(5)

Unnamed: 0,jpg_id,file_name,monkey_group,monkey_names,toys,unknown_guess,comments,cage
0,0,20220308_SD02_2923.JPG,Stranger Things,[DF2I],,,,
1,1,20220308_SD02_2924.JPG,Stranger Things,[DF2I],,,,
2,2,20220308_SD02_2946.JPG,Zombies,"[7124, 110E, 151J]",,,,
3,3,20220308_SD02_2954.JPG,Zombies,"[7124, 110E, 151J]",,,,
4,4,20220308_SD02_2955.JPG,Zombies,"[7124, 110E, 151J]",,,,


In [4]:
# One row per photo: id, file name, SD card, capture date and monkey group.
photos_table = df[['jpg_id','file_name','monkey_group']].copy()

# File names look like "20220308_SD02_2923.JPG".
# expand=False makes str.extract return a Series (one capture group) instead
# of a one-column DataFrame, which is what a column assignment expects.
photos_table['sd_card'] = photos_table['file_name'].str.extract(r'_(SD\d+)_', expand=False)

# Take the first run of eight digits and parse it as YYYYMMDD in one
# vectorised call. A file name without such a run yields NaT here instead of
# raising inside a row-wise string join.
date_digits = photos_table['file_name'].str.extract(r'(\d{8})', expand=False)
photos_table['date_taken'] = pd.to_datetime(date_digits, format='%Y%m%d')

# Reorder columns
new_columns = ['jpg_id', 'file_name', 'sd_card','date_taken','monkey_group']
photos_table = photos_table[new_columns]

photos_table.tail(8)

Unnamed: 0,jpg_id,file_name,sd_card,date_taken,monkey_group
6743,6743,20220825_SD05_6949.JPG,SD05,2022-08-25,Instigators
6744,6744,20220825_SD05_6952.JPG,SD05,2022-08-25,Instigators
6745,6745,20220825_SD05_6954.JPG,SD05,2022-08-25,Instigators
6746,6746,20220825_SD05_6957.JPG,SD05,2022-08-25,Instigators
6747,6747,20220825_SD05_6958.JPG,SD05,2022-08-25,Instigators
6748,6748,20220825_SD05_6960.JPG,SD05,2022-08-25,Instigators
6749,6749,20220825_SD05_6963.JPG,SD05,2022-08-25,Instigators
6750,6750,20220825_SD05_6967.JPG,SD05,2022-08-25,Instigators


# Toy Table

In [5]:
# One row per (photo, toy): split the comma-separated toy list, strip spaces,
# fold the plural "BlueFrisbees" into "BlueFrisbee", and discard any entry
# mentioning "triangle" or "swing" (case-insensitive).
toy_table = (
    df[['jpg_id', 'toys']]
    .dropna()
    .assign(toys=lambda frame: frame['toys'].str.split(','))
    .explode('toys', ignore_index = True)
    .assign(
        toys=lambda frame: frame['toys']
        .str.replace(' ', '')
        .str.replace('BlueFrisbees', 'BlueFrisbee')
    )
)
is_excluded = toy_table['toys'].str.contains('triangle|swing', case=False)
toy_table = toy_table.loc[~is_excluded].rename(columns={'toys': 'toy'})
display(toy_table.head())
toy_table['toy'].unique()

Unnamed: 0,jpg_id,toy
0,38,Unknown
1,60,DentalStar
2,79,Dumbbell
3,163,Grenade
4,163,Grenade


array(['Unknown', 'DentalStar', 'Dumbbell', 'Grenade', 'Kong',
       'LuckyCloverChew', 'MegalastBall', 'Ziggs', 'BlueFrisbee',
       'DentalKong', 'Zyro', 'Hex', 'S-shapedToy', 'ChallengerBall'],
      dtype=object)

In [6]:
# One row per (photo, monkey): strip the surrounding brackets from the
# "[A, B, C]" name list, split on commas, explode, and remove spaces.
monkey_table = (
    df[['jpg_id', 'monkey_names', 'unknown_guess']]
    .sort_values(by=['jpg_id'])
    .assign(monkey_names=lambda frame: frame['monkey_names'].str.strip('[]').str.split(','))
    .explode('monkey_names')
    .assign(monkey_names=lambda frame: frame['monkey_names'].str.replace(' ', ''))
)

# Keep a copy that still carries unknown_guess for the "unknown" table.
unknown = monkey_table.copy()

# monkey_table itself only needs (jpg_id, monkey_names). These two lines were
# commented out, but the recorded output of this cell shows the two-column
# table; without the drop, a fresh run exports the unsplit guess on every
# exploded row of monkey_table.
monkey_table = monkey_table.drop('unknown_guess', axis = 1)
monkey_table.head(10)

Unnamed: 0,jpg_id,monkey_names
0,0,DF2I
1,1,DF2I
2,2,7124
2,2,110E
2,2,151J
3,3,7124
3,3,110E
3,3,151J
4,4,7124
4,4,110E


In [None]:
# Show every row whose monkey name contains "unknown" (case-insensitive).
is_unknown_name = monkey_table['monkey_names'].str.contains('unknown', case=False)
monkey_table.loc[is_unknown_name]

In [7]:
# Distinct monkey identifiers, in order of first appearance.
monkey_table['monkey_names'].unique()

array(['DF2I', '7124', '110E', '151J', '67G', '94B', '72X', '143H', '69X',
       '87J', '14F', '19J', '101G', 'G701', '68F', '49Y', 'Unknown1',
       '79G', '68E', '167I', '86I', '59E', '134J', '35Y', 'Unknown2',
       'G942', '144H', '58I', '46J', '37I', '26J', '48Z', '70G', '114J',
       '42Z', '42K', '0EX', '36J', '0FL', '68Y', '81G', '40J', '09X',
       '58K', '59K', '67K', '98K', '120K', '126K', '139K', '122K', '129K',
       '170K'], dtype=object)

In [8]:
# Keep only the rows whose monkey name contains "unknown" (case-insensitive)
# and renumber them from zero.
unknown_mask = unknown['monkey_names'].str.contains('unknown', case=False)
unknown = unknown.loc[unknown_mask].reset_index(drop=True)
unknown.head(30)

Unnamed: 0,jpg_id,monkey_names,unknown_guess
0,50,Unknown1,NN
1,60,Unknown1,"I, NN"
2,60,Unknown2,"I, NN"
3,64,Unknown1,"Juvenile, NN"
4,64,Unknown2,"Juvenile, NN"
5,65,Unknown1,Juvenile
6,69,Unknown1,Adult Female
7,70,Unknown1,Adult Female
8,73,Unknown1,J
9,74,Unknown1,J


In [9]:
def _guess_for_unknown(name, guess):
    """Return the entry of a comma-separated guess that belongs to ``name``.

    ``name`` is a label such as "Unknown1" or "Unknown2" (matched
    case-insensitively); its trailing number N selects the N-th
    comma-separated entry of ``guess``, stripped of surrounding whitespace.

    ``guess`` is returned unchanged when it is missing (not a string),
    contains no comma, the label has no numeric suffix, or the guess has
    fewer than N entries.
    """
    if not isinstance(name, str) or not isinstance(guess, str) or ',' not in guess:
        return guess
    prefix = 'unknown'
    suffix = name[len(prefix):]
    if not name.lower().startswith(prefix) or not suffix.isdecimal():
        return guess
    parts = [part.strip() for part in guess.split(',')]
    position = int(suffix)
    if 1 <= position <= len(parts):
        return parts[position - 1]
    return guess


# A photo with several unknown monkeys stores all guesses in one cell
# ("I, NN"); give each UnknownN row only its own guess. The column is rebuilt
# in a single assignment instead of writing cell by cell inside iterrows().
unknown['unknown_guess'] = [
    _guess_for_unknown(name, guess)
    for name, guess in zip(unknown['monkey_names'], unknown['unknown_guess'])
]

unknown.head(30)

Unnamed: 0,jpg_id,monkey_names,unknown_guess
0,50,Unknown1,NN
1,60,Unknown1,I
2,60,Unknown2,NN
3,64,Unknown1,Juvenile
4,64,Unknown2,NN
5,65,Unknown1,Juvenile
6,69,Unknown1,Adult Female
7,70,Unknown1,Adult Female
8,73,Unknown1,J
9,74,Unknown1,J


In [10]:
# Write each table to the parent directory, without the DataFrame index.
tables_by_path = {
    '../fromcsv_unknown.csv': unknown,
    '../fromcsv_photos.csv': photos_table,
    '../fromcsv_toy.csv': toy_table,
    '../fromcsv_monkey.csv': monkey_table,
}
for csv_path, table in tables_by_path.items():
    table.to_csv(csv_path, index=False)

In [13]:
# Spot-check a single photo record by its jpg_id.
is_selected_photo = photos_table['jpg_id'] == 5467
photos_table.loc[is_selected_photo]

Unnamed: 0,jpg_id,file_name,sd_card,date_taken,monkey_group
5467,5467,20220725_SD05_4104.JPG,SD05,2022-07-25,Best Frans
