In [441]:
import pandas as pd
import numpy as np
import re

In [442]:
df = pd.read_csv('../camera_details.csv')

In [443]:
pd.set_option('display.max_columns', None)
df.head(1)

Unnamed: 0,Brand,Model,Also known as,Effective megapixels,Total megapixels,Sensor size,Sensor type,Sensor resolution,Max. image resolution,Crop factor,Optical zoom,Digital zoom,ISO,RAW support,Manual focus,Normal focus range,Macro focus range,Focal length (35mm equiv.),Aperture priority,Max aperture,Max. aperture (35mm equiv.),Metering,Exposure Compensation,Shutter priority,Min. shutter speed,Max. shutter speed,Built-in flash,External flash,Viewfinder,White balance presets,Screen size,Screen resolution,Video capture,Max. video resolution,Storage types,USB,HDMI,Wireless,GPS,Battery,Weight,Dimensions,Year,Image
0,Canon,EOS 850D,"EOS Rebel T8i (US), EOS Kiss X10i (Japan)",24.1,25.8,22.3 x 14.9 mm,CMOS,6012 x 4008,6000 x 4000,1.61,,,"Auto, 100-25600 (expandable to 51200)",Yes,Yes,,,,Yes,,,"Multi, Center-weighted, Spot, Partial","±5 EV (in 1/3 EV, 1/2 EV steps)",Yes,30 sec,1/4000 sec,Yes,Yes,Optical (pentamirror),6.0,"3""","1,040,000 dots",Yes,3840x2160 (25p),SD/SDHC/SDXC,USB 2.0 (480 Mbit/sec),Yes,Yes,No,LP-E17 lithium-ion battery,515 g,131 x 102.6 x 76.2 mm,2020,https://www.digicamdb.com/images/cameras/canon...


In [444]:
storage_types = df["Sensor type"].unique()
len(storage_types)
print(storage_types)

['CMOS' 'CCD' nan 'JFET' 'Foveon']


In [445]:
# Clean Storage Types
for i, row in df.iterrows():      
    types = row['Storage types']
    if isinstance(types, str):
        temp = []
        if ('SD' in types or 'Secure Digital' in types):
            if 'UHS-II' in types:
                temp.append('SD UHS-II')
            elif 'micro' in types or 'Micro' in types:
                temp.append('microSD')
            elif 'mini' in types or 'miniSD' in types or 'mini SD' in types:
                temp.append('microSD')
            elif 'XC' in types:
                temp.append('SD/SDHC/SDXC')
            elif 'HC' in types:
                temp.append('SD/SDHC')
            else:
                temp.append('SD/MMC')

        if 'Memory stick' in types or 'Memory Stick' in types or 'MS' in types:
            if 'tick Micro' in types:
                temp.append('Memory Stick Micro')
            else:
                temp.append('Memory Stick')

        if 'CF' in types or 'Compact Flash' in types or 'CompactFlash' in types:
            temp.append('Compact Flash')

        if 'xD' in types:
            temp.append('xD Picture Card')

        if 'SmartMedia' in types:
            temp.append('SmartMedia')

        if 'Internal' in types or  'internal' in types:
            temp.append('Internal Memory')

        if 'PCMCIA' in types:
            temp.append('PCMCIA')

        if 'CDR' in types:
            temp.append('CDR')

        if 'Disk 3.5' in types:
            temp.append('Disk 3.5')

        if 'XQD' in types:
            temp.append('XQD')
        
        df.at[i,'Storage types'] = temp;

In [446]:
def findFloatsInString(string):
    string = re.findall('[-+]?[0-9]*\.?[0-9]*',string)
    string = filter(lambda x: len(x)>0 and x not in ['.','-','+'] ,string)
    string = list(map(lambda x: abs(float(x)),string))
    return string

In [447]:
# Clean Camera Body Info
for i, row in df.iterrows():
    # Dimensions
    dimensions = row['Dimensions']
    if isinstance(dimensions, str):
        dimensions=dimensions.replace(',','.')
        dimensions = findFloatsInString(dimensions)
        df.at[i,'Dimensions'] = dimensions;

    # Weight
    weight = row['Weight']
    if isinstance(weight, str):
        weight = int(findFloatsInString(weight)[0])
        df.at[i,'Weight'] = weight

In [448]:
# Clean Sensor Info
for i, row in df.iterrows():
    # Size
    size = row['Sensor size']
    if isinstance(size, str):
        temp = size
        size = findFloatsInString(size)[-2:]
        df.at[i,'Sensor size'] = size
        
    # Resolution    
    res = row['Sensor resolution']
    if isinstance(res, str):
        temp = res
        res = findFloatsInString(res)[-2:]
        df.at[i,'Sensor resolution'] = res;
        
    # Crop Factor
    factor = row['Crop factor']
    factor = float(factor)
    df.at[i,'Crop factor'] = factor;
    
    # Sensor Type needs no modification
    
    # ISO
    iso = row['ISO']
    if isinstance(iso, str):
        temp = iso
        iso = findFloatsInString(iso)
        if len(iso) == 0:
            iso = None
        elif len(iso) > 1:
            iso = [min(iso),max(iso)]
        df.at[i,'ISO'] = iso;

In [449]:

for i, row in df.iterrows():
    # Clean Optical Zoom
    zoom = row['Optical zoom']
    if isinstance(zoom, str):
        if zoom == 'Yes':
            zoom = True
        elif zoom == 'No':
            zoom = None
        else:
            zoom = findFloatsInString(zoom)[0]
        df.at[i,'Optical zoom'] = zoom;
        
    # Clean Viewfinder
    viewfinder = row['Viewfinder']
    if isinstance(viewfinder, str):
        if 'Optical' in viewfinder and 'optional' not in viewfinder:
            df.at[i,'ovf'] = True;            
        if 'Electronic' in viewfinder and viewfinder =='Electronic (optional)' :
            df.at[i,'evf'] = True;
            
    # Clean Min Shutter Speed
    speed = row['Min. shutter speed']
    if isinstance(speed, str):
        speed = int(findFloatsInString(speed)[0])
        df.at[i,'Min. shutter speed'] = speed;
        
    # Clean Max Shutter Speed 
    speed = row['Max. shutter speed']
    if isinstance(speed, str):
        speed = int(findFloatsInString(speed)[1])
    df.at[i,'Max. shutter speed'] = speed;
    
    # Clean Max. video resolution
    video_res = row['Max. video resolution']
    if isinstance(video_res, str):
        video_res = [int(findFloatsInString(video_res)[0]),int(findFloatsInString(video_res)[1])]
        df.at[i,'Max. video resolution'] = video_res;

In [450]:
# Clean Screen Specs
for i, row in df.iterrows(): 
    # Screen Size
    scr_size = row['Screen size']
    if isinstance(scr_size, str):
        scr_size = int(findFloatsInString(scr_size)[0])
        df.at[i,'Screen size'] = scr_size
    
    # Screen Resolution
    scr_res = row['Screen resolution']
    if isinstance(scr_res, str):
        scr_res= scr_res.replace(',','')
        scr_res= scr_res.replace('.','')
        if 'x' in scr_res:
            scr_res = int(findFloatsInString(scr_res)[0])*int(findFloatsInString(scr_res)[1])
        else:
            scr_res = int(findFloatsInString(scr_res)[0])
        df.at[i,'Screen resolution'] = scr_res

In [451]:
# Clean Boolean Values

# Clean RAW support, Manual focus, Aperture priority, Shutter priority, Built-in flash, External flash, HDMI,Wireless,GPS
cols = ['RAW support', 'Manual focus', 'Aperture priority', 'Shutter priority', 'Built-in flash', 'External flash', 'HDMI','Wireless','GPS']
for i, row in df.iterrows():
    for col in cols:
        value = row[col]
        if isinstance(value, str) and value == 'Yes':
            value = True
        else:
            value = None
        df.at[i,col] = value;
    
    # USB
    value = row['USB']
    if isinstance(value, str):
        value = findFloatsInString(value)[0]
        df.at[i,'USB'] = value;
        

In [452]:
# Clean Battery 
for i, row in df.iterrows():
#     liion=['leica bp-dc8','battery pack nb-12l','bv-5xw','battery pack nb-13l']
    battery = row['Battery']
    if isinstance(battery, str):
        battery = battery.lower()
        if 'aa' in battery or 'nimh' in battery or 'alkaline' in battery:
            df.at[i,'Battery'] = 'AA/AAA'
        else:
            df.at[i,'Battery'] = 'Li-ion'

In [453]:
# Drop Unnecessary Columns
cols = ['Also known as', 'Effective megapixels','Viewfinder', 'Max. image resolution','Digital zoom','Normal focus range','Macro focus range','Max aperture','Max. aperture (35mm equiv.)','Metering','Exposure Compensation','White balance presets','Video capture']

df = df.drop(cols, axis = 1) 

# Rename Columns
df.rename(columns={'Brand':'brand',
                        'Model':'model',
                        'Total megapixels':'megapixels',
                        'Sensor size':'sensor_size',
                        'Sensor type':'sensor_type',
                        'Sensor resolution':'sensor_res',
                        'Crop factor':'crop',
                        'Optical zoom':'zoom',
                        'ISO':'iso',
                        'RAW support':'raw',
                        'Manual focus':'mf',
                        'Focal length (35mm equiv.)':'fl',
                        'Aperture priority':'ap',
                        'Shutter priority':'sp',
                        'Min. shutter speed':'min_shutter',
                        'Max. shutter speed':'max_shutter',
                        'Built-in flash':'int_flash',
                        'External flash':'ext_flash',
                        'Screen size':'scr_size',
                        'Screen resolution':'scr_res',
                        'Max. video resolution':'max_video_res',
                        'USB':'usb',
                        'HDMI':'hdmi',
                        'GPS':'gps',
                        'Battery':'battery',
                        'Weight':'weight',
                        'Dimensions':'dimensions',
                        'Year':'year',
                        'Image':'image'
                          }, 
                 inplace=True)

In [455]:
df.to_json('camera_details_cleaned.json',orient='records')

In [440]:
df.head()

Unnamed: 0,brand,model,megapixels,sensor_size,sensor_type,sensor_res,crop,zoom,iso,raw,mf,fl,ap,sp,min_shutter,max_shutter,int_flash,ext_flash,scr_size,scr_res,max_video_res,Storage types,usb,hdmi,Wireless,gps,battery,weight,dimensions,year,image,ovf,evf
0,Canon,EOS 850D,25.8,"[22.3, 14.9]",CMOS,"[6012.0, 4008.0]",1.61,,"[100.0, 51200.0]",True,True,,True,True,30,4000,True,True,3,1040000,"[3840, 2160]",[SD/SDHC/SDXC],2,True,True,,Li-ion,515,"[131.0, 102.6, 76.2]",2020,https://www.digicamdb.com/images/cameras/canon...,True,
1,Canon,EOS-1D X Mark III,21.4,"[36.0, 24.0]",CMOS,"[5492.0, 3661.0]",1.0,,"[50.0, 819200.0]",True,True,,True,True,30,8000,,True,3,2100000,"[5472, 2286]",[Compact Flash],3,True,True,True,Li-ion,1440,"[158.0, 167.6, 82.6]",2020,https://www.digicamdb.com/images/cameras/canon...,True,
2,Canon,EOS M200,25.8,"[22.3, 14.9]",CMOS,"[6012.0, 4008.0]",1.61,,"[100.0, 25600.0]",True,True,,True,True,30,4000,True,,3,1040000,"[3840, 2160]",[SD/SDHC/SDXC],2,True,True,,Li-ion,299,"[108.2, 67.1, 35.1]",2019,https://www.digicamdb.com/images/cameras/canon...,,
3,Canon,EOS 90D,34.4,"[22.3, 14.9]",CMOS,"[6983.0, 4655.0]",1.61,,"[100.0, 51200.0]",True,True,,True,True,30,8000,True,True,3,1040000,"[3840, 2160]",[SD/SDHC/SDXC],2,True,True,,Li-ion,701,"[140.7, 104.8, 76.8]",2019,https://www.digicamdb.com/images/cameras/canon...,True,
4,Canon,EOS M6 Mark II,34.4,"[22.3, 14.9]",CMOS,"[6983.0, 4655.0]",1.61,,"[100.0, 51200.0]",True,True,,True,True,30,4000,True,True,3,1040000,"[3840, 2160]",[SD/SDHC/SDXC],2,True,True,,Li-ion,408,"[119.6, 70.0, 49.2]",2019,https://www.digicamdb.com/images/cameras/canon...,,True
