Created on Mon Jan 6 09:58:13 2020
<br>
Group 7
<br>
@authors: M.D., C.D., E.G.

In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib.pyplot import imread

In [10]:
project_path = './../'
data_path = 'Interpromo2020/All Data/ANALYSE IMAGE/IMG SEATGURU/'
path = project_path + data_path
img_list = os.listdir(path)
nb_images = len(img_list)

## Dataframe

In [11]:
# list of all matrices (images)
imgs = list()
for i in range(nb_images):
    img = imread(path + img_list[i])
    imgs.append(img)
    
# init the dataframe that will contain all basic info about images
imgs_df = pd.DataFrame(columns = ['name', 'format', 'height', 'width', 'height_to_width', 'ncol', 'aircraft_manufacturer', 'aircraft_type'])
imgs_df.name = img_list

In [12]:
def get_imgs_formats(imgs: list, imgs_df: pd.DataFrame) -> pd.DataFrame:
    
    """
    Parameters:
    imgs: list of arrays representing images
    imgs_df: empty DataFrame with at least a name column 
    
    Out:
    imgs_df: DataFrame with 4 new columns: format, height, width, height_to_width, ncol, and one line per image
    
    """
    
    # formats
    imgs_df['format'] = imgs_df.name.apply(lambda x: x.split('.')[1])
    
    # shapes
    heights = [imgs[k].shape[0] if len(imgs[k].shape) != 0 else 0 for k in range(len(imgs))]
    widths = [imgs[k].shape[1] if len(imgs[k].shape) != 0 else 0 for k in range(len(imgs))]
    ncols = [imgs[k].shape[2] if len(imgs[k].shape) != 0 else 0 for k in range(len(imgs))]
    
    imgs_df['height'] = heights
    imgs_df['width'] = widths
    imgs_df['height_to_width'] = imgs_df[['height', 'width']].apply(lambda x: x[0] / x[1] if x[1] != 0 else 0, axis=1) 
    imgs_df['ncol'] = ncols
    
    return imgs_df

In [13]:
# apply function to fill in DataFrame with format, height, width, height_to_width, and ncol
imgs_df = get_imgs_formats(imgs, imgs_df)

# aircraft_manufacturers
aircraft_manufacturers = ['Airbus' if 'Airbus' in imgs_df.name[k] 
              else 'Boeing' if 'Boeing' in imgs_df.name[k] 
              else 'Other' for k in range(len(imgs_df))]

# aircraft types
aircraft_types = [name.split('_')[name.split('_').index(aircraft_manufacturer) + 1].split('-')[0] if aircraft_manufacturer in name.split('_') else ''
                  for name, aircraft_manufacturer in zip(imgs_df.name, aircraft_manufacturers)]

# add missing As for Airbus aircrafts 
aircraft_types = [aircraft_types[k] if ('A' in aircraft_types[k] or '7' in aircraft_types[k] or aircraft_types[k] == '') 
                  else 'A' + aircraft_types[k] for k in range(len(aircraft_types))]

# remove Airbus 'neos' because we don't need that much detail
aircraft_types = [aircraft_types[k].replace('neo', '') for k in range(len(aircraft_types))]

# remove Ds
aircraft_types = [aircraft_types[k].replace('D', '') for k in range(len(aircraft_types))]

# fill in dataframe columns
imgs_df.aircraft_manufacturer = aircraft_manufacturers
imgs_df.aircraft_type = aircraft_types

In [14]:
imgs_df.head()

Unnamed: 0,name,format,height,width,height_to_width,ncol,aircraft_manufacturer,aircraft_type
0,Cathay_Pacific_Airways_Boeing_777-300ER_C_0.jpg,jpg,540,960,0.5625,3,Boeing,777
1,KLM_Airbus_A330-300_1.jpg,jpg,720,960,0.75,3,Airbus,A330
2,American_Airlines_Boeing_767-300_3.jpg,jpg,720,720,1.0,3,Boeing,767
3,Air_Canada_Boeing_767-300ER_v2_3.jpg,jpg,720,960,0.75,3,Boeing,767
4,United_Airlines_Q400_A_2.jpg,jpg,720,960,0.75,3,Other,


In [24]:
# save as csv
path_tr =  './../CSV_annotate/SEATGURU/'
os.makedirs(path_tr, exist_ok = True)
imgs_df.to_csv(path_or_buf=path_tr + 'g7_SEATGURU.csv', sep=';', encoding='utf-8', index = False)

## Descriptive statistics

In [19]:
print(f'{len(np.unique(imgs_df.format))} unique image format(s).')
print(f'{len(np.unique(imgs_df.ncol))} unique ncol(s).')
print(f'{len(np.unique(imgs_df.height_to_width))} unique height_to_width(s).')
print(f'{len(np.unique(imgs_df.aircraft_type))} unique aircraft type(s).')

1 unique image format(s).
1 unique ncol(s).
128 unique height_to_width(s).
20 unique aircraft type(s).


In [20]:
print(f'{len(np.arange(nb_images)[imgs_df.aircraft_manufacturer == "Airbus"])} Airbus labelled images.')
print(f'{len(np.arange(nb_images)[imgs_df.aircraft_manufacturer == "Boeing"])} Boeing labelled images.')
print(f'{len(np.arange(nb_images)[imgs_df.aircraft_manufacturer == "Other"])} others.')

1043 Airbus labelled images.
1112 Boeing labelled images.
401 others.


In [21]:
def get_stats(imgs_df: pd.DataFrame, col: pd.Series, col_name: str):
    
    print(f'Max {col_name}: {np.max(col)}')
    print(f'Median {col_name}: {np.median(col)}')
    print(f'Min {col_name}: {np.min(col)}')
    print('')

In [22]:
# dimensions
get_stats(imgs_df=imgs_df, col=imgs_df.height_to_width, col_name='height_to_width')
get_stats(imgs_df=imgs_df, col=imgs_df.height, col_name='height')
get_stats(imgs_df=imgs_df, col=imgs_df.width, col_name='width')

Max height_to_width: 3.898823529411765
Median height_to_width: 1.0
Min height_to_width: 0.2760416666666667

Max height: 1657
Median height: 720.0
Min height: 265

Max width: 960
Median width: 720.0
Min width: 337



In [23]:
# number of images per aircraft type
pd.DataFrame(pd.pivot_table(imgs_df,
                            index=['aircraft_manufacturer', 'aircraft_type'],
                            aggfunc='count').format.sort_values(ascending=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,format
aircraft_manufacturer,aircraft_type,Unnamed: 2_level_1
Other,,401
Boeing,777,344
Airbus,A330,331
Boeing,737,298
Airbus,A320,244
Boeing,787,198
Airbus,A321,152
Boeing,767,131
Airbus,A319,110
Airbus,A380,73
