#### Exploring the make and model of cars in the numerical dataset

In [71]:
import pandas as pd
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Location of the pre-encoded, price-categorised numerical dataset.
carsNumerical="/Users/sachin/My Files/Data Science/Playground/group-coursework-sa3n/data/pre_encoded_price_categorized.csv"
car = pd.read_csv(carsNumerical)

# Distinct values of the three categorical columns examined in this notebook.
unique_body_type, unique_models, unique_makes = (
    car[column].unique() for column in ('body_type', 'model_name', 'make_name')
)

# Map each make to the array of distinct model names listed under it.
make_models_dict = {
    make: models
    for make, models in car.groupby('make_name')['model_name'].unique().items()
}

# Print every summary as a heading line followed by its value and a blank line.
for summary_label, summary_value in (
    ("Number of models:", len(unique_models)),
    ("Number of makes:", len(unique_makes)),
    ("Makes:", unique_makes),
):
    print(summary_label)
    print(summary_value, "\n")

# Spot-check the make -> models mapping using a single make.
print("Make-Model Dictionary for Jeep:")
print(make_models_dict['Jeep'])


Number of models:
629 

Number of makes:
52 

Makes:
['Jeep' 'Dodge' 'Hyundai' 'Chevrolet' 'BMW' 'Ford' 'Nissan' 'Toyota'
 'Volkswagen' 'Subaru' 'Cadillac' 'Honda' 'Acura' 'Mazda' 'MINI'
 'INFINITI' 'Kia' 'Jaguar' 'Lincoln' 'Chrysler' 'GMC' 'Mercedes-Benz'
 'Audi' 'Buick' 'Land Rover' 'Volvo' 'Lexus' 'Scion' 'Porsche' 'FIAT'
 'Mitsubishi' 'Lotus' 'Maserati' 'Rolls-Royce' 'Saturn' 'Alfa Romeo'
 'Mercury' 'Ferrari' 'Pontiac' 'Genesis' 'Hummer' 'Bentley' 'Saab'
 'Suzuki' 'smart' 'Oldsmobile' 'Lamborghini' 'Aston Martin' 'McLaren'
 'Isuzu' 'Plymouth' 'SRT'] 

Make-Model Dictionary for Jeep:
['Wrangler' 'Cherokee' 'Wrangler Unlimited' 'Grand Cherokee' 'Compass'
 'Renegade' 'Patriot' 'Liberty' 'Commander']


#### Importing the image dataset and verifying which makes and models are available in it

In [97]:
from scipy.io import loadmat
import os
import numpy as np
import pandas as pd


# Reset display options to default
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

# Load the MATLAB annotation file of the image dataset
mat_file = '/Users/sachin/Desktop/archive/cars_annos.mat'
annotations = loadmat(mat_file)

# Paths to the train and test image folders.
# Fix: these two paths were swapped (train_folder pointed at cars_test and
# test_folder at cars_train), so train_files/test_files listed the wrong split.
train_folder = '/Users/sachin/Desktop/archive/cars_train/cars_train/'
test_folder = '/Users/sachin/Desktop/archive/cars_test/cars_test/'

train_files = os.listdir(train_folder)
test_files = os.listdir(test_folder)

# loadmat returns each variable as a 2-D array; row 0 holds the records.
imageRef = np.array(annotations['annotations'])[0]
classNames = np.array(annotations['class_names'])[0]

print(annotations)

imageDf=pd.DataFrame(imageRef)
classNameDf=pd.DataFrame(classNames)

#we could classify the type of car - SUV, Hatchback, Sedan etc..
#we could identify the make of the car - Ford, Audi, etc..

# Now check which makes of the numerical dataset are present in the image dataset.
commonMakes=[]              # one entry per image class that matched a make
imageMake_numericalMake={}  # image class name -> make name in the numerical dataset
makesToBeIgnored=[]

for class_entry in classNameDf[0]:
    class_name = class_entry[0]

    # Every make whose name occurs in the class name, ranked by where it occurs.
    # A plain substring test can hit more than one make for a single class
    # (for example a trim token that is also a make name), so keep only the
    # make that appears earliest; on a tie the longer name wins. Previously
    # such a class was counted once per hit and its mapping was overwritten by
    # whichever make came last in unique_makes.
    candidates = [
        (class_name.find(make), -len(make), make)
        for make in unique_makes
        if isinstance(make, str) and make in class_name  # skip NaN / non-string makes
    ]
    if not candidates:
        continue

    best_make = min(candidates)[2]
    commonMakes.append(best_make)
    imageMake_numericalMake[class_name] = best_make

print("A total of ",len(commonMakes)," models have been identified to have been made by ",len(set(commonMakes))," makes in the numerical dataset")
print("The image make and its corresponding numerical make are as follows:")

# this is a dictionary of makes which is common to both datasets
print(imageMake_numericalMake)


{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sat Feb 28 19:34:55 2015', '__version__': '1.0', '__globals__': [], 'annotations': array([[(array(['car_ims/000001.jpg'], dtype='<U18'), array([[112]], dtype=uint8), array([[7]], dtype=uint8), array([[853]], dtype=uint16), array([[717]], dtype=uint16), array([[1]], dtype=uint8), array([[0]], dtype=uint8)),
        (array(['car_ims/000002.jpg'], dtype='<U18'), array([[48]], dtype=uint8), array([[24]], dtype=uint8), array([[441]], dtype=uint16), array([[202]], dtype=uint8), array([[1]], dtype=uint8), array([[0]], dtype=uint8)),
        (array(['car_ims/000003.jpg'], dtype='<U18'), array([[7]], dtype=uint8), array([[4]], dtype=uint8), array([[277]], dtype=uint16), array([[180]], dtype=uint8), array([[1]], dtype=uint8), array([[0]], dtype=uint8)),
        ...,
        (array(['car_ims/016183.jpg'], dtype='<U18'), array([[25]], dtype=uint8), array([[32]], dtype=uint8), array([[587]], dtype=uint16), array([[359]], dtype=uin

#### Now let's split the images into different folders based on the common makes

In [94]:
# Number the classes from 1 so each class row carries an explicit row number.
classNameDf['RowNumber'] = range(1, len(classNameDf) + 1)

# Look up the numerical-dataset make for every class name; classes without a
# mapping are labelled 'Unknown'.
classNameDf['Brand'] = [
    imageMake_numericalMake.get(class_entry[0], 'Unknown')
    for class_entry in classNameDf[0]
]

print(classNameDf)

print(imageDf)


# List every annotated image with its path, class value and test flag.
for index, row in imageDf.iterrows():
    image_path = row['relative_im_path']
    class_value = row['class'][0][0]  # unwrap the nested 1x1 array
    test_flag = row['test']
    print(f"Index: {index}, relative_path: {image_path}, class: {class_value}, test: {test_flag}")


                                      0  RowNumber       Brand
0          [AM General Hummer SUV 2000]          1      Hummer
1                 [Acura RL Sedan 2012]          2       Acura
2                 [Acura TL Sedan 2012]          3       Acura
3                [Acura TL Type-S 2008]          4       Acura
4                [Acura TSX Sedan 2012]          5       Acura
..                                  ...        ...         ...
191  [Volkswagen Beetle Hatchback 2012]        192  Volkswagen
192          [Volvo C30 Hatchback 2012]        193       Volvo
193              [Volvo 240 Sedan 1993]        194       Volvo
194               [Volvo XC90 SUV 2007]        195       Volvo
195     [smart fortwo Convertible 2012]        196       smart

[196 rows x 3 columns]
           relative_im_path  bbox_x1 bbox_y1  bbox_x2  bbox_y2    class   test
0      [car_ims/000001.jpg]  [[112]]   [[7]]  [[853]]  [[717]]    [[1]]  [[0]]
1      [car_ims/000002.jpg]   [[48]]  [[24]]  [[441]]  [[202]]

## Findings:
The labels of the cars do not match the actual cars.
Other datasets do not have adequate data for brand classification.


#### In the notebook 'CNN Pre-Processing.ipynb', the new datasets and CNN models have been implemented.
Datasets used are:

    Car Damage Severity Dataset:
    https://www.kaggle.com/datasets/prajwalbhamere/car-damage-severity-dataset

    Car Body Type Classification Dataset:
    https://www.kaggle.com/datasets/ademboukhris/cars-body-type-cropped