# Finalized Class Statistics

Finalized class counts and percentages, broken out by train/test split.

In [2]:
# Packages
import pandas as pd
import os
import dataframe_image as dfi # NOTE: YOU MUST HAVE GOOGLE CHROME INSTALLED FOR THIS TO WORK CORRECTLY

In [3]:
# Function for reading only the Class column from every parquet file in a directory
def combine_directory_parquets_read_class(directory_path):
    '''
    Reads only the 'Class' column from every parquet file in a directory and
    stacks the results into a single dataframe.

    Parameters
    ----------
    directory_path : str
        Directory containing the parquet files; a trailing slash is optional.

    Returns
    -------
    pandas.DataFrame
        A single 'Class' column holding the rows of every file, concatenated
        in file-name order. Each file's own index is kept, so index labels
        can repeat across files.

    Raises
    ------
    ValueError
        If the directory contains no files ending in '.parquet'.
    '''
    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible
    file_list = sorted(f for f in os.listdir(directory_path) if f.endswith('.parquet'))
    # Fail with an explicit message instead of pandas' "No objects to concatenate"
    if not file_list:
        raise ValueError(f"No .parquet files found in '{directory_path}'")
    # Only the 'Class' column is requested from each file
    class_frames = [pd.read_parquet(os.path.join(directory_path, f), columns=['Class']) for f in file_list]
    # Return combined dataframe
    return pd.concat(class_frames)

## Load Data

In [4]:
# Load the Class column of the training files and tag every row with Split = 'Train'
train_data = (
    combine_directory_parquets_read_class('../../../Data/Features/All Features/train/')
    .assign(Split='Train')
)
train_data

Unnamed: 0,Class,Split
0,Sedan,Train
1,SUV,Train
2,Convertible,Train
3,Pickup,Train
4,SUV,Train
...,...,...
370,SUV,Train
371,Sedan,Train
372,Sedan,Train
373,Sedan,Train


In [5]:
# Load the Class column of the test files and tag every row with Split = 'Test'
test_data = (
    combine_directory_parquets_read_class('../../../Data/Features/All Features/test/')
    .assign(Split='Test')
)
test_data

Unnamed: 0,Class,Split
0,Pickup,Test
1,SUV,Test
2,Convertible,Test
3,Convertible,Test
4,Pickup,Test
...,...,...
88,Sedan,Test
89,Pickup,Test
90,Sedan,Test
91,Sedan,Test


## Stack Dataframes and Create Counts

In [9]:
# Folder that receives both exports; created up front so a fresh checkout does not fail on a missing directory
output_dir = '../../../Output/Class Statistics/Finalized/'
os.makedirs(output_dir, exist_ok=True)

# Plain dataframe: number of rows per (Split, Class) and each class's share of its own split
class_counts = (pd.concat([train_data, test_data])
                  .groupby(['Split', 'Class'])
                  .size()
                  .reset_index(name='Count')
                  # Split descending puts 'Train' ahead of 'Test'; classes are alphabetical within each split
                  .sort_values(by=['Split', 'Class'], ascending=[False, True])
                  # Add percent: the denominator is the total count of the row's own split
                  .assign(Percent = lambda x: (x['Count'] / x.groupby('Split')['Count'].transform('sum') * 100))
)

# Styled view of the counts (a pandas Styler, not a DataFrame) used for display and for both exports
test_train_class_cnts = (class_counts
                           .style
                           .format(precision=2, thousands=",", decimal=".")
                           .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
                           .set_properties(**{'text-align': 'center'})
                           .hide(axis="index")
)

# Render the styled table to an image
dfi.export(test_train_class_cnts, output_dir + 'test_train_class_cnts.png')

# Output to Excel as well
test_train_class_cnts.to_excel(output_dir + 'test_train_class_cnts.xlsx', index=False)

test_train_class_cnts

Split,Class,Count,Percent
Train,Convertible,1140,18.99
Train,Pickup,713,11.88
Train,SUV,1616,26.92
Train,Sedan,2534,42.21
Test,Convertible,279,18.74
Test,Pickup,189,12.69
Test,SUV,441,29.62
Test,Sedan,580,38.95
