In [3]:
# Import required labels
import pandas as pd
import re

# Raw data for 2016 (3GB)
data_file = "data/test_result_2016.txt"

with open(data_file, 'r') as data_input:
    for lines in range(3):
        line = data_input.readline()
        print(line)

test_id|vehicle_id|test_date|test_class_id|test_type|test_result|test_mileage|postcode_area|make|model|colour|fuel_type|cylinder_capacity|first_use_date

1645480751|1374211238|2016-01-01|4|NT|P|117033|SM|VOLKSWAGEN|POLO|BLACK|PE|1600|2000-06-23

1393462389|1153769898|2016-01-01|4|NT|P|99292|NE|VOLKSWAGEN|PASSAT|BLUE|DI|1968|2006-11-30



In [4]:
# Initialize counts
sample_count = 0
testing_validate_count = 0
ensemble_count = 0

# Output files
training_file = "data/MOT_result_2016_training.csv"
validation_file = "data/MOT_result_2016_validation.csv"
ensemble_file = "data/MOT_result_2016_ensemble.csv"
testing_file = "data/MOT_result_2016_testing.csv"

# Sample Ratio (very large file, so sample just 1.5%)
sample = 0.015

# Split Ratio (10% for validation & testing, 5% for ensemble, 75% left for training)
testing_validation_split = 0.1
ensemble_split = 0.05

# Use write mode and headers for 1st dataframe only
mode = 'w'
header = True

# Process the file in chunks
chunksize = 10**5

for chunk in pd.read_csv(data_file, sep='|', chunksize=chunksize, error_bad_lines=False):
    if mode == 'w': # 1st dataframe sampled, set random state
        chunk = chunk.query('test_class_id == 4').sample(frac=sample, random_state = 21)
    else:
        chunk = chunk.query('test_class_id == 4').sample(frac=sample)
    # Split data into training, validation, ensemble & testing
    chunk_len = len(chunk)
    testing_validate_len = int(chunk_len * testing_validation_split)
    ensemble_len = int(chunk_len * ensemble_split)
    # Testing Data
    start = 0
    end = testing_validate_len
    chunk[start:end].to_csv(testing_file, index=False, mode=mode, header=header)
    # Ensemble Data
    start = end + 1
    end = testing_validate_len + ensemble_len
    chunk[start:end].to_csv(ensemble_file, index=False, mode=mode, header=header)
    # Validation Data
    start = end + 1
    end = 2 * testing_validate_len + ensemble_len
    chunk[start:end].to_csv(validation_file, index=False, mode=mode, header=header)
    # Training Data
    start = end + 1
    chunk[start:].to_csv(training_file, index=False, mode=mode, header=header)
    # Set mode for writing CSV file to append for subsequent samples & don't rewrite headers
    mode = 'a'
    header = False
    # Update counts
    sample_count += chunk_len
    testing_validate_count += testing_validate_len
    ensemble_count += ensemble_len

# Subtracting testing and validation counts to get training counts
training_count = sample_count - 2 * testing_validate_count
print('The number of samples taken was ' + "{:,}".format(sample_count))
print('This was split as follows:')
print('  Training - ' + "{:,}".format(training_count))
print('  Validation - ' + "{:,}".format(testing_validate_count))
print('  Ensemble - ' + "{:,}".format(ensemble_count))
print('  Testing - ' + "{:,}".format(testing_validate_count))

b'Skipping line 35021189: expected 14 fields, saw 16\n'
b'Skipping line 37070120: expected 14 fields, saw 16\n'


The number of samples taken was 534,557
This was split as follows:
  Training - 427,993
  Validation - 53,282
  Ensemble - 26,548
  Testing - 53,282


In [5]:
postcode_file = "data/National_Statistics_Postcode_Lookup_UK.csv"
postcode_area_file = "data/Postcode Area.csv"

postcode_df = pd.read_csv(postcode_file)
postcode_df['Postcode Area'] = (postcode_df['Postcode 3'].str.extract('([A-Z]+)', expand=True))
postcode_df = postcode_df.groupby(['Postcode Area']).first()[['Local Authority Name', 'Country Name', 'Region Name']]

postcode_df.loc[postcode_df['Country Name'] == 'England','Country_Region'] = postcode_df.loc[postcode_df['Country Name'] == 'England','Region Name']
postcode_df.loc[postcode_df['Country Name'] != 'England','Country_Region'] = postcode_df.loc[postcode_df['Country Name'] != 'England','Country Name']
postcode_df.to_csv(postcode_area_file)
postcode_df.head(10)


Unnamed: 0_level_0,Local Authority Name,Country Name,Region Name,Country_Region
Postcode Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AB,Aberdeen City,Scotland,,Scotland
AL,St Albans,England,East of England,East of England
B,Birmingham,England,West Midlands,West Midlands
BA,Bath and North East Somerset,England,South West,South West
BB,Hyndburn,England,North West,North West
BD,Bradford,England,Yorkshire and The Humber,Yorkshire and The Humber
BH,Poole,England,South West,South West
BL,Bolton,England,North West,North West
BN,Lewes,England,South East,South East
BR,Bromley,England,London,London
