# Cleaning data for TN_MVR_2018-2022 dataset


In [1]:
# Import necessary libraries
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Path to data folders
## Folder that contains the raw data
OLD_PATH="./data/oldData/"
## Folder that contains the processed data
PROC_PATH="./data/processedData/"

# Import the raw dataset (tn_mvr_2018-2022). The file is read as tab-separated
# (sep="\t") even though it carries a .csv extension.
# The path is built with os.path.join because OLD_PATH already ends with a
# separator; interpolating it as f"{OLD_PATH}/..." produced a doubled slash.
# Columns left commented out in the dtype map are typed by pandas' inference.
df = pd.read_csv(
    os.path.join(OLD_PATH, "tn_mvr_2018-2022.csv"),
    sep="\t",
    dtype={
        "VIN": "str",
        "VehicleCost": "float32",
        "OdometerTypeCode": "category",
        # Nullable integer dtype: the column contains missing values, which a
        # plain NumPy int32 cannot represent.
        "OdometerReading": "Int32",
        "CountyName": "category",
        #"ZIP5",
        "ModelYear": "int16",
        #"MakeCode",
        #"ModelCode",
        "VehicleTypeDescription": "category",
        "NewUsedCode": "category"
        #"TitleIssueDate",
        #"PurchaseDate"
    },
    # Read the whole file in one pass instead of internal chunks so that the
    # inferred dtypes of the untyped columns are not mixed.
    low_memory=False,
)

In [2]:
# Check the data for missing values.
# The per-column NaN counts are computed once and reused for the grand total,
# instead of scanning the whole dataframe twice.
missing_per_column = df.isna().sum()
print("Dataframe Total Missing Values:", missing_per_column.sum())
# The heading and the Series are printed separately: passing both to a single
# print() inserts a separator space before the first row and misaligns it.
print("Dataframe Columns Missing Values:")
print(missing_per_column)

Dataframe Total Missing Values: 333030
Dataframe Columns Missing Values:
 VIN                            4
VehicleCost               331544
OdometerTypeCode               0
OdometerReading                1
CountyName                     0
ZIP5                           0
ModelYear                      0
MakeCode                       0
ModelCode                    586
VehicleTypeDescription         0
NewUsedCode                    0
TitleIssueDate               895
PurchaseDate                   0
dtype: int64


In [3]:
# Rename the dataframe columns.
# Each raw header is mapped to its new name explicitly, so a change in the
# column order of the source file cannot silently mislabel a column (which a
# positional `df.columns = [...]` assignment would do).
COLUMN_RENAMES = {
    "VIN": "vin",
    "VehicleCost": "price",
    "OdometerTypeCode": "odometer_type",
    "OdometerReading": "mileage",
    "CountyName": "county",
    "ZIP5": "zip",
    "ModelYear": "model_year",
    "MakeCode": "make",
    "ModelCode": "model",
    "VehicleTypeDescription": "vehicle_type",
    "NewUsedCode": "new_used",
    "TitleIssueDate": "title_issue_date",
    "PurchaseDate": "purchase_date",
}
# Relabel in place (no copy of the data). Names that are not in the mapping,
# including already-renamed ones when the cell is re-run, are kept as they are.
df.columns = [COLUMN_RENAMES.get(col, col) for col in df.columns]

# Set the date types
# title_issue_date is parsed strictly: a value that does not match the format
# raises instead of being dropped.
df["title_issue_date"] = pd.to_datetime(df["title_issue_date"], format="%Y-%m-%d")
# purchase_date values that cannot be parsed are converted to NaT
# (errors="coerce") rather than aborting the conversion.
df["purchase_date"] = pd.to_datetime(df["purchase_date"], format="%Y-%m-%d", errors="coerce")

In [4]:
# Print out the patterns of the dataset
print("Dataframe Shape and Columns:")
print(df.shape)
print(df.columns)
print()

# Show two randomly drawn rows of the dataset.
# random_state is fixed so the same rows are displayed on every run and the
# saved output stays reproducible after a kernel restart.
print("Dataframe initial values:")
print(df.sample(2, random_state=42))
print()

Dataframe Shape and Columns:
(9831774, 13)
Index(['vin', 'price', 'odometer_type', 'mileage', 'county', 'zip',
       'model_year', 'make', 'model', 'vehicle_type', 'new_used',
       'title_issue_date', 'purchase_date'],
      dtype='object')

Dataframe initial values:
                       vin  price odometer_type  mileage  county    zip  \
1472784  1GCHK23123F115066  500.0             1        0   Lewis  38462   
5856696  KM8JUCAC4CU451332    0.0             0    25818  Sumner  37075   

         model_year  make model vehicle_type new_used title_issue_date  \
1472784        2003  CHEV   SIL         AUTO        U       2020-10-19   
5856696        2012  HYUN   TCN         AUTO        U       2019-05-22   

        purchase_date  
1472784    2020-08-08  
5856696    2013-04-18  



In [5]:
# Get basic info of the dataframe
print("Dataframe Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info(show_counts=False)
print()

Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9831774 entries, 0 to 9831773
Data columns (total 13 columns):
 #   Column            Dtype         
---  ------            -----         
 0   vin               object        
 1   price             float32       
 2   odometer_type     category      
 3   mileage           Int32         
 4   county            category      
 5   zip               object        
 6   model_year        int16         
 7   make              object        
 8   model             object        
 9   vehicle_type      category      
 10  new_used          category      
 11  title_issue_date  datetime64[ns]
 12  purchase_date     datetime64[ns]
dtypes: Int32(1), category(4), datetime64[ns](2), float32(1), int16(1), object(4)
memory usage: 590.7+ MB
None



In [6]:
# Summary statistics of the numeric columns, rounded to one decimal place.
# One print call emits the heading, the table and the trailing blank line,
# each separated by a newline.
print("Numeric Columns:", df.describe().round(1), "", sep="\n")

Numeric Columns:
              price       mileage  model_year
count  9.500230e+06     9831773.0   9831774.0
mean   2.163599e+06         190.1      2010.4
std    2.792438e+09     4452238.4         9.9
min   -7.000000e+00  -225699640.0      1005.0
25%    0.000000e+00           0.0      2005.0
50%    0.000000e+00          10.0      2013.0
75%    1.000000e+03       43145.0      2018.0
max    8.118003e+12  2147483647.0      2890.0



In [7]:
# Summary (count / unique / top / freq) of the object and category columns.
# One print call emits the heading, the table and the trailing blank line,
# each separated by a newline.
non_numeric_kinds = ["object", "category"]
print("Non-numeric Columns:", df.describe(include=non_numeric_kinds), "", sep="\n")

Non-numeric Columns:
                  vin odometer_type   county      zip     make    model  \
count         9831770       9831774  9831774  9831774  9831774  9831188   
unique        7558172             5       97     5968     8580    24683   
top     CCZ264F146470             0    State    37218     FORD      F15   
freq               17       5185393  1086116   172902  1360587   286477   

       vehicle_type new_used  
count       9831774  9831774  
unique           16        2  
top            AUTO        U  
freq        6973439  7907848  



In [8]:
# Describe the dates
print("Date Columns:")
# `datetime_is_numeric` only exists in pandas 1.x, where it is needed to get
# the numeric-style summary (mean / min / quartiles / max) for datetimes.
# pandas 2.0 removed the keyword and always summarises datetimes that way, so
# passing it there raises a TypeError. Only pass it on pandas 1.x.
pandas_major = int(pd.__version__.split(".")[0])
describe_kwargs = {"datetime_is_numeric": True} if pandas_major < 2 else {}
print(df.describe(include=["datetime"], **describe_kwargs))
print()

Date Columns:
                    title_issue_date                  purchase_date
count                        9830879                        9831203
mean   2020-07-08 19:08:27.796668416  2020-02-19 13:33:20.368216320
min              1971-06-12 00:00:00            1720-02-08 00:00:00
25%              2019-04-23 00:00:00            2019-01-15 00:00:00
50%              2020-07-29 00:00:00            2020-05-01 00:00:00
75%              2021-10-04 00:00:00            2021-07-23 00:00:00
max              2022-12-30 00:00:00            2220-03-07 00:00:00

