# Cleaning data for TN_MVR_2018-2022 dataset


In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import time

# Data directories
## Raw (unprocessed) input files live here
OLD_PATH="./data/oldData/"
## Cleaned / processed output files are written here
PROC_PATH="./data/processedData/"

# Dtypes pinned at parse time. Columns that are not listed (ZIP5, MakeCode,
# ModelCode, TitleIssueDate, PurchaseDate) are left to pandas' type inference.
MVR_DTYPES = {
    "VIN": "str",
    "VehicleCost": "float32",
    "OdometerTypeCode": "category",
    "OdometerReading": "Int32",  # capital-I Int32 is pandas' nullable integer dtype
    "CountyName": "category",
    "ModelYear": "int16",
    "VehicleTypeDescription": "category",
    "NewUsedCode": "category",
}

# Load the tab-separated tn_mvr_2018-2022 export in a single pass
# (low_memory=False parses the file whole rather than in internal chunks)
MVR_CSV = f"{OLD_PATH}/tn_mvr_2018-2022.csv"
df = pd.read_csv(MVR_CSV, sep="\t", dtype=MVR_DTYPES, low_memory=False)

In [2]:
# Row count before any cleaning
print("Dataframe Total Values (Before Dropping Missing):", df.shape[0])

# Count missing values per column once and reuse the result for both reports,
# instead of scanning the whole frame twice
missing_per_column = df.isna().sum()
print("Dataframe Total Missing Values:", missing_per_column.sum())
# Header and Series are printed separately: passing both to a single print()
# inserts a separator space after the newline and misaligns the first row
print("Dataframe Columns Missing Values:")
print(missing_per_column)

# Remove the missing values (drop every row with at least one missing value)
df = df.dropna()

# Row count after dropping incomplete rows
print("Dataframe Total Values (After Dropping Missing):", df.shape[0])

Dataframe Total Values (Before Dropping Missing): 9831774
Dataframe Total Missing Values: 333030
Dataframe Columns Missing Values:
 VIN                            4
VehicleCost               331544
OdometerTypeCode               0
OdometerReading                1
CountyName                     0
ZIP5                           0
ModelYear                      0
MakeCode                       0
ModelCode                    586
VehicleTypeDescription         0
NewUsedCode                    0
TitleIssueDate               895
PurchaseDate                   0
dtype: int64
Dataframe Total Values (After Dropping Missing): 9498752


In [3]:
# Rename the dataframe columns.
# An explicit raw-name -> clean-name mapping is used instead of a positional
# list, so a change in the CSV's column order cannot silently mislabel data.
# Names that are already clean pass through unchanged, which keeps this cell
# safe to re-run.
COLUMN_RENAMES = {
    "VIN": "vin",
    "VehicleCost": "price",
    "OdometerTypeCode": "odometer_type",
    "OdometerReading": "mileage",
    "CountyName": "county",
    "ZIP5": "zip",
    "ModelYear": "model_year",
    "MakeCode": "make",
    "ModelCode": "model",
    "VehicleTypeDescription": "vehicle_type",
    "NewUsedCode": "new_used",
    "TitleIssueDate": "title_issue_date",
    "PurchaseDate": "purchase_date",
}
df.columns = [COLUMN_RENAMES.get(col, col) for col in df.columns]

# Fail loudly if the input did not contain every expected column
missing_columns = set(COLUMN_RENAMES.values()) - set(df.columns)
assert not missing_columns, f"Expected columns missing after rename: {sorted(missing_columns)}"

# Set the date types
# title_issue_date is parsed strictly: any value not matching YYYY-MM-DD raises
df["title_issue_date"] = pd.to_datetime(df["title_issue_date"], format="%Y-%m-%d")
# purchase_date is parsed leniently: values that cannot be converted become NaT.
# NOTE(review): these NaT values are created after the earlier dropna(), so
# purchase_date can contain missing values again - confirm this is handled later.
df["purchase_date"] = pd.to_datetime(df["purchase_date"], format="%Y-%m-%d", errors="coerce")

In [4]:
# Report the frame's dimensions and column labels, followed by a blank line
print("Dataframe Shape and Columns:", df.shape, df.columns, "", sep="\n")

# Show two randomly drawn rows (the draw is unseeded, so it changes per run)
print("Dataframe initial values:", df.sample(2), "", sep="\n")

Dataframe Shape and Columns:
(9498752, 13)
Index(['vin', 'price', 'odometer_type', 'mileage', 'county', 'zip',
       'model_year', 'make', 'model', 'vehicle_type', 'new_used',
       'title_issue_date', 'purchase_date'],
      dtype='object')

Dataframe initial values:
                       vin  price odometer_type  mileage    county    zip  \
4949566  4S4WMAPD4M3424651    0.0             0      401    Shelby  38104   
3327341        3A35Q326122    0.0             1        0  Sullivan  37664   

         model_year  make model vehicle_type new_used title_issue_date  \
4949566        2021  SUBA   ASC         AUTO        N       2021-03-24   
3327341        1973  FORD   TOR         AUTO        U       2018-01-31   

        purchase_date  
4949566    2021-01-29  
3327341    2017-09-14  



In [5]:
# Get basic info of the dataframe (dtypes and memory usage, without null counts)
print("Dataframe Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() - doing so emits a stray "None" line.
df.info(show_counts=False)
print()

Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9498752 entries, 0 to 9831773
Data columns (total 13 columns):
 #   Column            Dtype         
---  ------            -----         
 0   vin               object        
 1   price             float32       
 2   odometer_type     category      
 3   mileage           Int32         
 4   county            category      
 5   zip               object        
 6   model_year        int16         
 7   make              object        
 8   model             object        
 9   vehicle_type      category      
 10  new_used          category      
 11  title_issue_date  datetime64[ns]
 12  purchase_date     datetime64[ns]
dtypes: Int32(1), category(4), datetime64[ns](2), float32(1), int16(1), object(4)
memory usage: 643.2+ MB
None



In [6]:
# Summary statistics for the numeric columns, rounded to one decimal place
numeric_summary = df.describe().round(1)
print("Numeric Columns:", numeric_summary, "", sep="\n")

Numeric Columns:
              price       mileage  model_year
count  9.498752e+06     9498752.0   9498752.0
mean   2.163936e+06       38455.0      2010.4
std    2.792655e+09     4529582.0        10.0
min   -7.000000e+00  -225699640.0      1005.0
25%    0.000000e+00           0.0      2005.0
50%    0.000000e+00           8.0      2013.0
75%    1.000000e+03       39186.0      2018.0
max    8.118003e+12  2147483647.0      2890.0



In [7]:
# Summary (count / unique / top / freq) for the string and categorical columns
text_like_dtypes = ["object", "category"]
categorical_summary = df.describe(include=text_like_dtypes)
print("Non-numeric Columns:")
print(categorical_summary, end="\n\n")

Non-numeric Columns:
                  vin odometer_type    county      zip     make    model  \
count         9498752       9498752   9498752  9498752  9498752  9498752   
unique        7423553             5        97     5950     8568    24643   
top     CCZ264F146470             0  Davidson    37211     FORD      F15   
freq               17       4971518    812796   112367  1318066   280660   

       vehicle_type new_used  
count       9498752  9498752  
unique           16        2  
top            AUTO        U  
freq        6684615  7575502  



In [8]:
# Describe the dates
print("Date Columns:")
try:
    # pandas 1.x needs datetime_is_numeric=True to summarise datetimes with
    # mean / min / quantiles / max instead of top / freq / first / last
    date_summary = df.describe(include=["datetime"], datetime_is_numeric=True)
except TypeError:
    # pandas >= 2.0 removed the keyword; datetimes are always summarised
    # numerically there, so the plain call yields the same table
    date_summary = df.describe(include=["datetime"])
print(date_summary)
print()

Date Columns:
                    title_issue_date                  purchase_date
count                        9498752                        9498181
mean   2020-07-01 05:41:17.225473536  2020-02-07 20:53:56.619558400
min              1971-06-12 00:00:00            1720-02-08 00:00:00
25%              2019-04-08 00:00:00            2018-12-28 00:00:00
50%              2020-07-21 00:00:00            2020-04-19 00:00:00
75%              2021-09-28 00:00:00            2021-07-14 00:00:00
max              2022-12-30 00:00:00            2220-03-07 00:00:00

