# Cleaning data for TN_MVR_2018-2022 dataset


In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import time

# Data directories used by this notebook.
OLD_PATH = "./data/oldData/"          # folder that contains the raw data
PROC_PATH = "./data/processedData/"   # folder that contains the processed data

# Load the raw tn_mvr_2018-2022 extract. The file is read as tab-separated even
# though it carries a .csv extension; low_memory=False asks pandas to infer each
# column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv(
    f"{OLD_PATH}/tn_mvr_2018-2022.csv",
    sep="\t",
    low_memory=False,
)

In [2]:
# Row count before any cleaning.
print("Dataframe Total Values (Before Dropping Missing):", df.shape[0])

# Count missing values once per column and reuse the result for both reports;
# every df.isna() call scans the whole frame, so avoid running it twice.
missing_per_column = df.isna().sum()
print("Dataframe Total Missing Values:", missing_per_column.sum())
print("Dataframe Columns Missing Values:\n", missing_per_column)

# Drop every row that has a missing value in any column.
df = df.dropna()

# Row count after dropping the incomplete rows.
print("Dataframe Total Values (After Dropping Missing):", df.shape[0])

Dataframe Total Values (Before Dropping Missing): 9831774
Dataframe Total Missing Values: 333030
Dataframe Columns Missing Values:
 VIN                            4
VehicleCost               331544
OdometerTypeCode               0
OdometerReading                1
CountyName                     0
ZIP5                           0
ModelYear                      0
MakeCode                       0
ModelCode                    586
VehicleTypeDescription         0
NewUsedCode                    0
TitleIssueDate               895
PurchaseDate                   0
dtype: int64
Dataframe Total Values (After Dropping Missing): 9498752


In [3]:
# Replace the source headers with short snake_case names. The assignment is
# positional, so the order below must match the column order of the loaded file.
df.columns = [
    "vin",
    "price",
    "odometer_type",
    "mileage",
    "county",
    "zip",
    "model_year",
    "make",
    "model",
    "vehicle_type",
    "new_used",
    "title_issue_date",
    "purchase_date",
]

# Convert both date columns from text to datetime64 using the YYYY-MM-DD layout.
# Only purchase_date coerces unparseable values to NaT; title_issue_date uses the
# default error handling.
df["title_issue_date"] = pd.to_datetime(
    df["title_issue_date"],
    format="%Y-%m-%d",
)
df["purchase_date"] = pd.to_datetime(
    df["purchase_date"],
    format="%Y-%m-%d",
    errors="coerce",
)

In [4]:
# Report the frame's dimensions and column labels, followed by a blank line.
print("Dataframe Shape and Columns:", df.shape, df.columns, sep="\n", end="\n\n")

# Show two randomly sampled rows as a quick sanity check of the values.
print("Dataframe initial values:", df.sample(2), sep="\n", end="\n\n")

Dataframe Shape and Columns:
(9498752, 13)
Index(['vin', 'price', 'odometer_type', 'mileage', 'county', 'zip',
       'model_year', 'make', 'model', 'vehicle_type', 'new_used',
       'title_issue_date', 'purchase_date'],
      dtype='object')

Dataframe initial values:
                       vin    price  odometer_type  mileage     county    zip  \
982097   1FT7W2BT7JEB04604  60000.0              0  23755.0    Haywood  38012   
3019523  2G1WX15K329339147      0.0              1      0.0  Robertson  37073   

         model_year  make model vehicle_type new_used title_issue_date  \
982097         2018  FORD   F25        TRUCK        U       2020-01-10   
3019523        2002  CHEV   MSS         AUTO        U       2019-09-10   

        purchase_date  
982097     2019-12-27  
3019523    2019-08-10  



In [5]:
# Summarize column dtypes and memory usage (per-column non-null counts are
# suppressed). DataFrame.info() writes its report to stdout itself and returns
# None, so it is called directly: wrapping it in print() appends a stray "None"
# line to the output.
print("Dataframe Info:")
df.info(show_counts=False)
print()

Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9498752 entries, 0 to 9831773
Data columns (total 13 columns):
 #   Column            Dtype         
---  ------            -----         
 0   vin               object        
 1   price             float64       
 2   odometer_type     int64         
 3   mileage           float64       
 4   county            object        
 5   zip               object        
 6   model_year        int64         
 7   make              object        
 8   model             object        
 9   vehicle_type      object        
 10  new_used          object        
 11  title_issue_date  datetime64[ns]
 12  purchase_date     datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int64(2), object(7)
memory usage: 1014.6+ MB
None



In [11]:
# Summary statistics for the numeric columns, rounded to one decimal place.
numeric_summary = df.describe().round(1)
print("Numeric Columns:", numeric_summary, sep="\n", end="\n\n")

Numeric Columns:
              price  odometer_type       mileage  model_year
count  9.498752e+06      9498752.0  9.498752e+06   9498752.0
mean   2.163936e+06            0.5  3.845500e+04      2010.4
std    2.792655e+09            0.8  4.529582e+06        10.0
min   -7.000000e+00            0.0 -2.256996e+08      1005.0
25%    0.000000e+00            0.0  0.000000e+00      2005.0
50%    0.000000e+00            0.0  8.000000e+00      2013.0
75%    1.000000e+03            1.0  3.918600e+04      2018.0
max    8.118003e+12            9.0  2.147484e+09      2890.0



In [7]:
# Count / unique / top / freq summary for the object and categorical columns.
non_numeric_summary = df.describe(include=["object", "category"])
print("Non-numeric Columns:", non_numeric_summary, sep="\n", end="\n\n")

Non-numeric Columns:
                  vin    county      zip     make    model vehicle_type  \
count         9498752   9498752  9498752  9498752  9498752      9498752   
unique        7423553        97     5950     8568    24643           16   
top     CCZ264F146470  Davidson    37211     FORD      F15         AUTO   
freq               17    812796   112367  1318066   280660      6684615   

       new_used  
count   9498752  
unique        2  
top           U  
freq    7575502  



In [8]:
# Summarize the datetime columns as numeric data (mean, min, quartiles, max).
# NOTE(review): the `datetime_is_numeric` keyword depends on the installed pandas
# version (it was removed in pandas 2.0) — confirm the environment's pandas pin.
date_summary = df.describe(include=["datetime"], datetime_is_numeric=True)
print("Date Columns:", date_summary, sep="\n", end="\n\n")

Date Columns:
                    title_issue_date                  purchase_date
count                        9498752                        9498181
mean   2020-07-01 05:41:17.225473536  2020-02-07 20:53:56.619558400
min              1971-06-12 00:00:00            1720-02-08 00:00:00
25%              2019-04-08 00:00:00            2018-12-28 00:00:00
50%              2020-07-21 00:00:00            2020-04-19 00:00:00
75%              2021-09-28 00:00:00            2021-07-14 00:00:00
max              2022-12-30 00:00:00            2220-03-07 00:00:00



In [9]:
# Preview the first rows of the frame after the missing-value drop and column rename.
df.head()

Unnamed: 0,vin,price,odometer_type,mileage,county,zip,model_year,make,model,vehicle_type,new_used,title_issue_date,purchase_date
0,\1FTRX07L53KD87737,0.0,1,0.0,State,37207,2003,FORD,F15,AUTO,U,2018-07-27,2018-06-04
1,0000000V464048305,0.0,1,0.0,State,37167,1995,MITS,PAJ,AUTO,U,2020-06-24,2020-05-13
2,0000161231,1800.0,1,0.0,State,37397,1955,CHEV,BEL,AUTO,U,2021-12-13,2021-12-08
3,0005400FKXX,0.0,1,0.0,Fentress,38553,1955,CHEV,310,AUTO,U,2019-03-07,2019-03-04
4,0096456,300.0,1,0.0,Blount,37804,1966,VOLK,BUG,AUTO,U,2022-11-22,2022-11-15


In [12]:
# Shape of the subset of rows whose recorded price is not zero.
df.loc[df["price"] != 0].shape

(3610711, 13)

In [15]:
# Normalize the make codes to lower case.
df["make"] = df["make"].apply(str.lower)

In [16]:
# Normalize the model codes to lower case.
df["model"] = df["model"].apply(str.lower)

In [31]:
# List the distinct vehicle-type labels present in the data.
df["vehicle_type"].unique()

array(['AUTO', 'MOBILE HOME/HOUSE TR', 'FREIGHT/SEMI TRAILER',
       'UTILITY (BOX/CARGO) ', 'LIVESTOCK/HORSE TRAI', 'OTHER', 'TRUCK',
       'UTILITY (FLAT BED) T', 'MOTORCYCLE', 'CAMPER TRAILER',
       'WATERCRAFT TRAILER', 'BUS - COMMERCIAL', 'RECREATION/MOTOR HOM',
       'Multipurpose Vehicle', 'SCHOOL BUS', 'VAN'], dtype=object)

In [35]:
# Keep only the rows whose vehicle type is one of the labels below; rows with
# any other vehicle-type label are discarded.
kept_vehicle_types = [
    "AUTO",
    "TRUCK",
    "MOTORCYCLE",
    "BUS - COMMERCIAL",
    "Multipurpose Vehicle",
    "VAN",
    "SCHOOL BUS",
    "OTHER",
]
df = df[df["vehicle_type"].isin(kept_vehicle_types)]

In [41]:
# Discard rows whose recorded price is exactly zero.
# NOTE(review): negative prices pass this filter — confirm whether `> 0` was intended.
df = df.loc[df["price"] != 0]

In [42]:
# Rows and columns remaining after the vehicle-type and non-zero-price filters.
df.shape

(3450442, 13)

In [77]:
from vininfo import Vin

In [96]:
def parse_vin_to_manufacturer(vin: str) -> "str | float":
    """Return the lower-cased manufacturer name decoded from a VIN.

    Args:
        vin: Vehicle identification number, decoded with ``vininfo.Vin``.

    Returns:
        The manufacturer name in lower case, or ``np.nan`` when decoding fails
        or yields the ``"UnsupportedBrand"`` placeholder.
    """
    try:
        make = Vin(vin).manufacturer
        if make == "UnsupportedBrand":
            return np.nan
        return make.lower()
    except Exception:
        # Best-effort decode: any VIN the parser rejects is reported as missing.
        # Catch Exception rather than using a bare except so KeyboardInterrupt and
        # SystemExit still propagate and a long-running .apply() can be stopped.
        return np.nan

In [100]:
# Overwrite `make` with the manufacturer decoded from each row's VIN
# (NaN where the VIN cannot be decoded).
df["make"] = df["vin"].apply(parse_vin_to_manufacturer)

In [104]:
# Shape check after overwriting `make` with the VIN-derived manufacturer.
df.shape

(3450442, 13)

In [103]:
# Per-column count of missing values after `make` was replaced by the VIN-derived manufacturer.
df.isna().sum()

vin                      0
price                    0
odometer_type            0
mileage                  0
county                   0
zip                      0
model_year               0
make                170477
model                    0
vehicle_type             0
new_used                 0
title_issue_date         0
purchase_date           70
dtype: int64

In [105]:
# Drop rows with any missing value. Rebind instead of mutating in place: `df` is
# the result of boolean filtering, and the non-inplace form matches the earlier
# `df = df.dropna()` step.
df = df.dropna()

In [106]:
# Rows and columns remaining after dropping the rows with missing values.
df.shape

(3279904, 13)

In [158]:
# Inspect the rows whose VIN-derived manufacturer is "western star".
df.loc[df["make"] == "western star"]

Unnamed: 0,vin,price,odometer_type,mileage,county,zip,model_year,make,model,vehicle_type,new_used,title_issue_date,purchase_date
3054300,2WKEDDXH1YK965488,12000.0,1,0.0,Wayne,38425,2000,western star,con,TRUCK,U,2022-01-05,2021-12-28
3054305,2WLPCCCH8PK931084,800.0,1,0.0,Sevier,37862,1993,western star,aa,TRUCK,U,2019-10-31,2019-10-19
3054306,2WLPCD2G9YK962182,54000.0,1,0.0,Washington,37601,2000,western star,490,TRUCK,U,2018-05-18,2018-05-01
3107634,2WKPDCCH3WK950724,15730.0,1,0.0,Washington,37601,1998,western star,aa,TRUCK,N,2019-01-28,2018-12-20
3107635,2WKPDDPF2WK950924,1000.0,1,0.0,Greene,37818,1998,western star,496,AUTO,U,2020-12-17,2020-10-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9567310,2WKEDDXJ71K966735,10000.0,1,0.0,Morgan,37887,2001,western star,aa,TRUCK,N,2019-04-01,2018-06-29
9567312,2WLNCCBE5RK935174,12500.0,1,200.0,Sumner,37066,1994,western star,480,TRUCK,U,2018-02-02,2017-12-04
9759425,2WKPDCCGXFK912172,3000.0,1,0.0,Blount,37882,1985,western star,490,TRUCK,U,2021-08-04,2021-06-10
9759429,2WLNCCCF6WK950611,25000.0,1,0.0,Wilson,37090,1998,western star,486,TRUCK,U,2018-03-08,2018-03-07


In [155]:
# Alphabetically last five distinct manufacturer names.
sorted(df["make"].unique())[-5:]

['porsche suv',
 'quattro',
 'ram',
 'renault',
 'renault samsung',
 'roadrunner hay squeeze',
 'rolls royce',
 'rover',
 'ruf automobile',
 'saab',
 'saturn',
 'scion',
 'smart',
 'sterling',
 'subaru',
 'subaru-isuzu automotive',
 'suzuki',
 'suzuki / santana motors',
 'suzuki canada',
 'tesla',
 'toyota',
 'toyota mexico',
 'triumph',
 'volkswagen',
 'volkswagen commercial vehicles',
 'volkswagen trucks',
 'volkswagen usa (commercials)',
 'volvo',
 'volvo cars',
 'volvo china',
 'volvo eicher commercial vehicles limited.',
 'western star',
 'yamaha',
 'zastava / yugo']

In [None]:
# TODO: unfinished cell — the comparison had no right-hand value, which is a
# syntax error and stops a full notebook run. Fill in the model code to inspect
# and uncomment:
# df[df.model == "<model code>"]