### ADD TITLE AND DESCRIPTION

In [5]:
# Loading all the libraries
from matplotlib import pyplot as plt
from math import factorial
from scipy import stats as st
import numpy as np
import pandas as pd
import plotly.express as px

In [6]:
# Load the data files into different DataFrames
vehicles = pd.read_csv('../vehicles_us.csv')
vehicles.info()
display(vehicles)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,,2018-10-03,37
51521,2700,2002.0,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,,2018-11-14,22
51522,3950,2009.0,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,,2018-11-15,32
51523,7455,2013.0,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,,2018-07-02,71


### Clean and Fix the Data

In [19]:
# Clean / Fix the data
# Convert 'date_posted' column into datetime type
vehicles['date_posted'] = pd.to_datetime(vehicles['date_posted'], format='%Y-%m-%d')

# Fill in missing values on 'model_year', 'odometer', 'is_4wd', and 'cylinders'
# with 0 and convert them into type int
vehicles['model_year'] = vehicles['model_year'].fillna(0)
if np.array_equal(vehicles['model_year'], vehicles['model_year'].astype('int')):
    vehicles['model_year'] = vehicles['model_year'].astype(int)
vehicles['odometer'] = vehicles['odometer'].fillna(0)
if np.array_equal(vehicles['odometer'], vehicles['odometer'].astype('int')):
    vehicles['odometer'] = vehicles['odometer'].astype(int)
vehicles['is_4wd'] = vehicles['is_4wd'].fillna(0)
if np.array_equal(vehicles['is_4wd'], vehicles['is_4wd'].astype('int')):
    vehicles['is_4wd'] = vehicles['is_4wd'].astype(int)
vehicles['cylinders'] = vehicles['cylinders'].fillna(0)
if np.array_equal(vehicles['cylinders'], vehicles['cylinders'].astype('int')):
    vehicles['cylinders'] = vehicles['cylinders'].astype(int)

# Fill in missing paint_color values with 'unknown'
vehicles['paint_color'] = vehicles['paint_color'].fillna('unknown')

display(vehicles.sample(5))

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
23000,3200,1997,ford f-150,excellent,0,gas,199,manual,pickup,red,0,2018-07-14,27
778,6995,2010,chrysler town & country,excellent,6,gas,83450,automatic,mini-van,silver,0,2018-06-09,11
6081,24983,0,ram 3500,good,6,diesel,109216,automatic,truck,unknown,1,2018-10-21,14
16068,29900,2018,ford mustang gt coupe 2d,good,8,gas,3827,automatic,coupe,yellow,0,2018-10-15,58
20708,29900,2014,ram 2500,good,6,diesel,94323,automatic,truck,blue,1,2018-09-06,70
