In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("car_data.csv")


In [3]:
data.shape

(892, 6)

In [4]:
data.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [6]:
data.isna().sum()

name           0
company        0
year           0
Price          0
kms_driven    52
fuel_type     55
dtype: int64

In [7]:
data.duplicated().sum()

94

In [8]:
data.drop_duplicates(inplace=True)

In [9]:
data.duplicated().sum()

0

In [10]:
data.isna().sum()

name           0
company        0
year           0
Price          0
kms_driven    50
fuel_type     53
dtype: int64

## examine the columns

In [11]:
data.name.unique()

array(['Hyundai Santro Xing XO eRLX Euro III', 'Mahindra Jeep CL550 MDI',
       'Maruti Suzuki Alto 800 Vxi',
       'Hyundai Grand i10 Magna 1.2 Kappa VTVT',
       'Ford EcoSport Titanium 1.5L TDCi', 'Ford Figo', 'Hyundai Eon',
       'Ford EcoSport Ambiente 1.5L TDCi',
       'Maruti Suzuki Alto K10 VXi AMT', 'Skoda Fabia Classic 1.2 MPI',
       'Maruti Suzuki Stingray VXi', 'Hyundai Elite i20 Magna 1.2',
       'Mahindra Scorpio SLE BS IV', 'Audi A8', 'Audi Q7',
       'Mahindra Scorpio S10', 'Maruti Suzuki Alto 800',
       'Hyundai i20 Sportz 1.2', 'Maruti Suzuki Alto 800 Lx',
       'Maruti Suzuki Vitara Brezza ZDi', 'Maruti Suzuki Alto LX',
       'Mahindra Bolero DI', 'Maruti Suzuki Swift Dzire ZDi',
       'Mahindra Scorpio S10 4WD', 'Maruti Suzuki Swift Vdi BSIII',
       'Maruti Suzuki Wagon R VXi BS III',
       'Maruti Suzuki Wagon R VXi Minor',
       'Toyota Innova 2.0 G 8 STR BS IV', 'Renault Lodgy 85 PS RXL',
       'Skoda Yeti Ambition 2.0 TDI CR 4x2',
       'Maru

### name col has some illogical names

In [12]:
data.name = data.name.str.split(" ").str[:3].str.join(" ")

In [13]:
data

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium,Ford,2014,575000,"36,000 kms",Diesel
...,...,...,...,...,...,...
887,Ta,Tara,zest,310000,,
888,Tata Zest XM,Tata,2018,260000,"27,000 kms",Diesel
889,Mahindra Quanto C8,Mahindra,2013,390000,"40,000 kms",Diesel
890,Honda Amaze 1.2,Honda,2014,180000,Petrol,


## company

In [14]:
data.company.unique()

array(['Hyundai', 'Mahindra', 'Maruti', 'Ford', 'Skoda', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'I', 'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat',
       'Commercial', 'MARUTI', 'Force', 'Mercedes', 'Land', 'Yamaha',
       'selling', 'URJENT', 'Swift', 'Used', 'Jaguar', 'Jeep', 'tata',
       'Sale', 'very', 'Volvo', 'i', '2012', 'Well', 'all', '7', '9',
       'scratch', 'urgent', 'sell', 'TATA', 'Any', 'Tara'], dtype=object)

### year

In [15]:
data.year

0      2007
1      2006
2      2018
3      2014
4      2014
       ... 
887    zest
888    2018
889    2013
890    2014
891    2014
Name: year, Length: 798, dtype: object

In [16]:
data.year.astype(int)

ValueError: invalid literal for int() with base 10: '...'

In [17]:
data[data.year.str.isnumeric()]["year"].unique()

array(['2007', '2006', '2018', '2014', '2015', '2012', '2013', '2016',
       '2010', '2017', '2008', '2011', '2019', '2009', '2005', '2000',
       '2003', '2004', '1995', '2002', '2001'], dtype=object)

### year has some non numeric values we deleted all those values 

In [18]:
new_data = data[data.year.str.isnumeric()]

In [19]:
new_data.year = new_data.year.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data.year = new_data.year.astype(int)


In [20]:
new_data.year.unique()

array([2007, 2006, 2018, 2014, 2015, 2012, 2013, 2016, 2010, 2017, 2008,
       2011, 2019, 2009, 2005, 2000, 2003, 2004, 1995, 2002, 2001])

### price column

In [21]:
new_data.Price.unique()

array(['80,000', '4,25,000', 'Ask For Price', '3,25,000', '5,75,000',
       '1,75,000', '1,90,000', '8,30,000', '2,50,000', '1,82,000',
       '3,15,000', '4,15,000', '3,20,000', '10,00,000', '5,00,000',
       '3,50,000', '1,60,000', '3,10,000', '75,000', '1,00,000',
       '2,90,000', '95,000', '1,80,000', '3,85,000', '1,05,000',
       '6,50,000', '6,89,999', '4,48,000', '5,49,000', '5,01,000',
       '4,89,999', '2,80,000', '3,49,999', '2,84,999', '3,45,000',
       '4,99,999', '2,35,000', '2,49,999', '14,75,000', '3,95,000',
       '2,20,000', '1,70,000', '85,000', '2,00,000', '5,70,000',
       '1,10,000', '4,48,999', '18,91,111', '1,59,500', '3,44,999',
       '4,49,999', '8,65,000', '6,99,000', '3,75,000', '2,24,999',
       '12,00,000', '1,95,000', '3,51,000', '2,40,000', '90,000',
       '1,55,000', '6,00,000', '1,89,500', '2,10,000', '3,90,000',
       '1,35,000', '16,00,000', '7,01,000', '2,65,000', '5,25,000',
       '3,72,000', '6,35,000', '5,50,000', '4,85,000', '3,29,5

In [22]:
new_data[new_data.Price =="Ask For Price"]

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
2,Maruti Suzuki Alto,Maruti,2018,Ask For Price,"22,000 kms",Petrol
5,Ford EcoSport Titanium,Ford,2015,Ask For Price,"59,000 kms",Diesel
69,I want to,I,2017,Ask For Price,,
138,Maruti Suzuki Alto,Maruti,2018,Ask For Price,"28,028 kms",Petrol
295,Maruti Suzuki Swift,Maruti,2010,Ask For Price,"52,000 kms",Diesel
304,Tata Indica eV2,Tata,2017,Ask For Price,"84,000 kms",Diesel
388,Maruti Suzuki Alto,Maruti,2018,Ask For Price,"24,000 kms",Petrol
449,Maruti Suzuki Zen,Maruti,2011,Ask For Price,"16,000 kms",Petrol
503,Hyundai Xcent Base,Hyundai,2015,Ask For Price,"1,80,000 kms",Diesel
511,Hyundai Xcent S,Hyundai,2015,Ask For Price,"35,000 kms",Petrol


In [23]:
new_data[new_data.Price =="Ask For Price"].shape

(22, 6)

In [24]:
askForPriceRecords = new_data[new_data.Price =="Ask For Price"]

In [25]:
new_data = new_data[new_data.Price !="Ask For Price"]

In [26]:
new_data.Price.unique()

array(['80,000', '4,25,000', '3,25,000', '5,75,000', '1,75,000',
       '1,90,000', '8,30,000', '2,50,000', '1,82,000', '3,15,000',
       '4,15,000', '3,20,000', '10,00,000', '5,00,000', '3,50,000',
       '1,60,000', '3,10,000', '75,000', '1,00,000', '2,90,000', '95,000',
       '1,80,000', '3,85,000', '1,05,000', '6,50,000', '6,89,999',
       '4,48,000', '5,49,000', '5,01,000', '4,89,999', '2,80,000',
       '3,49,999', '2,84,999', '3,45,000', '4,99,999', '2,35,000',
       '2,49,999', '14,75,000', '3,95,000', '2,20,000', '1,70,000',
       '85,000', '2,00,000', '5,70,000', '1,10,000', '4,48,999',
       '18,91,111', '1,59,500', '3,44,999', '4,49,999', '8,65,000',
       '6,99,000', '3,75,000', '2,24,999', '12,00,000', '1,95,000',
       '3,51,000', '2,40,000', '90,000', '1,55,000', '6,00,000',
       '1,89,500', '2,10,000', '3,90,000', '1,35,000', '16,00,000',
       '7,01,000', '2,65,000', '5,25,000', '3,72,000', '6,35,000',
       '5,50,000', '4,85,000', '3,29,500', '2,51,111', 

In [27]:
new_data.Price = new_data.Price.str.replace(",","").astype(int)

In [28]:
new_data.Price

0       80000
1      425000
3      325000
4      575000
6      175000
        ...  
886    300000
888    260000
889    390000
890    180000
891    160000
Name: Price, Length: 727, dtype: int32

### Kilometer driven column

In [29]:
new_data.kms_driven

0        45,000 kms
1            40 kms
3        28,000 kms
4        36,000 kms
6        41,000 kms
           ...     
886    1,32,000 kms
888      27,000 kms
889      40,000 kms
890          Petrol
891          Petrol
Name: kms_driven, Length: 727, dtype: object

### remove values "Petrol"

In [30]:
new_data = new_data[new_data.kms_driven != "Petrol"]

In [31]:
new_data.kms_driven

0        45,000 kms
1            40 kms
3        28,000 kms
4        36,000 kms
6        41,000 kms
           ...     
883      50,000 kms
885      30,000 kms
886    1,32,000 kms
888      27,000 kms
889      40,000 kms
Name: kms_driven, Length: 725, dtype: object

In [32]:
new_data.kms_driven = new_data.kms_driven.str.replace("kms","").str.replace(",","")

In [33]:
new_data

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
883,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
885,Tata Indica V2,Tata,2009,110000,30000,Diesel
886,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
888,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [34]:
new_data.isna().sum()

name          0
company       0
year          0
Price         0
kms_driven    0
fuel_type     1
dtype: int64

In [35]:
new_data.dropna(inplace=True)

### setting mean of all companies mean to Price of ask for price records

In [36]:
askForPriceRecords

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
2,Maruti Suzuki Alto,Maruti,2018,Ask For Price,"22,000 kms",Petrol
5,Ford EcoSport Titanium,Ford,2015,Ask For Price,"59,000 kms",Diesel
69,I want to,I,2017,Ask For Price,,
138,Maruti Suzuki Alto,Maruti,2018,Ask For Price,"28,028 kms",Petrol
295,Maruti Suzuki Swift,Maruti,2010,Ask For Price,"52,000 kms",Diesel
304,Tata Indica eV2,Tata,2017,Ask For Price,"84,000 kms",Diesel
388,Maruti Suzuki Alto,Maruti,2018,Ask For Price,"24,000 kms",Petrol
449,Maruti Suzuki Zen,Maruti,2011,Ask For Price,"16,000 kms",Petrol
503,Hyundai Xcent Base,Hyundai,2015,Ask For Price,"1,80,000 kms",Diesel
511,Hyundai Xcent S,Hyundai,2015,Ask For Price,"35,000 kms",Petrol


In [37]:
mean_price = new_data.groupby("company")["Price"].mean().mean()
askForPriceRecords.replace("Ask For Price",mean_price,inplace=True)
askForPriceRecords

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
2,Maruti Suzuki Alto,Maruti,2018,805131.758649,"22,000 kms",Petrol
5,Ford EcoSport Titanium,Ford,2015,805131.758649,"59,000 kms",Diesel
69,I want to,I,2017,805131.758649,,
138,Maruti Suzuki Alto,Maruti,2018,805131.758649,"28,028 kms",Petrol
295,Maruti Suzuki Swift,Maruti,2010,805131.758649,"52,000 kms",Diesel
304,Tata Indica eV2,Tata,2017,805131.758649,"84,000 kms",Diesel
388,Maruti Suzuki Alto,Maruti,2018,805131.758649,"24,000 kms",Petrol
449,Maruti Suzuki Zen,Maruti,2011,805131.758649,"16,000 kms",Petrol
503,Hyundai Xcent Base,Hyundai,2015,805131.758649,"1,80,000 kms",Diesel
511,Hyundai Xcent S,Hyundai,2015,805131.758649,"35,000 kms",Petrol


In [38]:
askForPriceRecords.dropna(inplace=True)
askForPriceRecords

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
2,Maruti Suzuki Alto,Maruti,2018,805131.758649,"22,000 kms",Petrol
5,Ford EcoSport Titanium,Ford,2015,805131.758649,"59,000 kms",Diesel
138,Maruti Suzuki Alto,Maruti,2018,805131.758649,"28,028 kms",Petrol
295,Maruti Suzuki Swift,Maruti,2010,805131.758649,"52,000 kms",Diesel
304,Tata Indica eV2,Tata,2017,805131.758649,"84,000 kms",Diesel
388,Maruti Suzuki Alto,Maruti,2018,805131.758649,"24,000 kms",Petrol
449,Maruti Suzuki Zen,Maruti,2011,805131.758649,"16,000 kms",Petrol
503,Hyundai Xcent Base,Hyundai,2015,805131.758649,"1,80,000 kms",Diesel
511,Hyundai Xcent S,Hyundai,2015,805131.758649,"35,000 kms",Petrol
524,Toyota Innova 2.0,Toyota,2009,805131.758649,"15,574 kms",Diesel


In [39]:
askForPriceRecords.kms_driven =  askForPriceRecords.kms_driven.str.replace("kms","").str.replace(",","")

In [40]:
ready_data = pd.concat([new_data,askForPriceRecords])

In [41]:
ready_data

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000.000000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000.000000,40,Diesel
3,Hyundai Grand i10,Hyundai,2014,325000.000000,28000,Petrol
4,Ford EcoSport Titanium,Ford,2014,575000.000000,36000,Diesel
6,Ford Figo,Ford,2012,175000.000000,41000,Diesel
...,...,...,...,...,...,...
826,Renault Lodgy,Renault,2016,805131.758649,20000,Diesel
848,Maruti Suzuki Alto,Maruti,2019,805131.758649,1500,Petrol
854,Volkswagen Vento Highline,Volkswagen,2015,805131.758649,38900,Diesel
859,Toyota Innova 2.0,Toyota,2019,805131.758649,4000,Petrol


In [42]:
ready_data.reset_index(drop=True,inplace=True)

In [43]:
ready_data.to_excel("cleaned_data.xlsx")