In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [4]:
cars_data = pd.read_csv('C:\ML Project\German cars\germany_cars.csv')

In [5]:
cars_data.head()

Unnamed: 0,mileage,make,model,fuel,gear,offerType,price,hp,year
0,235000,BMW,316,Diesel,Manual,Used,6800,116.0,2011
1,92800,Volkswagen,Golf,Gasoline,Manual,Used,6877,122.0,2011
2,149300,SEAT,Exeo,Gasoline,Manual,Used,6900,160.0,2011
3,96200,Renault,Megane,Gasoline,Manual,Used,6950,110.0,2011
4,156000,Peugeot,308,Gasoline,Manual,Used,6950,156.0,2011


In [6]:
cars_data.isnull().sum()

mileage        0
make           0
model        143
fuel           0
gear         182
offerType      0
price          0
hp            29
year           0
dtype: int64

In [None]:
cars_data.dropna(inplace=True)                #its job is to remove rows or columns that contain any NaN (Not a Number/Missing) values.

In [8]:
cars_data.shape

(46071, 9)

In [9]:
#Duplicate Check
cars_data.duplicated().sum()

np.int64(2124)

In [10]:
cars_data.drop_duplicates(inplace=True)

In [11]:
cars_data.shape

(43947, 9)

In [12]:
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43947 entries, 0 to 46399
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   mileage    43947 non-null  int64  
 1   make       43947 non-null  object 
 2   model      43947 non-null  object 
 3   fuel       43947 non-null  object 
 4   gear       43947 non-null  object 
 5   offerType  43947 non-null  object 
 6   price      43947 non-null  int64  
 7   hp         43947 non-null  float64
 8   year       43947 non-null  int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 3.4+ MB


In [13]:
#Data Analysis
for col in cars_data.columns:
    print('Unique values of ' + col)
    print(cars_data[col].unique())
    print("======================")

Unique values of mileage
[235000  92800 149300 ...   7612    550    837]
Unique values of make
['BMW' 'Volkswagen' 'SEAT' 'Renault' 'Peugeot' 'Toyota' 'Opel' 'Mazda'
 'Ford' 'Mercedes-Benz' 'Chevrolet' 'Audi' 'Fiat' 'Kia' 'Dacia' 'MINI'
 'Hyundai' 'Skoda' 'Citroen' 'Infiniti' 'Suzuki' 'SsangYong' 'smart'
 'Cupra' 'Volvo' 'Jaguar' 'Porsche' 'Nissan' 'Honda' 'Mitsubishi' 'Lexus'
 'Jeep' 'Maserati' 'Bentley' 'Land' 'Alfa' 'Subaru' 'Dodge' 'Microcar'
 'Lamborghini' 'Lada' 'Tesla' 'Chrysler' 'McLaren' 'Aston' 'Rolls-Royce'
 'Lancia' 'Abarth' 'DS' 'Daihatsu' 'Ligier' 'Ferrari' 'Aixam' 'Zhidou'
 'Morgan' 'Maybach' 'RAM' 'Alpina' 'Polestar' 'Brilliance' 'Piaggio'
 'FISKER' 'Others' 'Cadillac' 'Iveco' 'Isuzu' 'Corvette' 'Baic' 'DFSK'
 'Estrima' 'Alpine']
Unique values of model
['316' 'Golf' 'Exeo' 'Megane' '308' 'Auris' 'Scenic' 'Zafira' '3'
 'Transit' 'Meriva' 'E 250' 'Orlando' 'A4' 'Polo' 'Espace' 'Grand Espace'
 'Sedici' 'Corsa' 'Picanto' 'Duster' 'Cross Touran' 'Beetle'
 'Golf Cabriolet' '1

In [14]:
def get_brand_name(car_name):
    car_name = car_name.split(' ')[0]
    return car_name.strip()

In [15]:
def clean_data(value):
    value = value.split(' ')[0]
    value = value.strip()
    if value == '':
        value = 0
    return float(value)

In [16]:
get_brand_name('Maruti  Swift Dzire VDI')

'Maruti'

In [17]:
cars_data['make'] = cars_data['make'].apply(get_brand_name)

In [18]:
for col in cars_data.columns:
    print('Unique values of ' + col)
    print(cars_data[col].unique())
    print("======================")

Unique values of mileage
[235000  92800 149300 ...   7612    550    837]
Unique values of make
['BMW' 'Volkswagen' 'SEAT' 'Renault' 'Peugeot' 'Toyota' 'Opel' 'Mazda'
 'Ford' 'Mercedes-Benz' 'Chevrolet' 'Audi' 'Fiat' 'Kia' 'Dacia' 'MINI'
 'Hyundai' 'Skoda' 'Citroen' 'Infiniti' 'Suzuki' 'SsangYong' 'smart'
 'Cupra' 'Volvo' 'Jaguar' 'Porsche' 'Nissan' 'Honda' 'Mitsubishi' 'Lexus'
 'Jeep' 'Maserati' 'Bentley' 'Land' 'Alfa' 'Subaru' 'Dodge' 'Microcar'
 'Lamborghini' 'Lada' 'Tesla' 'Chrysler' 'McLaren' 'Aston' 'Rolls-Royce'
 'Lancia' 'Abarth' 'DS' 'Daihatsu' 'Ligier' 'Ferrari' 'Aixam' 'Zhidou'
 'Morgan' 'Maybach' 'RAM' 'Alpina' 'Polestar' 'Brilliance' 'Piaggio'
 'FISKER' 'Others' 'Cadillac' 'Iveco' 'Isuzu' 'Corvette' 'Baic' 'DFSK'
 'Estrima' 'Alpine']
Unique values of model
['316' 'Golf' 'Exeo' 'Megane' '308' 'Auris' 'Scenic' 'Zafira' '3'
 'Transit' 'Meriva' 'E 250' 'Orlando' 'A4' 'Polo' 'Espace' 'Grand Espace'
 'Sedici' 'Corsa' 'Picanto' 'Duster' 'Cross Touran' 'Beetle'
 'Golf Cabriolet' '1

In [17]:
# Automatic way (assigns 1, 2, 3... automatically)
cars_data['make'] = cars_data['make'].astype('category').cat.codes + 1
print(cars_data['make'].unique())

[ 8 68 61 59 54 67 52 45 27 47 13  7 26 35 20 42 29 62 15 30 65 63 71 17
 69 33 57 51 28 49 40 34 43 10 39  3 64 22 48 37 36 66 14 46  6 60 38  1
 19 21 41 25  2 70 50 44 58  4 56 11 55 24 53 12 32 31 16  9 18 23  5]


In [18]:
# Automatic way: Assigns 1, 2, 3... to the 'model' column
cars_data['model'] = cars_data['model'].astype('category').cat.codes + 1
print(cars_data['model'].unique())

[ 34 395 323 507  33 135 675 815  27 737 508 300 548 105 563 321 409 678
 275 561 297 278 158 396   5 133 338 546 250 146 433 540  36 834 117  63
 150 401 730 547   3 387 474 102 744 767 399 334 774 808 833 835 612   4
 558 231 333 429 177 163 693 187  17 587 696 461 672  13 683 362 108 303
 824 510 239 144 577 825 823 109 106 363 580 725 134 125 583 667 586 482
 797 784 782 698 773 462 455 355 120 602 716 329 497 729 110 649 707 588
 553  61 589 241 555 141 538 122 519 437 829 405  14 670 436 826 809 626
 766 286 225 242 185 615 677 575 220 814 181 238 509 747 447 685  55 316
 417 801 556 245 301 305 609 795 604 207 328 130 317 576 828 536  15  16
 104 292 232 416 298 479 579 516  40 644 741  67 352  95  49  68 710 299
 169 170 101 646 365 764 754 107 318 760 398 434 168 223 746 720 714 217
 280 734 246 340  56 610 686  96 281   2 560 369 647 403 247 817 459 253
  20 176 260 111 668 601 307 229 381 794 457 218 131 680  30 426 756 755
 713 651 712 366 526 404 765 648 127 276 676 757  2

In [19]:
# 1. Fuel Mapping
fuel_mapping = {
    'Diesel': 1, 'Gasoline': 2, 'Electric/Gasoline': 3, '-/- (Fuel)': 4, 'Electric': 5,
    'Electric/Diesel': 6, 'CNG': 7, 'LPG': 8, 'Others': 9, 'Hydrogen': 10, 'Ethanol': 11
}
cars_data.replace({'fuel': fuel_mapping}, inplace=True)

# 2. Gear Mapping
gear_mapping = {
    'Manual': 1, 'Automatic': 2, 'Semi-automatic': 3
}
cars_data.replace({'gear': gear_mapping}, inplace=True)

# 3. Offer Type Mapping
offer_mapping = {
    'Used': 1, 'Demonstration': 2, "Employee's car": 3, 'Pre-registered': 4, 'New': 5
}
cars_data.replace({'offerType': offer_mapping}, inplace=True)

# Verify the changes
print("Fuel types:", cars_data['fuel'].unique())
print("Gears:", cars_data['gear'].unique())
print("Offers:", cars_data['offerType'].unique())

Fuel types: [ 1  2  3  4  5  6  7  8  9 10 11]
Gears: [1 2 3]
Offers: [1 2 3 4 5]


  cars_data.replace({'fuel': fuel_mapping}, inplace=True)
  cars_data.replace({'gear': gear_mapping}, inplace=True)
  cars_data.replace({'offerType': offer_mapping}, inplace=True)


In [20]:
for col in cars_data.columns:
    print('Unique values of ' + col)
    print(cars_data[col].unique())
    print("======================")

Unique values of mileage
[235000  92800 149300 ...   7612    550    837]
Unique values of make
[ 8 68 61 59 54 67 52 45 27 47 13  7 26 35 20 42 29 62 15 30 65 63 71 17
 69 33 57 51 28 49 40 34 43 10 39  3 64 22 48 37 36 66 14 46  6 60 38  1
 19 21 41 25  2 70 50 44 58  4 56 11 55 24 53 12 32 31 16  9 18 23  5]
Unique values of model
[ 34 395 323 507  33 135 675 815  27 737 508 300 548 105 563 321 409 678
 275 561 297 278 158 396   5 133 338 546 250 146 433 540  36 834 117  63
 150 401 730 547   3 387 474 102 744 767 399 334 774 808 833 835 612   4
 558 231 333 429 177 163 693 187  17 587 696 461 672  13 683 362 108 303
 824 510 239 144 577 825 823 109 106 363 580 725 134 125 583 667 586 482
 797 784 782 698 773 462 455 355 120 602 716 329 497 729 110 649 707 588
 553  61 589 241 555 141 538 122 519 437 829 405  14 670 436 826 809 626
 766 286 225 242 185 615 677 575 220 814 181 238 509 747 447 685  55 316
 417 801 556 245 301 305 609 795 604 207 328 130 317 576 828 536  15  16
 104 292

In [21]:
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43947 entries, 0 to 46399
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   mileage    43947 non-null  int64  
 1   make       43947 non-null  int8   
 2   model      43947 non-null  int16  
 3   fuel       43947 non-null  int64  
 4   gear       43947 non-null  int64  
 5   offerType  43947 non-null  int64  
 6   price      43947 non-null  int64  
 7   hp         43947 non-null  float64
 8   year       43947 non-null  int64  
dtypes: float64(1), int16(1), int64(6), int8(1)
memory usage: 2.8 MB


In [22]:
cars_data

Unnamed: 0,mileage,make,model,fuel,gear,offerType,price,hp,year
0,235000,8,34,1,1,1,6800,116.0,2011
1,92800,68,395,2,1,1,6877,122.0,2011
2,149300,61,323,2,1,1,6900,160.0,2011
3,96200,59,507,2,1,1,6950,110.0,2011
4,156000,54,33,2,1,1,6950,156.0,2011
...,...,...,...,...,...,...,...,...,...
46394,10,15,179,2,1,4,12340,72.0,2021
46396,99,26,55,3,1,4,12490,71.0,2021
46397,550,26,55,3,1,2,12805,69.0,2021
46398,837,26,553,3,1,2,12805,69.0,2021


In [24]:
for col in cars_data.columns:
    print('------------')
    print(col)
    print(cars_data[col].unique())

------------
mileage
[235000  92800 149300 ...   7612    550    837]
------------
make
[ 8 68 61 59 54 67 52 45 27 47 13  7 26 35 20 42 29 62 15 30 65 63 71 17
 69 33 57 51 28 49 40 34 43 10 39  3 64 22 48 37 36 66 14 46  6 60 38  1
 19 21 41 25  2 70 50 44 58  4 56 11 55 24 53 12 32 31 16  9 18 23  5]
------------
model
[ 34 395 323 507  33 135 675 815  27 737 508 300 548 105 563 321 409 678
 275 561 297 278 158 396   5 133 338 546 250 146 433 540  36 834 117  63
 150 401 730 547   3 387 474 102 744 767 399 334 774 808 833 835 612   4
 558 231 333 429 177 163 693 187  17 587 696 461 672  13 683 362 108 303
 824 510 239 144 577 825 823 109 106 363 580 725 134 125 583 667 586 482
 797 784 782 698 773 462 455 355 120 602 716 329 497 729 110 649 707 588
 553  61 589 241 555 141 538 122 519 437 829 405  14 670 436 826 809 626
 766 286 225 242 185 615 677 575 220 814 181 238 509 747 447 685  55 316
 417 801 556 245 301 305 609 795 604 207 328 130 317 576 828 536  15  16
 104 292 232 416 298

In [25]:
cars_data.isnull().sum()

mileage      0
make         0
model        0
fuel         0
gear         0
offerType    0
price        0
hp           0
year         0
dtype: int64

In [26]:
input_data = cars_data.drop(columns=['price'])
output_data =cars_data['price']
x_train, x_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2)

In [29]:
#model Creation
model = LinearRegression()

In [30]:
#Train MOdel
model.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [31]:
predict = model.predict(x_test)

In [32]:
predict

array([16531.63927092,  6227.59733183, 17799.07052729, ...,
       11714.71323727, 24568.83390447,  7075.94948649], shape=(8790,))

In [33]:
x_train.head(1)

Unnamed: 0,mileage,make,model,fuel,gear,offerType,hp,year
6168,100200,27,338,2,1,1,125.0,2013


In [34]:
input_data_model = pd.DataFrame(
    [[5, 120, 2022, 12000, 100.6, 1, 1, 1]],
    columns=['mileage','make', 'model','fuel', 'gear', 'offerType', 'hp', 'year' ]
)

In [35]:
input_data_model

Unnamed: 0,mileage,make,model,fuel,gear,offerType,hp,year
0,5,120,2022,12000,100.6,1,1,1


In [36]:
model.predict(input_data_model)

array([-2539518.46531392])

In [37]:
model.predict(input_data_model)

array([-2539518.46531392])

In [38]:
import pickle as pk

In [39]:
pk.dump(model,open('model.pkl','wb'))