In [1]:
import pandas as pd 
import numpy as np
import streamlit as st 
import seaborn as sns

In [8]:
df=pd.read_csv('new_data_cars.csv')

In [9]:
df.head()

Unnamed: 0,manufacturer,model,year,mileage,engine,transmission,drivetrain,fuel_type,mpg,exterior_color,interior_color,accidents_or_damage,one_owner,personal_use_only,seller_rating,driver_rating,driver_reviews_num,price,age
0,Acura,Ilx Hybrid 1.5L,2013,53422.0,1.5L I4 8V Mpfi Sohc Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,38.5,Bellanova White Pearl,Ebony,0.0,1.0,1.0,4.3,4.4,12.0,17000.0,10
1,Acura,Ilx Hybrid 1.5L,2013,62042.0,1.5L I4 8V Mpfi Sohc Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,38.5,Polished Metal Metallic,Ebony,0.0,0.0,1.0,2.2,4.4,12.0,18000.0,10
2,Acura,Ilx Hybrid 1.5L,2013,57212.0,1.5L I4 8V Mpfi Sohc Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,38.5,Silver,Ebony,0.0,1.0,1.0,2.2,4.4,12.0,15999.0,10
3,Acura,Zdx Base,2012,148302.0,3.7L V6 24V Mpfi Sohc,6-Speed Automatic,All-wheel Drive,Gasoline,19.5,Other,Taupe,1.0,0.0,1.0,5.0,4.7,4.0,14491.0,11
4,Acura,Zdx Base,2012,110300.0,3.7L V6 24V Mpfi Sohc,6-Speed Automatic,All-wheel Drive,Gasoline,19.5,Black,Tan,0.0,0.0,1.0,1.8,4.7,4.0,15995.0,11


In [5]:
from category_encoders       import BinaryEncoder
from sklearn.pipeline        import make_pipeline
from sklearn.preprocessing   import StandardScaler,PolynomialFeatures,MinMaxScaler
from sklearn.compose         import ColumnTransformer
from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV
from sklearn.linear_model    import LinearRegression
from sklearn.tree            import DecisionTreeRegressor
from sklearn.ensemble        import RandomForestRegressor
from sklearn.svm             import SVR
from sklearn.neighbors       import kneighbors_graph
from sklearn.metrics         import r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble        import GradientBoostingRegressor
from sklearn.neighbors       import KNeighborsRegressor
from sklearn.decomposition   import PCA

# preprocessing

In [6]:
df.drop(columns=['age','seller_rating','driver_rating','driver_reviews_num'],inplace=True)

In [8]:
X=df.drop(columns='price')
y=df['price']

In [9]:
binary=['manufacturer','model','transmission','drivetrain','fuel_type']                                

In [10]:
for i in df.select_dtypes('O').columns:
    print(i,df[i].nunique())

manufacturer 30
model 7918
engine 726
transmission 50
drivetrain 7
fuel_type 17
exterior_color 613
interior_color 246


In [11]:
transformer = ColumnTransformer(transformers=[
    ('Binary',BinaryEncoder(),df.select_dtypes("O").columns)
    ],remainder='passthrough')
transformer

In [13]:
model = make_pipeline(transformer,StandardScaler(),LinearRegression())
decision = make_pipeline(transformer,StandardScaler(),DecisionTreeRegressor(max_depth=80,splitter='random',min_samples_leaf=20))
svm =make_pipeline(transformer,StandardScaler(),SVR())
gradint = make_pipeline(transformer,StandardScaler(),GradientBoostingRegressor())
random = make_pipeline(transformer,StandardScaler(),RandomForestRegressor(n_estimators=10,max_depth=80))
knn = make_pipeline(transformer,StandardScaler(),KNeighborsRegressor())

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [15]:
model.fit(X,y)


In [16]:
print(f'actual value --> {y[0] }')
print(f'predicted value --> {model.predict(X)[0] } ')
print('-'*40,'\n')
print(f'model score {model.score(X,y) }')
print('-'*40,'\n')
train = model.predict(X_train)
test = model.predict(X_test)

print(f'r2_score train --> {r2_score(y_train,train) }')
print(f'r2_score test -->  {r2_score(y_test,test) } ')

actual value --> 17000.0
predicted value --> 18783.22302742166 
---------------------------------------- 

model score 0.7070845349586035
---------------------------------------- 

r2_score train --> 0.7074237028784742
r2_score test -->  0.7063928345715478 


In [17]:
decision.fit(X,y)


In [18]:
print(f'actual value --> {y[5] }')
print(f'predicted value --> {decision.predict(X)[5] } ')
print('-'*40,'\n')
print(f'model score {decision.score(X,y) }')
print('-'*40,'\n')
train = decision.predict(X_train)
test = decision.predict(X_test)

print(f'r2_score train -->{r2_score(y_train,train) }')
print(f'r2_score test --> {r2_score(y_test,test) } ')

actual value --> 14950.0
predicted value --> 14931.6 
---------------------------------------- 

model score 0.9200953098340031
---------------------------------------- 

r2_score train -->0.9207106267243337
r2_score test --> 0.9188452090206434 


In [19]:
random.fit(X,y)

In [20]:
print(f'actual value --> {y[0] }')
print(f'predicted value --> {random.predict(X)[0] } ')
print('-'*40,'\n')
print(f'model score {random.score(X,y) }')
print('-'*40,'\n')
train = random.predict(X_train)
test = random.predict(X_test)

print(f'r2_score train -->{r2_score(y_train,train) }')
print(f'r2_score test --> {r2_score(y_test,test) } ')

actual value --> 17000.0
predicted value --> 15588.5 
---------------------------------------- 

model score 0.9884026400622691
---------------------------------------- 

r2_score train -->0.988424627201518
r2_score test --> 0.9883578776375349 


In [21]:
import joblib 


In [25]:
joblib.dump(random,'random.pkl')
joblib.dump(decision,'decision.pkl')
joblib.dump(X['manufacturer'].unique(),'manufacturer.pkl')
joblib.dump(X['model'].unique(),'model.pkl')

joblib.dump(X['engine'].unique(),'engine.pkl')
joblib.dump(X['transmission'].unique(),'transmission.pkl')

joblib.dump(X['drivetrain'].unique(),'drivetrain.pkl')
joblib.dump(X['fuel_type'].unique(),'fuel_type.pkl')

joblib.dump(X['exterior_color'].unique(),'exterior_color.pkl')
joblib.dump(X['interior_color'].unique(),'interior_color.pkl')
joblib.dump(X.columns,'columns.pkl')

['columns.pkl']