In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy import stats
from matplotlib.dates import * 
from matplotlib.ticker import * 
from sklearn import metrics 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

In [None]:
# Read the car listings from the Kaggle input folder and preview the frame.
csv_path = '/kaggle/input/car-data/cardata.csv'
ds = pd.read_csv(csv_path)
df = pd.DataFrame(data=ds)
df

In [None]:
# Summary statistics of the frame's columns, as a first sanity check of the data.
df.describe()

In [None]:
# Replace the Year column with the car's age. The reference year is one past the
# newest model year in the data (the original note gives it as 2019), so the
# newest car gets age 1.
# The maximum is computed once and the subtraction is vectorised; the previous
# lambda recomputed max(df.Year) for every row.
reference_year = df['Year'].max() + 1
df['Year'] = reference_year - df['Year']
df = df.rename(columns={'Year': 'Age'})
df

In [None]:
# Selling_Price against Present_Price, to spot points far away from the rest.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Present_Price', y='Selling_Price', ax=ax)

In [None]:
# From the chart above we can see that one data point lies far away from the others:
# its present price differs hugely from the rest, so it is an outlier.

In [None]:
# Show the rows whose selling price exceeds 30.
df.loc[df['Selling_Price'].gt(30)]

In [None]:
# Remove the outlier found above (row label 86) so it does not distort the linear model.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because label 86 has already been dropped.
df = df.drop(index=[86], errors='ignore')

In [None]:
# Selling_Price against Kms_Driven, to look for unusual mileage values.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Kms_Driven', y='Selling_Price', ax=ax)

In [None]:
# The chart above shows one data point far away from the others:
# most Kms_Driven values are in the range (0, 200000), but one car has a value of 500000.
# This is again an outlier, so we have to remove it from our linear model.

In [None]:
# Show the rows with more than 400000 km driven.
df.loc[df['Kms_Driven'].gt(400000)]

In [None]:
# Remove the mileage outlier found above (row label 196).
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because label 196 has already been dropped.
df = df.drop(index=[196], errors='ignore')

In [None]:
# Same Kms_Driven / Selling_Price scatter, now without the removed outlier.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Kms_Driven', y='Selling_Price', ax=ax)

In [None]:
# Line plot of Selling_Price against Kms_Driven.
# The rows are sorted by Kms_Driven as a whole so that every mileage stays
# paired with its own selling price. Sorting only the x values (as this cell
# did before) paired each price with an unrelated mileage.
kms_sorted = df.sort_values('Kms_Driven')
plt.figure(figsize=(10,8))
plt.plot(kms_sorted['Kms_Driven'], kms_sorted['Selling_Price'], color='blue')
plt.xlabel('Kms_Driven')
plt.ylabel('Selling_Price')
plt.grid()

In [None]:
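# From the chart above, the relationship between the Kms_Driven values and the selling price looks sinusoidal;
# this will help us in our linear model.
# NOTE(review): confirm this pattern with each mileage paired with its own row's selling price.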

In [None]:
# Present_Price vs Selling_Price, coloured by fuel type.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Present_Price', y='Selling_Price', hue='Fuel_Type', ax=ax)

In [None]:
# From the chart above we can see that diesel cars are priced higher than petrol cars.


In [None]:
# Present_Price vs Selling_Price, coloured by seller type.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Present_Price', y='Selling_Price', hue='Seller_Type', ax=ax)

In [None]:
# From the chart above we can see that cars sold by dealers are more expensive than
# cars sold by individuals.

In [None]:
# Present_Price vs Selling_Price, coloured by transmission.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Present_Price', y='Selling_Price', hue='Transmission', ax=ax)

In [None]:
# From the chart above we can see that automatic cars are more expensive than manual cars.

In [None]:
# Present_Price vs Selling_Price, coloured by the Owner column.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Present_Price', y='Selling_Price', hue='Owner', ax=ax)

In [None]:
# Two-dimensional histogram of Selling_Price per fuel type.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(data=df, x='Fuel_Type', y='Selling_Price', ax=ax)

In [None]:
# Two-dimensional histogram of Selling_Price per seller type.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(data=df, x='Seller_Type', y='Selling_Price', ax=ax)

In [None]:
# Two-dimensional histogram of Selling_Price per transmission type.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(data=df, x='Transmission', y='Selling_Price', ax=ax)

In [None]:
# Convert the categorical features to numbers.
# The codes are ordered by how positively each category affects the target,
# based on the charts above:
#   * Fuel_Type: diesel cars are the most expensive, so Diesel gets the highest
#     code (2). The average prices of CNG and petrol cars are similar, but petrol
#     is a little higher, so Petrol is 1 and CNG is 0.8.
#   * Seller_Type: cars sold by dealers are more expensive than cars sold by
#     individuals, so Individual is 1 and Dealer is 2.
#   * Transmission: automatic cars are more expensive than manual cars, so
#     Manual is 1 and Automatic is 2.
fuel_codes = {"Petrol": 1, "CNG": 0.8, "Diesel": 2}
seller_codes = {"Dealer": 2, "Individual": 1}
transmission_codes = {"Manual": 1, "Automatic": 2}

# Assign the result back instead of calling replace(..., inplace=True) on
# df[column]: the chained inplace form is deprecated and no longer updates df
# once pandas Copy-on-Write is active.
# pd.to_numeric guarantees a numeric dtype and raises if a category without a
# code is left over. Re-running the cell is harmless because already-encoded
# values are not keys of the mappings.
for column, codes in (
    ('Fuel_Type', fuel_codes),
    ('Seller_Type', seller_codes),
    ('Transmission', transmission_codes),
):
    df[column] = pd.to_numeric(df[column].replace(codes))

In [None]:
# Correlation of every feature with the target (Selling_Price), strongest positive first.
# Only numeric columns are used: Car_Name is text, and DataFrame.corr() raises
# on non-numeric columns in pandas >= 2.0.
numeric_df = df.select_dtypes(include='number')
target_corr = numeric_df.corr()[['Selling_Price']].sort_values(by='Selling_Price', ascending=False)
sns.heatmap(target_corr, annot=True, cmap='RdBu', vmin=-1, vmax=1)
plt.tight_layout()

In [None]:
# From the correlation heat map we can conclude that the present price has a direct effect on the
# target and is the most important feature. We can also see that age has a negative effect on the
# target, so the older the car, the lower its price is going to be.

In [None]:
# Add two engineered features: 'dot features ' and 'sin kms drive'.
#
# 'dot features ' merges the categorical features: its value is the product of
# the encoded 'Fuel_Type', 'Seller_Type' and 'Transmission' columns. The logic:
# a diesel car is priced higher than a petrol one, and an automatic car higher
# than a manual one. If one car is both diesel and automatic and another is both
# petrol and manual, the price gap between them is larger than when comparing a
# single feature, so there is an additional combined effect. The product
# captures it: a petrol / individual / manual car gets 1, while a
# diesel / dealer / automatic car gets 8, reflecting all three features at once.
#
# 'sin kms drive' is added because the Kms_Driven chart above suggested a
# sinusoidal relation between Kms_Driven and the target.
# NOTE(review): Kms_Driven appears to hold whole numbers, and sin(pi * k) is
# mathematically 0 for every integer k, so this column may contain only
# floating-point rounding noise. Confirm the intended scale of the argument.
df2 = df.copy()
# Column-wise product replaces the nested Python loops; the multiplication
# order (fuel * seller * transmission) is unchanged.
df2['dot features '] = df['Fuel_Type'] * df['Seller_Type'] * df['Transmission']
df2['sin kms drive'] = np.sin(np.pi * df2['Kms_Driven'])

In [None]:
# Now we normalize our features and scale them to the range (0, 1).
# The commented-out code below can also be used to normalize our features.
# We also change the order of our features and put the target in the last column of the dataframe.
# We also remove the car name column from our linear model.

# scaler=MinMaxScaler(feature_range=(0,1))
# df2=pd.DataFrame(df2,columns=['Age', 'Present_Price', 'Kms_Driven',
#        'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner', 'dot features ',
#        'sin kms drive','Selling_Price'])
# df2[['Age', 'Present_Price', 'Kms_Driven',
#        'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner', 'dot features ',
#        'sin kms drive']]=scaler.fit_transform(df2[['Age', 'Present_Price', 'Kms_Driven',
#        'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner', 'dot features ',
#        'sin kms drive']].values)

# Min-max scale every feature column (everything except the target and the
# car name). MinMaxScaler already scales each column independently, so a single
# fit_transform over all feature columns replaces the per-column loop.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so the test rows take part in choosing the scaling range.
scaler = MinMaxScaler()
scaled_columns = [col for col in df2.columns if col not in ('Selling_Price', 'Car_Name')]
df2[scaled_columns] = scaler.fit_transform(df2[scaled_columns])
df2

In [None]:
# Feature matrix x and target vector y for the linear regression model below.
model_features = [
    'Age', 'Present_Price', 'Kms_Driven',
    'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner',
    'dot features ', 'sin kms drive',
]
x = df2.reindex(columns=model_features)
y = df2.loc[:, 'Selling_Price']

In [None]:
# The target is a price, which cannot be negative, so this subclass overrides
# predict() to turn negative predictions into 0.
# NOTE: the subclass reuses the name LinearRegression, so after this cell runs
# the name refers to this class; re-running the cell subclasses the previous
# definition again.
class LinearRegression(LinearRegression):
    """Linear regression whose predictions are floored at zero.

    No __init__ is defined, so the constructor of the parent class is used
    unchanged; calling LinearRegression() with no arguments works as before.
    """

    def predict(self, b):
        """Return the parent's predictions for `b` with negative values replaced by 0.

        np.maximum works element-wise, so it covers every element the previous
        Python loop visited and also works if the parent returns a 2-D array.
        """
        return np.maximum(super().predict(b), 0)

In [None]:
# Hold out 10% of the rows as a test set (fixed random_state), fit the model on
# the remaining rows, and show the feature names next to the fitted coefficients.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=0,
)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
for fitted_attribute in (regressor.feature_names_in_, regressor.coef_):
    print(fitted_attribute)

In [None]:
# Predict the target for the held-out X_test rows; y_pred is compared with
# y_test in the cells below to judge the model's performance.
y_pred=regressor.predict(X_test)
y_pred

In [None]:
# 10-fold cross-validation on the training split.
# Previously this ran on X_test / y_test, which holds only 10% of the rows, so
# each of the 10 folds was scored on a handful of samples. Using the training
# split also keeps the test set untouched for the final evaluation below.
scores = cross_val_score(regressor, X_train, y_train, cv=10)
scores

In [None]:
# Test-set error: mean squared error, then the R2 score as a percentage.
test_mse = metrics.mean_squared_error(y_test, y_pred)
test_r2_percent = metrics.r2_score(y_test, y_pred) * 100
print(test_mse)
print(test_r2_percent)

In [None]:
# We can see that our R2 score is pretty good. Let's look at scatter plots of X_test against
# both y_test and y_pred for two features: present price and kms driven.

In [None]:
# True (blue) and predicted (red) selling prices against the Present_Price feature of the test set.
fig, ax = plt.subplots(figsize=(10, 8))
present_price_test = X_test['Present_Price']
ax.scatter(present_price_test, y_test, color='blue')
ax.scatter(present_price_test, y_pred, color='red')
ax.legend(['True', 'Pred'])
ax.set_xlabel('present price ')
ax.set_ylabel('selling price')
ax.set_title('model evaluation (graph)')
ax.grid()

In [None]:
# True (blue) and predicted (red) selling prices against the Kms_Driven feature of the test set.
# The x axis was previously labelled 'present price ' (copied from the cell
# above) although Kms_Driven is plotted, and the y label was misspelled.
plt.figure(figsize=(10,8))
plt.scatter(X_test['Kms_Driven'],y_test,color='blue')
plt.scatter(X_test['Kms_Driven'],y_pred,color='red')
plt.legend(['True','Pred'])
plt.xlabel('kms driven')
plt.ylabel('selling price')
plt.title('model evaluation (graph)')
plt.grid()

In [None]:
# From the charts above and the R2 score we can conclude that our model's performance is good. All that
# is left is to train our model on all the data to predict the price for given input feature values.

In [None]:
###########################################################final result#####################################################

In [None]:
# This function appends the input row to df and then, after normalization, predicts and returns
# the price of the car.
def f():
    '''Predict the selling price of the car described by the notebook variables x1..x7.

    The function takes no arguments. It reads these module-level variables,
    which must be defined before it is called:
        x1 -> Age, x2 -> Present_Price, x3 -> Kms_Driven,
        x4 -> Fuel_Type ("Petrol", "CNG" or "Diesel"),
        x5 -> Seller_Type ("Dealer" or "Individual"),
        x6 -> Transmission ("Manual" or "Automatic"),
        x7 -> Owner.

    Steps: reload the CSV, rebuild the same features as the notebook above,
    append the input as one extra row, min-max scale all feature columns, train
    the regression on every original row and predict the appended row.

    Returns:
        The array returned by LinearRegression.predict for the single input
        row; f()[0] is the predicted price.
    '''
    ds = pd.read_csv('/kaggle/input/car-data/cardata.csv')
    df = pd.DataFrame(ds)

    # Turn the model year into an age (newest car -> 1), then drop the name
    # column and the two outlier rows removed earlier in the notebook.
    df['Year'] = df['Year'].max() + 1 - df['Year']
    df = (
        df.rename(columns={'Year': 'Age'})
        .drop(columns='Car_Name')
        .drop(index=[86, 196])
    )

    # Append the input as an extra row under a fresh index label. Its
    # Selling_Price is a placeholder 0 that is never used for training.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    input_index = df.index.max() + 1
    input_row = pd.DataFrame(
        [[x1, 0, x2, x3, x4, x5, x6, x7]],
        columns=df.columns,
        index=[input_index],
    )
    df = pd.concat([df, input_row])

    # Encode the categorical features with the same codes as the notebook.
    # The result is assigned back because the chained
    # df[col].replace(..., inplace=True) form is deprecated and stops updating
    # df under pandas Copy-on-Write. pd.to_numeric raises if an unknown
    # category (for example a misspelled input) is left over.
    encodings = {
        'Fuel_Type': {"Petrol": 1, "CNG": 0.8, "Diesel": 2},
        'Seller_Type': {"Dealer": 2, "Individual": 1},
        'Transmission': {"Manual": 1, "Automatic": 2},
    }
    for column, codes in encodings.items():
        df[column] = pd.to_numeric(df[column].replace(codes))

    # Engineered features: product of the three encoded categoricals and
    # sin(pi * Kms_Driven).
    df2 = df.copy()
    df2['dot features'] = df['Fuel_Type'] * df['Seller_Type'] * df['Transmission']
    df2['sin kms driven'] = np.sin(np.pi * df2['Kms_Driven'])

    # Every column except the target is a feature.
    feature_cols = [col for col in df2.columns if col != 'Selling_Price']

    # Scale each feature column to (0, 1). The input row is scaled together
    # with the training rows, as before.
    df2[feature_cols] = MinMaxScaler().fit_transform(df2[feature_cols])

    # Separate the scaled input row from the training rows. Selecting by
    # column name fixes the old a.remove(a[1]): list.remove deletes the first
    # element EQUAL to the placeholder 0, so a scaled Age of exactly 0.0 was
    # removed in place of the Selling_Price placeholder.
    input_features = df2.loc[[input_index], feature_cols]
    training = df2.drop(index=input_index)

    # Train on all original rows and predict the input row. Passing a
    # DataFrame keeps the feature names consistent between fit and predict.
    regressor = LinearRegression()
    regressor.fit(training[feature_cols], training['Selling_Price'])
    y_pred = regressor.predict(input_features)

    return y_pred

In [None]:
# x1=float(input('Age: ')) #10
# x2=float(input('Present Price: '))  #11.23
# x3=float(input('KMS Driven: '))  #42000
# x4=input('Fuel Type: ')  #Petrol
# x5=input('Seller Type: ')  #Dealer
# x6=input('Transmisson: ') #Manual
# x7=float(input('Owner: '))  #1
# Example car used as input for f(); the commented-out lines above read the same values interactively.
x1, x2, x3 = 10, 11.23, 42000              # Age, Present Price, KMS Driven
x4, x5, x6 = 'Petrol', 'Dealer', 'Manual'  # Fuel Type, Seller Type, Transmission
x7 = 1                                     # Owner

In [None]:
# Predicted selling price for the example inputs defined above.
predicted_prices = f()
print(predicted_prices[0])