In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the figures created below.
mpl.style.use('ggplot')

In [None]:
# Load the raw Quikr listings and take a first look at their size, schema and rows.
car=pd.read_csv('../dataset/quikr_car.csv')
print(car.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
car.info()
car.head()

## Data Set Quality Issues:

- Inconsistent naming conventions for entries.
- Company names are appended to individual names.
- Some names contain spam-like elements, such as 'Maruti Ertiga showroom condition with' or 'Well-maintained Tata Sumo.'
- Company field includes entries like 'Used' and 'URGENT' which are not actual company names.
- Year field contains non-year values and is currently stored as an object; it should be converted to integers.
- 'Ask for Price' is present in the Price field.
- Price values contain commas and are stored as objects.
- kms_driven field has a mix of object values with 'kms' at the end, and it includes NaN values. Additionally, two rows contain 'Petrol.'
- fuel_type field has NaN values.
- Planned fix for the names: keep only the first three words of each name.

## Cleaning Data 

### 1. year has many non-year values

In [None]:
# Keep only the rows whose 'year' string consists solely of digits.
year_is_numeric = car['year'].str.isnumeric()
car = car.loc[year_is_numeric]

### 2. year is in object. Change to integer

In [None]:
# Cast 'year' from object to int.
# `car` is a filtered selection at this point, and assigning a column on it raises
# SettingWithCopyWarning; assign() returns a new frame with the converted column instead.
car = car.assign(year=car['year'].astype(int))

### 3. Price has Ask for Price

In [None]:
# Drop the listings that carry the 'Ask For Price' placeholder instead of a price.
has_price = car['Price'].ne('Ask For Price')
car = car.loc[has_price]

### 4. Price has commas in its prices and is in object

In [None]:
# Strip the thousands separators from 'Price' and convert it to int.
# regex=False makes the comma a literal replacement regardless of the pandas default;
# assign() avoids the SettingWithCopyWarning raised by assigning a column on a filtered frame.
car = car.assign(Price=car['Price'].str.replace(',', '', regex=False).astype(int))

### 5. kms_driven holds object values with a trailing 'kms' suffix

In [None]:
# Keep the first whitespace-separated token of 'kms_driven' (dropping whatever follows it)
# and remove the thousands separators. The values are still strings after this step.
# assign() avoids the SettingWithCopyWarning raised by assigning a column on a filtered frame.
car = car.assign(
    kms_driven=car['kms_driven'].str.split().str.get(0).str.replace(',', '', regex=False)
)

### 6. It has nan values and two rows have 'Petrol' in them

In [None]:
# Only keep the rows whose kms_driven value is purely numeric.
# str.isnumeric() yields NaN for missing entries, and boolean indexing rejects a mask
# that contains NaN. eq(True) turns those entries into False, so rows with a missing
# kms_driven are dropped together with the non-numeric ones.
car=car[car['kms_driven'].str.isnumeric().eq(True)]

In [None]:
# Cast the cleaned 'kms_driven' strings to int.
# assign() avoids the SettingWithCopyWarning raised by assigning a column on a filtered frame.
car = car.assign(kms_driven=car['kms_driven'].astype(int))

### 7. fuel_type has nan values

In [None]:
# Keep only the rows that have a fuel_type value.
fuel_type_present = car['fuel_type'].notna()
car = car.loc[fuel_type_present]

In [None]:
# Show (rows, columns) of the frame after the filtering steps above.
car.shape

## The rows with spam in the name and company fields were already removed by the previous cleaning steps.

### 8. Company does not need any cleaning now. Changing car names. Keeping only the first three words

In [None]:
# Reduce every car name to its first three whitespace-separated words.
# assign() avoids the SettingWithCopyWarning raised by assigning a column on a filtered frame.
car = car.assign(
    name=car['name'].str.split().str.slice(start=0, stop=3).str.join(' ')
)

In [None]:
# Display the frame after the name clean-up.
car

### 9. Resetting the index of the final cleaned data

In [None]:
# The filters above removed rows, so the index has gaps. Replace it with a fresh
# 0..n-1 index; drop=True discards the old index instead of keeping it as a column.
car=car.reset_index(drop=True)

## Cleaned Data

In [None]:
# Display the cleaned frame with its new index.
car

In [None]:
# Persist the cleaned data. index=False keeps the meaningless 0..n-1 index out of the
# file (otherwise it is written as an extra unnamed first column), matching the other
# CSV export in this notebook.
car.to_csv('Cleaned_Car_data.csv', index=False)
# Confirm the final dtypes and non-null counts.
car.info()

In [None]:
# Summary statistics for every column; include='all' covers the non-numeric columns too.
car.describe(include='all')

### An anomaly is identified in the dataset: about 75% of the cars are priced at or below roughly 5-6 lakhs, yet the maximum price is 85 lakhs, which points to an outlier.

In [None]:
# List the cars priced above 60 lakhs to inspect the potential outliers.
above_60_lakh = car['Price'].gt(6e6)
car.loc[above_60_lakh]

### Removing the Outlier by keeping the cars which have a value under 60 lakhs.

In [None]:
# Retain only the cars priced strictly below 60 lakhs and renumber the rows.
below_60_lakh = car['Price'].lt(6e6)
car = car.loc[below_60_lakh].reset_index(drop=True)
car

In [None]:
# Write the frame (after the price filter above) to CSV without the index column.
car.to_csv('new_file.csv', index=False)

### Checking relationship of Company with Price

In [None]:
# Distinct company values remaining after cleaning.
car['company'].unique()

In [None]:
import seaborn as sns

In [None]:
# Box plot of Price per company, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(data=car, x='company', y='Price', ax=ax)
# Tilt the company labels so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')
plt.show()

### Checking relationship of Year with Price

In [None]:
# Swarm plot of Price per year, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(20, 10))
sns.swarmplot(data=car, x='year', y='Price', ax=ax)
# Tilt the year labels so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')
plt.show()

### Checking relationship of kms_driven with Price

In [None]:
# Relational plot of Price against kms_driven (relplot's default kind is used).
sns.relplot(x='kms_driven',y='Price',data=car,height=7,aspect=1.5)

### Checking relationship of Fuel Type with Price

In [None]:
# Box plot of Price per fuel type, drawn on an explicitly created axes.
fuel_fig, fuel_ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=car, x='fuel_type', y='Price', ax=fuel_ax)

### Relationship of Price with FuelType, Year and Company mixed

In [None]:
# relplot returns a FacetGrid (not an Axes), so the result is named accordingly.
# Colour encodes fuel_type and marker size encodes year.
facet_grid = sns.relplot(
    data=car,
    x='company',
    y='Price',
    hue='fuel_type',
    size='year',
    height=7,
    aspect=2,
)
# Tilt the company labels so they do not overlap.
facet_grid.set_xticklabels(rotation=40, ha='right')

### Extracting Training Data

In [None]:
# Price is the prediction target; every other column is a feature.
target_column = 'Price'
y = car[target_column]
X = car.drop(columns=target_column)

In [None]:
# Print the target's shape, then display the feature frame as the cell output.
print(y.shape)
X

### Applying Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes this split, and the
# R2 score computed from it below, reproducible across kernel restarts.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

### Creating a OneHotEncoder object that contains all the possible categories

In [None]:
# Fit an encoder on the categorical columns of the full feature frame so that it
# learns every category; the fitted encoder is shown as the cell output.
ohe = OneHotEncoder()
categorical_features = X[['name','company','fuel_type']]
ohe.fit(categorical_features)

In [None]:
# One-hot encoder categories to be passed into the pipeline.
ohe.categories_

### Creating a column transformer to transform categorical columns

In [None]:
# One-hot encode the categorical columns using the categories learned by `ohe`;
# every remaining column is passed through unchanged.
categorical_columns = ['name','company','fuel_type']
category_encoder = OneHotEncoder(categories=ohe.categories_)
column_trans = make_column_transformer(
    (category_encoder, categorical_columns),
    remainder='passthrough',
)

### Random Forest Regression Model

In [None]:
# Display the column transformer built above.
column_trans

In [None]:
# NOTE(review): despite the name `lr`, this is a random forest regressor, not a linear
# regression. 100 trees, with a fixed random_state.
lr = RandomForestRegressor(n_estimators=100, random_state=42)

### Making a pipeline

In [None]:
# The pipeline first applies the column transformer (one-hot encoding of the categorical
# columns) and then passes the result to the regressor held in `lr`
# (a RandomForestRegressor, not a linear regression).
pipe=make_pipeline(column_trans,lr)

### Fitting the  model

In [None]:
# Fit the whole pipeline (encoding + regressor) on the training split.
pipe.fit(X_train,y_train)

In [None]:
# Predict prices for the held-out rows.
y_pred=pipe.predict(X_test)

### Checking R2 Score

In [None]:
# R2 of the predictions against the held-out targets.
r2_score(y_test,y_pred)

### Finding the model with a random state of TrainTestSplit where the model gives maximum r2_score

In [None]:
# Refit the pipeline on 1000 different 90/10 splits (seeds 0..999) and record the test
# R2 of each one. scores[i] is the score for random_state=i.
# NOTE(review): this fits 1000 forests of 100 trees each, so the cell is slow to re-run.
# NOTE(review): picking the split by its test-set score yields an optimistic score;
# consider cross-validation for an unbiased estimate.
scores=[]
for i in range(1000):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
    lr = RandomForestRegressor(n_estimators=100, random_state=42)
    pipe=make_pipeline(column_trans,lr)
    pipe.fit(X_train,y_train)
    y_pred=pipe.predict(X_test)
    scores.append(r2_score(y_test,y_pred))
# After the loop, X_train/X_test/y_train/y_test, lr, pipe and y_pred hold the values of
# the LAST iteration (seed 999), not of the best one.
# Index of the highest score, which is also the seed that produced it.
np.argmax(scores)

In [None]:
# Look up the highest R2 score via the index (= seed) at which it was recorded.
best_seed = np.argmax(scores)
scores[best_seed]

In [None]:
# Predict the price of a single hand-written listing.
# The row is passed as a plain list so pandas keeps year and kms_driven as integers;
# wrapping the mixed values in np.array() would coerce all five of them to strings.
pipe.predict(
    pd.DataFrame(
        [['Maruti Suzuki Swift', 'Maruti', 2019, 100, 'Petrol']],
        columns=X_test.columns,
    )
)

### Refitting the random forest model on the train/test split (random state) that gave the best R2 score

In [None]:
# Recreate the split whose seed achieved the highest recorded R2 score and refit on it.
best_seed = np.argmax(scores)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=best_seed
)
lr = RandomForestRegressor(n_estimators=100, random_state=42)
# fit() returns the fitted pipeline itself, so construction and fitting can be chained.
pipe = make_pipeline(column_trans, lr).fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

In [None]:
import pickle

# Persist the fitted pipeline. The context manager closes the file handle even if
# pickle.dump() raises, which a bare open() call does not guarantee.
# NOTE(review): the file name says "LinearRegression", but `pipe` wraps a
# RandomForestRegressor; the name is kept unchanged for whatever loads this file.
with open('LinearRegressionModel.pkl','wb') as model_file:
    pickle.dump(pipe, model_file)

# Sanity-check the saved pipeline on a single hand-written listing. The row is a plain
# list so year and kms_driven stay integers; np.array() would coerce all values to strings.
pipe.predict(
    pd.DataFrame(
        [['Maruti Suzuki Swift', 'Maruti', 2019, 100, 'Petrol']],
        columns=['name','company','year','kms_driven','fuel_type'],
    )
)

In [None]:
# Walk the pipeline step by step: first step -> its first transformer -> the category
# list configured for the first encoded column.
column_step = pipe.steps[0][1]
name_encoder = column_step.transformers[0][1]
name_encoder.categories[0]

In [None]:
import pickle

# Persist the fitted pipeline under a second file name. The context manager closes the
# file handle even if pickle.dump() raises, which a bare open() call does not guarantee.
with open('RandomForestModel.pkl','wb') as model_file:
    pickle.dump(pipe, model_file)

# Sanity-check the saved pipeline on a single hand-written listing. The row is a plain
# list so year and kms_driven stay integers; np.array() would coerce all values to strings.
pipe.predict(
    pd.DataFrame(
        [['Maruti Suzuki Swift', 'Maruti', 2019, 100, 'Petrol']],
        columns=['name','company','year','kms_driven','fuel_type'],
    )
)