# Importar os módulos necessários

In [1]:
import warnings
# Install a global "ignore" filter: no category or module is given, so every
# warning raised by later cells is suppressed.
# NOTE(review): this also hides any pandas / scikit-learn warnings that later
# cells may trigger — confirm a blanket filter is really intended.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.metrics import (
        mean_squared_error,
        mean_absolute_error,
        r2_score
)
from sklearn.model_selection import cross_validate, KFold
from sklearn.tree import DecisionTreeRegressor  # decision trees for regression
from sklearn.neural_network import MLPRegressor  # neural networks for regression
from sklearn.svm import SVR  # support vector machines for regression
from sklearn.ensemble import RandomForestRegressor # Random forest regression

# Definir as Métricas para Avaliação dos Modelos

In [3]:
from sklearn.metrics import make_scorer


# Scorers handed to cross_validate, keyed by the short name that appears in
# the result columns. make_scorer is called with its defaults, so each scorer
# reports the plain metric value (errors are not negated).
METRICS = {
    short_name: make_scorer(metric_function)
    for short_name, metric_function in (
        ("mse", mean_squared_error),
        ("mae", mean_absolute_error),
        ("r2", r2_score),
    )
}

# Ler o Conjunto de Dados

In [4]:
# Load the training and the test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("./TrainDataScenario1.csv", "./TestDataScenario1.csv")
)

In [4]:
# Remove every training row that contains at least one missing value.
# The frame is mutated in place, so the unfiltered rows are gone afterwards.
train_data.dropna(inplace=True)

In [5]:
# Report (rows, columns) of the training frame at this point of the run.
train_data.shape

(12912, 66)

In [None]:
# Columns removed from the training frame before the feature list is built.
# NOTE(review): the reason these nine columns are excluded is not recorded in
# this notebook — confirm and document it.
columns_to_discard = [
    '1', '2', 'Convertibles', 'Hyundai', 'Isuzu',
    'Mercedes-Benz', 'Skoda', 'Volvo', 'Wagon',
]
train_data.drop(columns=columns_to_discard, inplace=True)

In [None]:
# Every remaining training column except the target is used as a feature.
# Index.difference returns the names in sorted order by default, so the
# feature order is alphabetical rather than the original column order.
# NOTE(review): any identifier-like column still present in train_data
# (e.g. an 'ID' column) would be picked up as a feature here — confirm.
features = train_data.columns.difference(['resale_price_Lakh'])

In [None]:
# Split the training frame into the feature matrix and the target vector.
X, y = train_data[features], train_data['resale_price_Lakh']

In [None]:
# Select the same feature columns, in the same order, from the test table.
# This raises a KeyError if the test file lacks any of the feature columns.
X_test= test_data[features]

# Definir o Método de Validação Cruzada

In [None]:
# 10-fold cross-validation; rows are shuffled before splitting and the seed is
# fixed so the same folds are produced on every run.
splitter = KFold(n_splits=10, shuffle=True, random_state=1234)

In [None]:
# Display the feature matrix for a visual check.
X

In [None]:
# Display the target vector for a visual check.
y

# Implementação dos Algoritmos de Machine Learning

### Árvores de Decisão

In [None]:
# Decision tree capped at depth 5 (seeded), scored with 10-fold shuffled
# cross-validation on the metrics defined in METRICS.
splitter = KFold(n_splits=10, shuffle=True, random_state=1234)
dt = DecisionTreeRegressor(random_state=1234, max_depth=5)
scores = cross_validate(dt, X, y, scoring=METRICS, cv=splitter)

# Keep the per-fold results and show a one-row summary with the mean of each
# column across the folds.
dt_scores = pd.DataFrame(scores)
dt_scores.mean().to_frame().T

### Redes Neuronais

In [None]:
# Multi-layer perceptron with default hyper-parameters (seeded), scored with
# 10-fold shuffled cross-validation on the metrics defined in METRICS.
# NOTE(review): no scaling step is visible in this notebook; neural networks
# are usually sensitive to feature scale — confirm the inputs are suitable.
splitter = KFold(n_splits=10, shuffle=True, random_state=1234)
mlp = MLPRegressor(random_state=1234)
scores_mlp = cross_validate(mlp, X, y, scoring=METRICS, cv=splitter)

# Keep the per-fold results and show a one-row summary with the mean of each
# column across the folds.
mlp_scores = pd.DataFrame(scores_mlp)
mlp_scores.mean().to_frame().T

### Support Vector Machine

In [None]:
# Support vector regressor with default hyper-parameters, scored with 10-fold
# shuffled cross-validation on the metrics defined in METRICS.
# NOTE(review): no scaling step is visible in this notebook; SVMs are usually
# sensitive to feature scale — confirm the inputs are suitable.
splitter = KFold(n_splits=10, shuffle=True, random_state=1234)
svr = SVR()
scores_svr = cross_validate(svr, X, y, scoring=METRICS, cv=splitter)

# Keep the per-fold results and show a one-row summary with the mean of each
# column across the folds.
svr_scores = pd.DataFrame(scores_svr)
svr_scores.mean().to_frame().T

In [None]:
# Random forest with default hyper-parameters, scored with 10-fold shuffled
# cross-validation on the metrics defined in METRICS.
# The forest is seeded with the same value as the other estimators and the
# splitter: without a seed the bootstrap samples differ on every run, so the
# scores (and the predictions exported from this model) could not be reproduced.
rfr = RandomForestRegressor(random_state=1234)
splitter = KFold(10, random_state=1234, shuffle=True)
scores_rfr = cross_validate(rfr, X, y, cv=splitter, scoring=METRICS)

# Keep the per-fold results and show a one-row summary with the mean of each
# column across the folds.
rfr_scores = pd.DataFrame(scores_rfr)
pd.DataFrame(rfr_scores.mean()).T

In [None]:
# Train the random forest on the complete training set. cross_validate fits
# clones of the estimator, so rfr itself is still unfitted before this call.
rfr.fit(X,y)

In [None]:
# Replace NaN values in X_test with 0.
# X_test was produced by selecting columns from test_data, so filling it in
# place is the chained-assignment pattern pandas warns about; rebinding the
# name to the returned frame gives the same values without mutating a slice.
# NOTE(review): training rows with NaN were dropped while test rows are
# zero-filled — confirm this asymmetry is intended.
X_test = X_test.fillna(0)

In [None]:
# Predict the target for the test rows with the fitted random forest.
previsao = rfr.predict(X_test)

# Wrap the predictions in a single-column frame.
predictions_df = pd.DataFrame(previsao, columns=['prediction'])

# Write the frame to disk; index=True stores the positional row index as the
# first CSV column.
predictions_df.to_csv('predictions.csv', index=True)

In [None]:
# Fit the decision tree, the MLP and the SVR on the complete training set.
# cross_validate works on clones, so these estimators are still unfitted
# before this cell and must be trained before they can predict.
dt.fit(X,y)
mlp.fit(X,y)
svr.fit(X,y)

In [None]:
import matplotlib.pyplot as plt

# Function to create scatter plot for actual vs predicted values
def plot_actual_vs_predicted(model_name, y_true, y_pred):
    """Show a square scatter plot of predicted values against actual values.

    Parameters
    ----------
    model_name : str
        Label placed in front of " - Actual vs Predicted" in the title.
    y_true : array-like
        Observed target values, drawn on the x axis.
    y_pred : array-like
        Predicted values, drawn on the y axis.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    _, axes = plt.subplots(figsize=(8, 8))
    # Semi-transparent markers keep dense regions readable.
    axes.scatter(y_true, y_pred, alpha=0.5)
    axes.set_title(f'{model_name} - Actual vs Predicted')
    axes.set_xlabel('Actual Values')
    axes.set_ylabel('Predicted Values')
    plt.show()

# One scatter plot per fitted model, in the order: decision tree, MLP, SVR,
# random forest. Predictions are made on X, the same data the models were
# fitted on, so the plots show in-sample fit rather than held-out performance.
for model_label, fitted_model in (
    ("Decision Tree Regressor", dt),
    ("MLP Regressor", mlp),
    ("Support Vector Regressor", svr),
    ("Random Forest Regressor", rfr),
):
    plot_actual_vs_predicted(model_label, y, fitted_model.predict(X))

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Function to calculate REC curve
def calculate_rec_curve(y_true, y_pred, tolerance_values):
    """Compute the points of a Regression Error Characteristic (REC) curve.

    For every tolerance, the result holds the percentage of samples whose
    absolute error ``|y_true - y_pred|`` is less than or equal to it.

    Both inputs are converted to flat float arrays before subtracting, so that:
    * plain Python sequences are accepted;
    * two pandas Series are compared by position, not aligned on their index
      labels (label alignment would produce NaN errors when the indexes differ);
    * an ``(n, 1)`` prediction column is not broadcast against ``(n,)`` targets
      into an ``n x n`` error matrix.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    tolerance_values : iterable of float
        Error tolerances at which the curve is evaluated.

    Returns
    -------
    list of float
        One percentage (0-100) per tolerance, in the order given.

    Raises
    ------
    ValueError
        If there are no samples, since the percentage is then undefined.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    errors = np.abs(actual - predicted)

    if errors.size == 0:
        raise ValueError("calculate_rec_curve requires at least one sample")

    # NaN errors compare False against any tolerance and are never counted.
    return [
        np.count_nonzero(errors <= tolerance) / errors.size * 100
        for tolerance in tolerance_values
    ]

# Tolerances at which the curves are evaluated: 50 evenly spaced values
# between 0 and 1 inclusive.
tolerance_values = np.linspace(0, 1, num=50)

# REC values for each fitted model. Predictions are made on X, the data the
# models were fitted on, so these are in-sample curves.
rec_decision_tree, rec_mlp, rec_svr, rec_random_forest = (
    calculate_rec_curve(y, fitted_model.predict(X), tolerance_values)
    for fitted_model in (dt, mlp, svr, rfr)
)

# Draw the four curves on one set of axes.
_, rec_axes = plt.subplots(figsize=(8, 6))
for rec_values, legend_label in (
    (rec_decision_tree, 'Decision Tree Regressor'),
    (rec_mlp, 'MLP Regressor'),
    (rec_svr, 'Support Vector Regressor'),
    (rec_random_forest, 'Random Forest Regressor'),
):
    rec_axes.plot(tolerance_values, rec_values, label=legend_label)

rec_axes.set_title('REC Curve for Regression Models')
rec_axes.set_xlabel('Tolerance (Error)')
rec_axes.set_ylabel('Percentage of Points within Tolerance')
rec_axes.legend()
plt.show()

In [None]:
#train_forbi = pd.read_csv('TrainDataScenario1.csv')

In [None]:
#train_forbi_X, train_forbi_y = train_forbi, train_forbi.drop(columns=['resale_price_Lakh'], axis=1)

In [None]:
# Predict the target for the training rows themselves (in-sample predictions).
# Note that this rebinds previsao, which previously held the test predictions.
previsao = rfr.predict(X)

# Wrap the predictions in a single-column frame.
BI_predictions_df = pd.DataFrame(previsao, columns=['prediction'])

# Write the frame to disk; index=True stores the positional row index as the
# first CSV column.
BI_predictions_df.to_csv('BIPredictions.csv', index=True)

In [5]:
# Report (rows, columns) of the training frame.
# NOTE(review): the output recorded for this cell, (13485, 66), does not match
# the (12912, 66) recorded earlier after dropna, and the execution counter
# restarts here — the notebook appears to have been run out of order. Re-run
# top to bottom before trusting the recorded outputs.
train_data.shape

(13485, 66)

In [6]:
# Load the table that is later joined column-wise with the model predictions.
# NOTE(review): despite the file name, the columns listed further down for the
# joined frame are already one-hot indicators — confirm this is the right file.
databeforeencoding = pd.read_csv("./databeforeencoding.csv")

In [7]:
# Report (rows, columns) before rows with missing values are removed.
databeforeencoding.shape

(13485, 66)

In [8]:
# Remove, in place, every row that contains at least one missing value.
# NOTE(review): the later positional join with the predictions is only correct
# if exactly the same rows, in the same order, were dropped from the training
# frame the predictions were computed on — verify.
databeforeencoding.dropna(inplace=True)

In [9]:
# Report (rows, columns) after rows with missing values were removed.
databeforeencoding.shape

(12912, 66)

In [None]:
#C1 = BI_predictions_df['prediction']

In [None]:
#C1

In [10]:
# Read back the predictions file written earlier in this notebook. The file
# was saved with index=True and no index_col is given here, so the saved index
# comes back as an extra, unnamed column next to 'prediction'.
C1 = pd.read_csv("./BIPredictions.csv")

In [11]:
# Keep only the prediction values, as a Series.
# NOTE(review): C1 is rebound from a DataFrame to a Series, so this cell is not
# idempotent — re-running it without re-running the read_csv cell will fail.
C1= C1['prediction']

In [12]:
# Display the prediction Series for a visual check.
C1

0         5.5156
1         6.7589
2         5.7150
3        23.3035
4         6.7437
          ...   
12907    29.0730
12908     5.7933
12909     7.2945
12910     9.2048
12911     4.8426
Name: prediction, Length: 12912, dtype: float64

In [13]:
# Append the predictions as a new column. concat(axis=1) aligns on index
# labels, so the frame's index is first reset to 0..n-1 (dropna left gaps in
# it) to line up with the default integer index of C1; the join is therefore
# purely positional.
BI_df = pd.concat([databeforeencoding.reset_index(drop=True),C1],axis=1)

In [14]:
# Display the joined frame (original columns plus 'prediction').
BI_df

Unnamed: 0,registered_year,engine_capacity,kms_driven,max_power,seats,mileage,resale_price_Lakh,1,2,Comprehensive,...,Delhi,Gurgaon,Hyderabad,Jaipur,Kolkata,Lucknow,Mumbai,Pune,ID,prediction
0,2019,1199,30910,80.0,5.0,24.0,5.66,0,0,0,...,0,0,0,0,0,0,0,1,0,5.5156
1,2018,1199,48089,90.0,5.0,18.0,6.64,0,0,0,...,0,0,0,1,0,0,0,0,1,6.7589
2,2015,1497,51000,120.0,5.0,18.0,5.65,0,0,0,...,0,0,0,0,0,0,1,0,2,5.7150
3,2021,1956,30000,170.0,7.0,14.0,23.00,0,0,0,...,0,0,0,0,0,0,1,0,3,23.3035
4,2019,1197,61113,80.0,5.0,22.0,6.87,0,0,0,...,0,0,0,0,1,0,0,0,4,6.7437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12907,2021,2199,80000,200.0,7.0,14.0,26.50,0,0,0,...,0,0,1,0,0,0,0,0,13810,29.0730
12908,2017,1197,17923,80.0,5.0,18.0,5.87,0,0,0,...,0,1,0,0,0,0,0,0,13811,5.7933
12909,2018,1498,63389,110.0,5.0,22.0,7.43,0,0,1,...,0,0,1,0,0,0,0,0,13812,7.2945
12910,2017,1248,40000,90.0,5.0,24.0,9.45,0,0,1,...,0,0,0,0,0,0,0,0,13813,9.2048


In [15]:
# Inspect the last 20 rows of the joined frame.
BI_df.tail(20)

Unnamed: 0,registered_year,engine_capacity,kms_driven,max_power,seats,mileage,resale_price_Lakh,1,2,Comprehensive,...,Delhi,Gurgaon,Hyderabad,Jaipur,Kolkata,Lucknow,Mumbai,Pune,ID,prediction
12892,2019,1461,51146,110.0,5.0,20.0,9.04,0,0,1,...,0,0,0,0,0,0,1,0,13795,10.4435
12893,2017,1199,72000,90.0,5.0,18.0,5.0,0,0,1,...,1,0,0,0,0,0,0,0,13796,4.9484
12894,2016,1498,81980,100.0,5.0,22.0,6.05,0,0,0,...,0,0,0,0,0,0,0,0,13797,5.9252
12895,2017,1248,73686,70.0,5.0,26.0,6.2,0,0,0,...,0,0,0,0,0,0,0,0,13798,5.8799
12896,2015,1396,48000,110.0,5.0,18.0,3.99,0,0,1,...,0,0,0,0,1,0,0,0,13799,4.2719
12897,2012,1199,30000,80.0,5.0,18.0,1.7,0,0,0,...,0,0,0,0,1,0,0,0,13800,1.8539
12898,2017,1186,70000,70.0,5.0,26.0,3.6,0,0,0,...,0,1,0,0,0,0,0,0,13801,3.5681
12899,2018,1591,36229,120.0,5.0,14.0,10.89,0,0,0,...,0,1,0,0,0,0,0,0,13802,11.0484
12900,2016,1991,31000,180.0,5.0,14.0,22.5,0,0,1,...,0,0,0,0,0,0,0,1,13803,23.2465
12901,2010,995,50000,60.0,5.0,18.0,0.85,0,0,0,...,1,0,0,0,0,0,0,0,13804,1.0602


In [16]:
# List the column names of the joined frame.
BI_df.columns

Index(['registered_year', 'engine_capacity', 'kms_driven', 'max_power',
       'seats', 'mileage', 'resale_price_Lakh', '1', '2', 'Comprehensive',
       'None', 'Not Available', 'Third Party', 'Third Party insurance',
       'Zero Dep', 'Automatic', 'Manual', 'Fifth Owner', 'First Owner',
       'Fourth Owner', 'Second Owner', 'Third Owner', 'Unknown', 'CNG',
       'Diesel', 'Electric', 'LPG', 'Petrol', 'BMW', 'Cars', 'Chevrolet',
       'Convertibles', 'Coupe', 'Datsun', 'Hatchback', 'Honda', 'Hyundai',
       'Isuzu', 'Jaguar', 'MUV', 'Mahindra', 'Maruti', 'Mercedes-Benz',
       'Minivans', 'Pickup', 'SUV', 'Sedan', 'Skoda', 'Tata', 'Toyota',
       'Volvo', 'Wagon', 'Agra', 'Ahmedabad', 'Bangalore', 'Chandigarh',
       'Chennai', 'Delhi', 'Gurgaon', 'Hyderabad', 'Jaipur', 'Kolkata',
       'Lucknow', 'Mumbai', 'Pune', 'ID', 'prediction'],
      dtype='object')

In [17]:
# Collapse the one-hot city indicators into a single 'city' label column.
# idxmax(axis=1) yields, per row, the name of the first column holding the row
# maximum, so a row whose indicators are all 0 is labelled with the first
# listed column.
city_columns = [
    'Agra', 'Ahmedabad', 'Bangalore', 'Chandigarh', 'Chennai', 'Delhi',
    'Gurgaon', 'Hyderabad', 'Jaipur', 'Kolkata', 'Lucknow', 'Mumbai', 'Pune',
]
BI_df['city'] = BI_df[city_columns].idxmax(axis=1)

# The indicator columns are redundant once the label column exists.
BI_df.drop(columns=city_columns, inplace=True)

In [18]:
# Collapse the one-hot insurance indicators into a single 'insurance' label
# column. idxmax(axis=1) yields, per row, the name of the first column holding
# the row maximum, so a row whose indicators are all 0 is labelled with the
# first listed column.
# NOTE(review): '1' and '2' are treated as insurance categories here — confirm
# what these two columns actually encode.
insurance_columns = [
    '1', '2', 'Comprehensive', 'None', 'Not Available',
    'Third Party', 'Third Party insurance', 'Zero Dep',
]
BI_df['insurance'] = BI_df[insurance_columns].idxmax(axis=1)

# The indicator columns are redundant once the label column exists.
BI_df.drop(columns=insurance_columns, inplace=True)

In [19]:
# Collapse the one-hot transmission indicators into a single
# 'transmission_type' label column. idxmax(axis=1) yields, per row, the name of
# the first column holding the row maximum, so a row whose indicators are both
# 0 is labelled 'Automatic'.
transmission_columns = ['Automatic', 'Manual']
BI_df['transmission_type'] = BI_df[transmission_columns].idxmax(axis=1)

# The indicator columns are redundant once the label column exists.
BI_df.drop(columns=transmission_columns, inplace=True)

In [20]:
# Collapse the one-hot ownership indicators into a single 'owner_type' label
# column. idxmax(axis=1) yields, per row, the name of the first column holding
# the row maximum, so a row whose indicators are all 0 is labelled with the
# first listed column.
owner_columns = [
    'Fifth Owner', 'First Owner', 'Fourth Owner',
    'Second Owner', 'Third Owner', 'Unknown',
]
BI_df['owner_type'] = BI_df[owner_columns].idxmax(axis=1)

# The indicator columns are redundant once the label column exists.
BI_df.drop(columns=owner_columns, inplace=True)


In [21]:
# Collapse the one-hot fuel indicators into a single 'fuel_type' label column.
# idxmax(axis=1) yields, per row, the name of the first column holding the row
# maximum, so a row whose indicators are all 0 is labelled with the first
# listed column.
fuel_columns = ['CNG', 'Diesel', 'Electric', 'LPG', 'Petrol']
BI_df['fuel_type'] = BI_df[fuel_columns].idxmax(axis=1)

# The indicator columns are redundant once the label column exists.
BI_df.drop(columns=fuel_columns, inplace=True)

In [22]:
# Collapse the remaining one-hot indicators into a single 'body_type' label
# column. idxmax(axis=1) yields, per row, the name of the first column holding
# the row maximum, so a row whose indicators are all 0 is labelled with the
# first listed column.
# NOTE(review): this list mixes what look like manufacturer names (BMW, Honda,
# Maruti, ...) with body styles (Hatchback, SUV, Sedan, ...). If a row can have
# a 1 in both groups, only the first matching column survives and the other
# piece of information is lost — confirm the encoding and split the groups if
# needed.
body_type_columns = [
    'BMW', 'Cars', 'Chevrolet', 'Convertibles', 'Coupe', 'Datsun',
    'Hatchback', 'Honda', 'Hyundai', 'Isuzu', 'Jaguar', 'MUV',
    'Mahindra', 'Maruti', 'Mercedes-Benz', 'Minivans', 'Pickup', 'SUV',
    'Sedan', 'Skoda', 'Tata', 'Toyota', 'Volvo', 'Wagon',
]
BI_df['body_type'] = BI_df[body_type_columns].idxmax(axis=1)

# The indicator columns are redundant once the label column exists.
BI_df.drop(columns=body_type_columns, inplace=True)

In [23]:
# Display the frame after the indicator groups were collapsed into label columns.
BI_df

Unnamed: 0,registered_year,engine_capacity,kms_driven,max_power,seats,mileage,resale_price_Lakh,ID,prediction,city,insurance,transmission_type,owner_type,fuel_type,body_type
0,2019,1199,30910,80.0,5.0,24.0,5.66,0,5.5156,Pune,Third Party,Manual,First Owner,Petrol,Hatchback
1,2018,1199,48089,90.0,5.0,18.0,6.64,1,6.7589,Jaipur,Zero Dep,Manual,Third Owner,Petrol,SUV
2,2015,1497,51000,120.0,5.0,18.0,5.65,2,5.7150,Mumbai,Third Party insurance,Manual,Second Owner,Petrol,Sedan
3,2021,1956,30000,170.0,7.0,14.0,23.00,3,23.3035,Mumbai,Third Party insurance,Automatic,First Owner,Diesel,SUV
4,2019,1197,61113,80.0,5.0,22.0,6.87,4,6.7437,Kolkata,Zero Dep,Automatic,First Owner,Petrol,Hatchback
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12907,2021,2199,80000,200.0,7.0,14.0,26.50,13810,29.0730,Hyderabad,Third Party insurance,Automatic,Second Owner,Diesel,MUV
12908,2017,1197,17923,80.0,5.0,18.0,5.87,13811,5.7933,Gurgaon,Third Party insurance,Manual,First Owner,Petrol,Hatchback
12909,2018,1498,63389,110.0,5.0,22.0,7.43,13812,7.2945,Hyderabad,Comprehensive,Manual,Second Owner,Diesel,Sedan
12910,2017,1248,40000,90.0,5.0,24.0,9.45,13813,9.2048,Bangalore,Comprehensive,Manual,Second Owner,Diesel,SUV


In [18]:
# Write the decoded frame to BI.csv; index=True stores the row index as the
# first CSV column.
BI_df.to_csv('BI.csv', index=True)