In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()

In [None]:
# Load the cars dataset from a path relative to the notebook and display it.
raw_data = pd.read_csv("../data/cars.csv")
raw_data

Pré-processamento

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
raw_data.describe(include="all")

Determinando as variáveis de interesse

In [None]:
# Work on a copy of the raw frame without the "Model" column.
data = raw_data.drop(columns=["Model"])

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Discard every row that has at least one missing value.
data_no_mv = data.dropna(axis="index")

In [None]:
# Inspect the frame after the rows with missing values were removed.
data_no_mv

In [None]:
# Distribution of Price before any outlier filtering.
# `sns.distplot` is deprecated; `histplot` with `kde=True, stat="density"` draws the same
# kind of figure (density-normalised histogram plus KDE curve). `cut=3` lets the KDE
# extend past the data range as distplot's did.
sns.histplot(data_no_mv["Price"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
# Keep only the rows whose Price is strictly below the 99th percentile of Price.
q = data_no_mv["Price"].quantile(0.99)
data_1 = data_no_mv.loc[data_no_mv["Price"].lt(q)]
data_1.describe(include="all")

In [None]:
# Distribution of Price after removing the top 1% of prices.
# `sns.distplot` is deprecated; `histplot` with `kde=True, stat="density"` draws the same
# kind of figure (density-normalised histogram plus KDE curve).
sns.histplot(data_1["Price"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
# Keep only the rows whose Mileage is strictly below the 99th percentile of Mileage.
q = data_1["Mileage"].quantile(0.99)
data_2 = data_1.loc[data_1["Mileage"].lt(q)]
data_2.describe(include="all")

In [None]:
# Distribution of Mileage after the 99th-percentile filter.
# `sns.distplot` is deprecated; `histplot` with `kde=True, stat="density"` draws the same
# kind of figure (density-normalised histogram plus KDE curve).
sns.histplot(data_2["Mileage"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
# Keep only rows with EngineV strictly below 6.5.
# NOTE(review): 6.5 is a hard-coded cut-off; its rationale is not shown here — confirm.
data_3 = data_2.loc[data_2["EngineV"].lt(6.5)]
data_3.describe(include="all")

In [None]:
# Distribution of EngineV after the fixed 6.5 cut-off.
# `sns.distplot` is deprecated; `histplot` with `kde=True, stat="density"` draws the same
# kind of figure (density-normalised histogram plus KDE curve).
sns.histplot(data_3["EngineV"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
# Keep only the rows whose Year is strictly above the 1st percentile of Year.
q = data_3["Year"].quantile(0.01)
data_4 = data_3.loc[data_3["Year"].gt(q)]

In [None]:
# Distribution of Year after removing the lowest 1% of years.
# `sns.distplot` is deprecated; `histplot` with `kde=True, stat="density"` draws the same
# kind of figure (density-normalised histogram plus KDE curve).
sns.histplot(data_4["Year"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
# The filters above left gaps in the row labels; renumber from 0 and discard the old index.
data_cleaned = data_4.reset_index(drop=True)

In [None]:
# Summary statistics of the cleaned frame, all columns included.
data_cleaned.describe(include="all")

In [None]:
# Scatter Price against each numeric feature, three panels sharing one y-axis.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(15,3))
panels = (
    (ax1, "Year", "Preço e Ano"),
    (ax2, "Mileage", "Preço e Quilometragem"),
    (ax3, "EngineV", "Preço e Motor"),
)
for axis, feature, title in panels:
    axis.scatter(data_cleaned[feature], data_cleaned["Price"])
    axis.set_title(title)

plt.show()

In [None]:
# Add the natural log of Price as a new "log_price" column and show the result.
log_price = np.log(data_cleaned["Price"])
data_cleaned = data_cleaned.assign(log_price=log_price)
data_cleaned

In [None]:
# Same three panels as before, now with log_price on the shared y-axis.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(15,3))
panels = (
    (ax1, "Year", "Preço e Ano"),
    (ax2, "Mileage", "Preço e Quilometragem"),
    (ax3, "EngineV", "Preço e Motor"),
)
for axis, feature, title in panels:
    axis.scatter(data_cleaned[feature], data_cleaned["log_price"])
    axis.set_title(title)

plt.show()

In [None]:
# Drop the raw Price column now that log_price is available.
# This cell rebinds `data_cleaned` from itself, so a second run would otherwise raise
# KeyError because "Price" is already gone; errors="ignore" makes the cell safe to re-run.
data_cleaned = data_cleaned.drop(columns=["Price"], errors="ignore")

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for each of the three numeric predictors.
# NOTE(review): only these three columns are passed in, with no intercept column, so the
# values are presumably uncentered VIFs — confirm that is intended before acting on them.
variables = data_cleaned[["Mileage", "Year", "EngineV"]]
design = variables.values
vif = pd.DataFrame(
    {
        "VIF": [
            variance_inflation_factor(design, col_idx)
            for col_idx in range(design.shape[1])
        ],
        "features": variables.columns,
    }
)


In [None]:
# Rule of thumb used to read the table below:
# VIF = 1: no multicollinearity
# 1 < VIF < 5: acceptable
# VIF > 10: unacceptable
vif

In [None]:
# Remove the "Year" predictor following the VIF check above.
data_no_multicollinearity = data_cleaned.drop(columns=["Year"])

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each
# category so the dummy columns are not perfectly collinear.
data_with_dummies = pd.get_dummies(data_no_multicollinearity, drop_first=True)

In [None]:
# Preview the first rows of the encoded frame.
data_with_dummies.head()

In [None]:
# List the column names so they can be reordered in the next cell.
data_with_dummies.columns.values

In [None]:
# Explicit column order for the modelling frame: target first, then the predictors.
cols = [
    # target
    "log_price",
    # numeric predictors
    "Mileage",
    "EngineV",
    # Brand dummy columns
    "Brand_BMW",
    "Brand_Mercedes-Benz",
    "Brand_Mitsubishi",
    "Brand_Renault",
    "Brand_Toyota",
    "Brand_Volkswagen",
    # Body dummy columns
    "Body_hatch",
    "Body_other",
    "Body_sedan",
    "Body_vagon",
    "Body_van",
    # Engine Type dummy columns
    "Engine Type_Gas",
    "Engine Type_Other",
    "Engine Type_Petrol",
    # Registration dummy column
    "Registration_yes",
]

In [None]:
# Select the columns in the order given by `cols` and preview the result.
data_preprocessed = data_with_dummies.loc[:, cols]
data_preprocessed.head()

In [None]:
# Separate the dependent variable (log_price) from the predictor columns.
target = data_preprocessed["log_price"]
inputs = data_preprocessed.drop(columns="log_price")

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit a StandardScaler on every predictor column, the dummy columns included.
# NOTE(review): the scaler is fitted on all rows, before the train/test split below.
scaler = StandardScaler()
scaler.fit(inputs)


In [None]:
# Apply the fitted scaler to the predictors.
inputs_scaled = scaler.transform(inputs)

In [None]:
# Inspect the scaled predictor values.
inputs_scaled

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* predictors first. With the same row count and random_state the
# rows land in the same train/test partitions as a split of the scaled array would.
inputs_train, inputs_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.2, random_state=365
)

# Fit the scaler on the training rows only, then apply it to both partitions.
# Fitting on all rows before splitting lets the test set's mean and variance leak into
# the training features, which makes the test-set evaluation optimistic.
scaler = StandardScaler().fit(inputs_train)
X_train = scaler.transform(inputs_train)
X_test = scaler.transform(inputs_test)


In [None]:
# Fit an ordinary least squares linear regression on the training partition.
reg = LinearRegression()
reg.fit(X_train, y_train)


In [None]:
# In-sample predictions, used below to inspect the fit on the training data.
y_hat = reg.predict(X_train)

In [None]:
# Observed vs. predicted log-price on the training set, both axes fixed to [6, 13].
plt.scatter(y_train, y_hat)
plt.gca().set(xlim=(6, 13), ylim=(6, 13))
plt.show()

In [None]:
# Distribution of the training residuals (observed minus predicted log-price).
# `sns.distplot` is deprecated; `histplot` with `kde=True, stat="density"` draws the same
# kind of figure (density-normalised histogram plus KDE curve).
sns.histplot(y_train - y_hat, kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
# R-squared of the model on the training data.
reg.score(X_train, y_train)

In [None]:
# Fitted intercept of the regression.
reg.intercept_

In [None]:
# Fitted coefficients, one per predictor column.
reg.coef_

In [None]:
# Pair every predictor name with its fitted coefficient for easier reading.
reg_summary = pd.DataFrame(
    {
        "Features": inputs.columns.values,
        "Weights": reg.coef_,
    }
)
reg_summary

In [None]:
# Distinct "Engine Type" values; the one without a dummy column in `cols` is the baseline.
data_cleaned["Engine Type"].unique()

In [None]:
# Predictions for the held-out test partition.
y_hat_test = reg.predict(X_test)

In [None]:
# Observed vs. predicted log-price on the test set; alpha=0.2 makes dense regions visible.
plt.scatter(y_test, y_hat_test, alpha=0.2)
plt.gca().set(xlim=(6, 13), ylim=(6, 13))
plt.show()

In [None]:
# Exponentiate the predicted log-prices to return to the original price scale.
df_pf = pd.DataFrame({"Previsões": np.exp(y_hat_test)})
df_pf.head()

In [None]:
# `df_pf` has a fresh 0..n-1 index, while `y_test` still carries the row labels it kept
# through the train/test split. Assigning the Series directly aligns on those labels and
# fills most of "Target" with NaN, so assign positionally through the underlying array.
df_pf["Target"] = np.exp(y_test.to_numpy())
df_pf.head()

In [None]:
# Renumber y_test from 0 so its index lines up with df_pf's default index.
y_test = y_test.reset_index(drop=True)

In [None]:
# With matching indices the observed prices now land on the correct rows.
df_pf["Target"] = np.exp(y_test)
df_pf.head()

In [None]:
# Residual in price units: observed minus predicted.
df_pf["Resíduos"] = df_pf["Target"].sub(df_pf["Previsões"])

In [None]:
# Absolute residual expressed as a percentage of the observed price.
df_pf["Diferença %"] = (df_pf["Resíduos"] / df_pf["Target"] * 100).abs()

In [None]:
# Summary statistics of predictions, targets, residuals and percentage differences.
df_pf.describe()

In [None]:
# Let pandas print up to 999 rows with floats at two decimals, then list the test
# predictions from the smallest to the largest percentage difference.
pd.set_option("display.max_rows", 999)
pd.set_option("display.float_format", lambda value: "%.2f" % value)
df_pf.sort_values(by="Diferença %")