# Setup

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import iqr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline

In [2]:
# Load the historical diamonds data; the path is relative, so the CSV is
# looked up in the notebook's current working directory.
diamonds_history = pd.read_csv('00-diamonds.csv')

FileNotFoundError: [Errno 2] No such file or directory: '00-diamonds.csv'

In [None]:
# Display the loaded frame for a first look at its contents
diamonds_history

In [None]:
# Descriptive statistics of the historical data
diamonds_history.describe()

In [None]:
# Count the missing values in each column
diamonds_history.isnull().sum()

In [None]:
# Keep an untouched deep copy of the data before any transformation
raw_diamond_history = diamonds_history.copy(deep=True)
raw_diamond_history

In [None]:
# Average price over the untouched historical data
raw_mean_price = raw_diamond_history['price'].mean()
raw_mean_price

In [None]:
# Load Rick's diamonds from a CSV in the current working directory
rick_diamonds = pd.read_csv('00-rick_diamonds.csv')

# Display the loaded frame
rick_diamonds

In [None]:
# Baseline: predict the historical mean price for every one of Rick's diamonds.
# assign() returns a new frame, so rick_diamonds itself is left untouched.
raw_rick_price = rick_diamonds.assign(price_predicted=raw_mean_price)
raw_rick_price

In [None]:
# Export the baseline predictions to CSV, leaving the row index out of the file
raw_rick_price.to_csv('LG-baseline_model-0.csv', index=False)

RMSE = 3969

# Exploratory Data Analysis

In [None]:
# Summary of the frame's columns, used to spot the non-numeric ones
diamonds_history.info()

There are 3 categorical variables and their values need to be converted to numbers.

## Convert categorical variables

### History

#### Cut

In [None]:
# Distinct cut labels present in the historical data
diamonds_history['cut'].unique()

In [None]:
# Candidate ordinal codes: 1..(number of distinct cut labels)
list(range(1, len(diamonds_history['cut'].unique()) + 1))

In [None]:
# Ordinal code for each cut label, as a digit string: listed first -> '1', listed last -> '5'
cut_num = {
    grade: str(rank)
    for rank, grade in enumerate(['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], start=1)
}

In [None]:
# Encode cut as an integer rank. cut_num maps each label to a digit string, so a
# plain lookup is all that is needed (the former x.replace(x, ...) round trip
# was a no-op). A label missing from cut_num becomes NaN and makes the integer
# cast fail loudly instead of slipping through.
diamonds_history['cut_num'] = diamonds_history['cut'].map(cut_num).astype('int64')

In [None]:
# Display the frame to check the newly added cut_num column
diamonds_history

#### Color

In [None]:
# Distinct colour letters in reverse alphabetical order
color_cat = sorted(diamonds_history['color'].unique(), reverse=True)
color_cat

In [None]:
# Map each colour letter to its 1-based position in color_cat, as a digit string
color_cat_num = dict(zip(color_cat, (str(rank) for rank in range(1, len(color_cat) + 1))))
color_cat_num

In [None]:
# Encode color as an integer rank via a direct lookup in color_cat_num (digit
# strings). A letter missing from the mapping becomes NaN and makes the integer
# cast fail loudly.
diamonds_history['color_num'] = diamonds_history['color'].map(color_cat_num).astype('int64')

In [None]:
# Display the frame to check the newly added color_num column
diamonds_history

#### Clarity

A measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

In [None]:
# Ordinal code for each clarity grade, as a digit string: listed first -> '1', listed last -> '8'
clarity_cat = {
    grade: str(rank)
    for rank, grade in enumerate(['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], start=1)
}

In [None]:
# Encode clarity as an integer rank via a direct lookup in clarity_cat (digit
# strings). A grade missing from the mapping becomes NaN and makes the integer
# cast fail loudly.
diamonds_history['clarity_num'] = diamonds_history['clarity'].map(clarity_cat).astype('int64')
diamonds_history

### Rick

In [None]:
# Keep a deep copy of Rick's data as loaded, before the encoded columns are added
raw_rick_diamonds = rick_diamonds.copy(deep=True)

#### Cut

In [None]:
# Encode Rick's cut with the same cut_num mapping used for the historical data.
# A direct lookup replaces the no-op x.replace(x, ...) round trip; a label
# missing from the mapping becomes NaN and makes the integer cast fail loudly.
rick_diamonds['cut_num'] = rick_diamonds['cut'].map(cut_num).astype('int64')
rick_diamonds

In [None]:
# Encode Rick's color with the same color_cat_num mapping used for the
# historical data; a letter missing from the mapping becomes NaN and makes the
# integer cast fail loudly.
rick_diamonds['color_num'] = rick_diamonds['color'].map(color_cat_num).astype('int64')
rick_diamonds

In [None]:
# Encode Rick's clarity with the same clarity_cat mapping used for the
# historical data; a grade missing from the mapping becomes NaN and makes the
# integer cast fail loudly.
rick_diamonds['clarity_num'] = rick_diamonds['clarity'].map(clarity_cat).astype('int64')
rick_diamonds

# First Linear Regression

In [None]:
# Predictors for the first regression: carat, the three encoded categories,
# depth, table and the x/y/z dimensions
features = ['carat', 'cut_num', 'color_num', 'clarity_num', 'depth', 'table', 'x', 'y', 'z']
X = diamonds_history[features]
y = diamonds_history['price']

# Instantiate the model and fit it on the historical data
model = LinearRegression()
model.fit(X, y)

In [None]:
# Predict Rick's prices from the same nine columns the model was fitted on
rick_features = rick_diamonds[['carat', 'cut_num', 'color_num', 'clarity_num', 'depth', 'table', 'x', 'y', 'z']]
rick_predicted = model.predict(rick_features)

In [None]:
# Inspect the predictions returned by model.predict
rick_predicted

In [None]:
# Attach the predictions to the untouched copy of Rick's data.
# assign() returns a new frame, so raw_rick_diamonds is not modified.
rick_final = raw_rick_diamonds.assign(price_predicted=rick_predicted)
rick_final

In [None]:
# Export the predictions to CSV, leaving the row index out of the file
rick_final.to_csv('LG-price-predicted.csv', index=False)

RMSE = 1230

R² = 90.40%

# Exploratory Data Analysis

In [None]:
# Correlation matrix over the numeric columns only: the frame still holds the
# text columns cut/color/clarity, which corr() cannot convert to numbers
# (pandas >= 2.0 raises on them instead of silently dropping them).
diamonds_history.select_dtypes(include='number').corr()

In [None]:
# Pairwise correlations between the numeric/encoded features and the price
corr_columns = ['carat', 'cut_num', 'color_num', 'clarity_num', 'depth', 'table', 'x', 'y', 'z', 'price']
corr = diamonds_history[corr_columns].corr()

# Boolean mask that hides the upper triangle of the matrix
mask = np.triu(np.ones_like(corr, dtype=bool))

# Custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the annotated heatmap on a fixed [-1, 1] colour scale centred on 0
fig, ax = plt.subplots(figsize=(20, 10))
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=1,
    vmin=-1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
)
sns.heatmap(corr, **heatmap_options);

- Carat, x, y and z are the variables that influence the price the most.
- Depth is the least relevant variable, and using it in the model would be redundant because it is already derived from x, y and z.

# Model using carat, x, y, z

In [None]:
# Predictors for this model: carat and the x/y/z dimensions.
# One list drives both fit and predict so the two can never drift apart.
features = ['carat', 'x', 'y', 'z']

# Instantiate the model and fit it on the historical data
model = LinearRegression()
X = diamonds_history[features]
y = diamonds_history['price']
model.fit(X, y)

# Predict Rick's prices from the same columns
rick_predicted = model.predict(rick_diamonds[features])

# Attach the predictions to the untouched copy of Rick's data
rick_final = raw_rick_diamonds.copy()
rick_final['price_predicted'] = rick_predicted

# Export the dataset
rick_final.to_csv('LG-price-predicted.csv', index=False)

# Show the result: a bare expression is only rendered when it is the last line of a cell
rick_final

In [None]:
# Inspect the predictions returned by model.predict
rick_predicted

RMSE = 1538

# Model without depth

In [None]:
# Predictors for this model: everything from the first regression except depth.
# One list drives both fit and predict so the two can never drift apart.
features = ['carat', 'cut_num', 'color_num', 'clarity_num', 'table', 'x', 'y', 'z']

# Instantiate the model and fit it on the historical data
model = LinearRegression()
X = diamonds_history[features]
y = diamonds_history['price']
model.fit(X, y)

# Predict Rick's prices from the same columns
rick_predicted = model.predict(rick_diamonds[features])

# Attach the predictions to the untouched copy of Rick's data
rick_final = raw_rick_diamonds.copy()
rick_final['price_predicted'] = rick_predicted

# Export the dataset
rick_final.to_csv('LG-price-predicted.csv', index=False)

# Show the result: a bare expression is only rendered when it is the last line of a cell
rick_final

RMSE = 1232

R² = 90.36%

# Model without x, y, z

In [None]:
# Predictors for this model: everything from the first regression except x, y, z.
# One list drives both fit and predict so the two can never drift apart.
features = ['carat', 'cut_num', 'color_num', 'clarity_num', 'depth', 'table']

# Instantiate the model and fit it on the historical data
model = LinearRegression()
X = diamonds_history[features]
y = diamonds_history['price']
model.fit(X, y)

# Predict Rick's prices from the same columns
rick_predicted = model.predict(rick_diamonds[features])

# Attach the predictions to the untouched copy of Rick's data
rick_final = raw_rick_diamonds.copy()
rick_final['price_predicted'] = rick_predicted

# Export the dataset
rick_final.to_csv('LG-price-predicted.csv', index=False)

# Show the result: a bare expression is only rendered when it is the last line of a cell
rick_final

RMSE = 1247


R² = 90.17%

# Exploratory Data Analysis

In [None]:
# Dimensions of the historical frame
diamonds_history.shape

In [None]:
# Descriptive statistics, now including the encoded *_num columns added above
diamonds_history.describe()

## Check min x, y and z

In [None]:
# Rows where at least one of x, y, z is recorded as 0
diamonds_history[(diamonds_history[['x', 'y', 'z']] == 0).any(axis=1)]

In [None]:
# Rows where x is 0, or where y and z are both 0.
# (& binds tighter than |; the extra parentheses spell out that grouping.)
diamonds_history[
    (diamonds_history['x'] == 0)
    | ((diamonds_history['y'] == 0) & (diamonds_history['z'] == 0))
]

In [None]:
# Drop the rows where x is 0, or where y and z are both 0, then renumber the index.
# (& binds tighter than |; the extra parentheses spell out that grouping.)
zero_dimension_rows = (diamonds_history['x'] == 0) | (
    (diamonds_history['y'] == 0) & (diamonds_history['z'] == 0)
)
diamonds_history = diamonds_history[~zero_dimension_rows].reset_index(drop=True)
diamonds_history

In [None]:
# Predictors for this model: every feature except depth, fitted on whatever
# rows diamonds_history currently holds.
# One list drives both fit and predict so the two can never drift apart.
features = ['carat', 'cut_num', 'color_num', 'clarity_num', 'table', 'x', 'y', 'z']

# Instantiate the model and fit it on the historical data
model = LinearRegression()
X = diamonds_history[features]
y = diamonds_history['price']
model.fit(X, y)

# Predict Rick's prices from the same columns
rick_predicted = model.predict(rick_diamonds[features])

# Attach the predictions to the untouched copy of Rick's data
rick_final = raw_rick_diamonds.copy()
rick_final['price_predicted'] = rick_predicted

# Export the dataset
rick_final.to_csv('LG-price-predicted.csv', index=False)

# Show the result: a bare expression is only rendered when it is the last line of a cell
rick_final

RMSE = 1232

R² = 90.36%

# Exploratory Data Analysis

| Column  | Description  |
|---|---|
| Price  | Price in US dollars (326-18,823)  |
| Carat  | Weight of the diamond (0.2--5.01)  |
| Cut  | Quality of the cut (Fair, Good, Very Good, Premium, Ideal)  |
| Color  | Diamond colour, from J (worst) to D (best)  |
| Clarity  | A measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))   |
| x  | Length in mm (0--10.74)  |
| y  | Width in mm (0--58.9)  |
| z  | Depth in mm (0--31.8)  |
| Depth  | Total depth percentage = 100 * z / mean(x, y) = 200 * z / (x + y) (43--79)  |
| Table  | Width of top of diamond relative to widest point (43--95)  |

In [None]:
# Rows where at least one of x, y, z is recorded as 0
diamonds_history[(diamonds_history[['x', 'y', 'z']] == 0).any(axis=1)]

In [None]:
# Scratch check of np.mean on a two-element list
np.mean([3, 4])

In [None]:
# Inspect the row index of the frame
diamonds_history.index

In [None]:
# Dimensions of the frame at this point
diamonds_history.shape

In [None]:
# Rebuild z where it was recorded as 0: z = depth * mean(x, y) / 100.
# Columns are addressed by name instead of the positional indices 4, 7, 8, 9,
# which are assumed to be depth, x, y, z — TODO confirm against the CSV column order.
# The vectorised form also no longer depends on the index being 0..n-1.
z_from_depth = diamonds_history['depth'] * ((diamonds_history['x'] + diamonds_history['y']) / 2) / 100

# Keep the measured z wherever it is non-zero; use the rebuilt value otherwise
diamonds_history['z_calc'] = diamonds_history['z'].where(diamonds_history['z'] != 0, z_from_depth)
diamonds_history

In [None]:
# Rows where at least one of x, y, z is recorded as 0
diamonds_history[(diamonds_history[['x', 'y', 'z']] == 0).any(axis=1)]

In [None]:
# Train on z_calc (z with its zero entries rebuilt from depth) instead of the
# raw z; fitting on raw z simply reproduces the previous model.
# NOTE(review): the RMSE / R² recorded for this section came from the raw-z
# fit and should be re-evaluated.
train_features = ['carat', 'cut_num', 'color_num', 'clarity_num', 'table', 'x', 'y', 'z_calc']
predict_features = ['carat', 'cut_num', 'color_num', 'clarity_num', 'table', 'x', 'y', 'z']

# Instantiate the model and fit it; z_calc is renamed back to 'z' so the fitted
# feature names match the columns available in Rick's data
model = LinearRegression()
X = diamonds_history[train_features].rename(columns={'z_calc': 'z'})
y = diamonds_history['price']
model.fit(X, y)

# Predict Rick's prices
rick_predicted = model.predict(rick_diamonds[predict_features])

# Attach the predictions to the untouched copy of Rick's data
rick_final = raw_rick_diamonds.copy()
rick_final['price_predicted'] = rick_predicted

# Export the dataset
rick_final.to_csv('LG-price-predicted.csv', index=False)

# Show the result: a bare expression is only rendered when it is the last line of a cell
rick_final

RMSE = 1232

R² = 90.36%

# Exploratory Data Analysis

## Check z

In [None]:
# Distribution of z, to look for outliers
sns.boxplot(y=diamonds_history['z'])

In [None]:
# Rows whose z exceeds 25
diamonds_history.loc[diamonds_history['z'] > 25]

In [None]:
# Interquartile range of z
iqr(diamonds_history['z'])

# Exploratory Data Analysis

# Exploratory Data Analysis

# Exploratory Data Analysis

In [None]:
# Pairwise plots for the numeric/encoded features and the price
pairplot_columns = ['carat', 'cut_num', 'color_num', 'clarity_num', 'depth', 'table', 'x', 'y', 'z', 'price']
sns.pairplot(diamonds_history[pairplot_columns])

In [None]:
# Plot boxplot
fig, axes = plt.subplots(nrows=1, ncols=6, figsize=(20, 10))
sns.boxplot(y=diamonds_history.carat, ax=ax[0])
sns.boxplot(y=diamonds_history.depth, ax=ax[1])
sns.boxplot(y=diamonds_history.table, ax=ax[2])
sns.boxplot(y=diamonds_history.x, ax=ax[3])
sns.boxplot(y=diamonds_history.y, ax=ax[4])
sns.boxplot(y=diamonds_history.z, ax=ax[5])
plt.subplots_adjust(wspace=0.5)

In [None]:
# Distribution of carat
sns.boxplot(y=diamonds_history['carat'])

In [None]:
# Distribution of depth
sns.boxplot(y=diamonds_history['depth'])

In [None]:
# Distribution of table
sns.boxplot(y=diamonds_history['table'])

In [None]:
# Distribution of x
sns.boxplot(y=diamonds_history['x'])

In [None]:
# Distribution of y
sns.boxplot(y=diamonds_history['y'])

In [None]:
# Distribution of z
sns.boxplot(y=diamonds_history['z'])