# Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns

# Exploratory Data Analysis

In [9]:
# Location of the diamonds dataset, relative to the notebook's working directory
data_path = 'data/diamonds.csv'

# Let pandas open the file itself: it closes the handle once parsing is done and
# decodes as UTF-8 by default instead of relying on the platform's locale encoding
data = pd.read_csv(data_path)

print(data) #see sample data (pandas truncates the printout to the first and last rows)
data.info() #checking info
data.describe() #statistic description

       carat        cut color clarity  depth  table  price     x     y     z
0       0.23      Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1       0.21    Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2       0.23       Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3       0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4       0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
...      ...        ...   ...     ...    ...    ...    ...   ...   ...   ...
53935   0.72      Ideal     D     SI1   60.8   57.0   2757  5.75  5.76  3.50
53936   0.72       Good     D     SI1   63.1   55.0   2757  5.69  5.75  3.61
53937   0.70  Very Good     D     SI1   62.8   60.0   2757  5.66  5.68  3.56
53938   0.86    Premium     H     SI2   61.0   58.0   2757  6.15  6.12  3.74
53939   0.75      Ideal     D     SI2   62.2   55.0   2757  5.83  5.87  3.64

[53940 rows x 10 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex:

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


## Column Description

|Column|Description                                                      |
|:-----|:---------------------------------------------------------------|
|carat |weight of the diamond (0.2--5.01)|
|cut |quality of the cut (Fair, Good, Very Good, Premium, Ideal)|
|color |diamond colour, from J (worst) to D (best)|
|clarity |a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))|
|depth |total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)|
|table |width of top of diamond relative to widest point (43--95)|
|price |price in US dollars (\$326--\$18,823)|
|x |length in mm (0--10.74)|
|y |width in mm (0--58.9)|
|z |depth in mm (0--31.8)|


The summary statistics show that the minimum value of the variables x, y, and z is 0. Because x, y, and z are the physical dimensions of a diamond, a value of 0 is not plausible, so these zeros are suspected to be missing values. The following steps handle them. First, count the missing values in each column.

In [12]:
#count the zero entries (treated as missing values) of every dimension column in one pass
zero_counts = data[['x', 'y', 'z']].eq(0).sum()

#keep the per-column counts available under the names x, y and z
x, y, z = (zero_counts[col] for col in ('x', 'y', 'z'))

#report the count of each column
for col, count in zip(('x', 'y', 'z'), (x, y, z)):
    print(f"In column {col} there are/is {count} missing value(s)")

In column x there are/is 8 missing value(s)
In column y there are/is 7 missing value(s)
In column z there are/is 20 missing value(s)


Because the number of missing values is small compared with the size of the dataset, the rows that contain them can simply be deleted. For this reason, a new variable is created to store the data after it has been processed.

In [15]:
#columns holding the physical dimensions of a diamond
dimension_cols = ['x', 'y', 'z']

#True for rows in which none of the dimensions is zero
nonzero_dims = data[dimension_cols].ne(0).all(axis=1)

#new_data stores the processed data: only the rows with no zero dimension
new_data = data.loc[nonzero_dims]

#check data shape
new_data.shape

(53920, 10)