In [1]:
# Import packages and data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing

# Read the case-study CSV into a DataFrame and preview its first rows
csv_path = '/content/diamonds_casestudy.csv'
diamonds = pd.read_csv(csv_path)
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,width,length,height
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
# Number of diamonds in each clarity grade
diamonds.groupby('clarity').size()

Unnamed: 0_level_0,0
clarity,Unnamed: 1_level_1
I1,741
IF,1790
SI1,13065
SI2,9194
VS1,8171
VS2,12258
VVS1,3655
VVS2,5066


In [3]:
# Collapse the eight clarity grades into five broader categories
clarity_groups = {
    'I1': 'Included',
    'IF': 'Internally flawless',
    'SI1': 'Slightly Included',
    'SI2': 'Slightly Included',
    'VS1': 'Very Slightly Included',
    'VS2': 'Very Slightly Included',
    'VVS1': 'Very Very Slightly Included',
    'VVS2': 'Very Very Slightly Included',
}
diamonds['clarity2'] = diamonds['clarity'].map(clarity_groups)

# Number of diamonds in each combined category
diamonds.groupby('clarity2').size()

Unnamed: 0_level_0,0
clarity2,Unnamed: 1_level_1
Included,741
Internally flawless,1790
Slightly Included,22259
Very Slightly Included,20429
Very Very Slightly Included,8721


In [4]:
# The dataset has no explicit shape column, so use the
# length-to-width ratio as a rough indicator of shape
shape = diamonds['length'].div(diamonds['width'])
shape.describe()

Unnamed: 0,0
count,53933.0
mean,inf
std,
min,0.6189759
25%,0.9931034
50%,1.004274
75%,1.007429
max,inf


In [5]:
# Summary statistics for the length column
diamonds.loc[:, 'length'].describe()

Unnamed: 0,length
count,53940.0
mean,5.734526
std,1.142135
min,0.0
25%,4.72
50%,5.71
75%,6.54
max,58.9


In [6]:
# Summary statistics for the width column
diamonds.loc[:, 'width'].describe()

Unnamed: 0,width
count,53940.0
mean,5.731157
std,1.121761
min,0.0
25%,4.71
50%,5.7
75%,6.54
max,10.74


In [7]:
# Keep only diamonds whose width and length are both positive (a zero in
# either makes the ratio inf/NaN), then recompute the length-to-width ratio.
# The explicit .copy() makes diamonds2 an independent frame, which avoids
# pandas' SettingWithCopyWarning on the column assignment below.
has_valid_dims = (diamonds['width'] > 0) & (diamonds['length'] > 0)
diamonds2 = diamonds[has_valid_dims].copy()
shape2 = diamonds2['length'] / diamonds2['width']

# Attach the recomputed ratio (shape2, not the ratio computed earlier on the
# unfiltered frame) as the last column. Assigning by name appends at the end
# without relying on a hard-coded column position.
diamonds2['shape'] = shape2
diamonds2['shape'].describe()

Unnamed: 0,shape
count,53932.0
mean,1.000855
std,0.036366
min,0.618976
25%,0.993103
50%,1.004274
75%,1.007429
max,7.280593


In [8]:
# Show the rows whose length-to-width ratio exceeds 1.1
is_elongated = diamonds2['shape'] > 1.1
diamonds2[is_elongated]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,width,length,height,clarity2,shape
24067,2.0,Premium,H,SI2,58.9,57.0,12210,8.09,58.9,8.06,Slightly Included,7.280593
34282,0.39,Premium,H,SI2,61.2,58.0,468,4.51,6.02,4.44,Slightly Included,1.334812
48832,0.53,Ideal,F,VVS2,62.7,56.0,2030,5.16,6.2,3.25,Very Very Slightly Included,1.20155
49189,0.51,Ideal,E,VS1,61.8,55.0,2075,5.15,31.8,5.12,Very Slightly Included,6.174757
