In [1]:
# Add our dependencies
import csv
import os
import pandas as pd

In [2]:
# Load the diamonds dataset CSV into a DataFrame and preview its first rows
raw_data_path = 'data/diamonds_dataset.csv'
diamonds_df_original = pd.read_csv(raw_data_path)
diamonds_df_original.head()

Unnamed: 0,id,url,shape,price,carat,cut,color,clarity,report,type,date_fetched
0,10086429,https://www.brilliantearth.com//loose-diamonds...,Round,400,0.3,Very Good,J,SI2,GIA,natural,2020-11-29 12-26 PM
1,10016334,https://www.brilliantearth.com//loose-diamonds...,Emerald,400,0.31,Ideal,I,SI1,GIA,natural,2020-11-29 12-26 PM
2,9947216,https://www.brilliantearth.com//loose-diamonds...,Emerald,400,0.3,Ideal,I,VS2,GIA,natural,2020-11-29 12-26 PM
3,10083437,https://www.brilliantearth.com//loose-diamonds...,Round,400,0.3,Ideal,I,SI2,GIA,natural,2020-11-29 12-26 PM
4,9946136,https://www.brilliantearth.com//loose-diamonds...,Emerald,400,0.3,Ideal,I,SI1,GIA,natural,2020-11-29 12-26 PM


In [3]:
# Drop the 'date_fetched' and 'url' columns.
# DataFrame.drop returns a new frame, so diamonds_df_original keeps every
# column and this cell can be re-run without raising a KeyError.
diamonds_df = diamonds_df_original.drop(columns=["date_fetched", "url"])

In [4]:
# Make 'id' the index.
# The guard keeps the cell idempotent: once 'id' is already the index it is no
# longer a column, and calling set_index('id') again would raise a KeyError.
# Rebinding (instead of inplace=True) leaves any other reference to the frame untouched.
if diamonds_df.index.name != 'id':
    diamonds_df = diamonds_df.set_index('id')
diamonds_df.head()

Unnamed: 0_level_0,shape,price,carat,cut,color,clarity,report,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10086429,Round,400,0.3,Very Good,J,SI2,GIA,natural
10016334,Emerald,400,0.31,Ideal,I,SI1,GIA,natural
9947216,Emerald,400,0.3,Ideal,I,VS2,GIA,natural
10083437,Round,400,0.3,Ideal,I,SI2,GIA,natural
9946136,Emerald,400,0.3,Ideal,I,SI1,GIA,natural


In [5]:
# Save the trimmed frame (categorical labels still as text) for the visuals;
# to_csv writes the index as the first column by default.
diamonds_df.to_csv(path_or_buf="data/clean_data_visuals.csv")

In [6]:
# Count the distinct non-null values in each column (column-wise, NaN excluded)
diamonds_df.nunique(axis=0, dropna=True)

shape        10
price      3144
carat       522
cut           5
color         7
clarity       8
report        4
type          2
dtype: int64

In [7]:
# Distinct labels in the 'shape' column.
# Label-based selection is required here: diamonds_df.shape is the frame's
# (rows, columns) tuple, not this column.
diamonds_df.loc[:, "shape"].unique()

array(['Round', 'Emerald', 'Marquise', 'Princess', 'Pear', 'Heart',
       'Oval', 'Cushion', 'Asscher', 'Radiant'], dtype=object)

In [8]:
# Distinct labels in the 'cut' column, in order of first appearance
diamonds_df.loc[:, "cut"].unique()

array(['Very Good', 'Ideal', 'Super Ideal', 'Good', 'Fair'], dtype=object)

In [9]:
# Distinct labels in the 'color' column, in order of first appearance
diamonds_df.loc[:, "color"].unique()

array(['J', 'I', 'E', 'F', 'G', 'H', 'D'], dtype=object)

In [10]:
# Distinct labels in the 'clarity' column, in order of first appearance
diamonds_df.loc[:, "clarity"].unique()

array(['SI2', 'SI1', 'VS2', 'VVS1', 'VS1', 'VVS2', 'IF', 'FL'],
      dtype=object)

In [11]:
# Distinct labels in the 'report' column, in order of first appearance
diamonds_df.loc[:, "report"].unique()

array(['GIA', 'HRD', 'IGI', 'GCAL'], dtype=object)

In [12]:
# Distinct labels in the 'type' column, in order of first appearance
diamonds_df.loc[:, "type"].unique()

array(['natural', 'lab'], dtype=object)

In [13]:
# Start the cleaning stage from an independent copy.
# A plain assignment would only create a second name for the same object, so
# any later in-place change to diamonds_cleaning_df would also alter diamonds_df.
diamonds_cleaning_df = diamonds_df.copy()

In [14]:
# Integer codes for every categorical column, keyed by column name.
# Each list gives a column's labels in code order; the first label gets code 1.
# Note that 'type' is encoded here as well; it is converted back to text
# labels further down for the logistic regression export.
# NOTE(review): 'shape' and 'report' receive consecutive integers too, which
# implies an ordering between their labels - confirm that is intended for the model.
category_levels = {
    "shape": ["Asscher", "Cushion", "Emerald", "Heart", "Marquise",
              "Oval", "Pear", "Princess", "Radiant", "Round"],
    "cut": ["Fair", "Good", "Very Good", "Ideal", "Super Ideal"],
    "color": ["J", "I", "H", "G", "F", "E", "D"],
    "clarity": ["SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF", "FL"],
    "report": ["GIA", "HRD", "IGI", "GCAL"],
    "type": ["lab", "natural"],
}
cleanup_nums = {
    column: {label: code for code, label in enumerate(levels, start=1)}
    for column, levels in category_levels.items()
}

In [15]:
# Replace each categorical column with its integer codes from cleanup_nums.
# Series.map is used per column instead of DataFrame.replace: replace relies on
# silently downcasting the object columns to integers, which recent pandas
# versions deprecate. With map, a label that is missing from the mapping (or a
# missing value) becomes NaN, and the explicit int64 cast then raises instead
# of letting un-encoded values slip into the exported file.
diamonds_clean_df = diamonds_cleaning_df.assign(
    **{
        column: diamonds_cleaning_df[column].map(codes).astype("int64")
        for column, codes in cleanup_nums.items()
    }
)
diamonds_clean_df.head()

Unnamed: 0_level_0,shape,price,carat,cut,color,clarity,report,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10086429,10,400,0.3,3,1,1,1,2
10016334,3,400,0.31,4,2,2,1,2
9947216,3,400,0.3,4,2,3,1,2
10083437,10,400,0.3,4,2,1,1,2
9946136,3,400,0.3,4,2,2,1,2


In [16]:
# Check the data types after encoding: every column should now be numeric.
# An 'object' dtype here would mean a column still holds non-numeric values.
diamonds_clean_df.dtypes

shape        int64
price        int64
carat      float64
cut          int64
color        int64
clarity      int64
report       int64
type         int64
dtype: object

In [17]:
# Save the fully numeric frame used to build the multiple linear regression model;
# to_csv writes the index as the first column by default.
diamonds_clean_df.to_csv(path_or_buf="data/clean_data.csv")

In [18]:
# Mapping that turns the numeric 'type' codes back into their text labels for
# the logistic regression export. It must stay the inverse of cleanup_nums["type"].
type_labels = {1: "lab", 2: "natural"}
cleanup_type = {"type": type_labels}

In [19]:
# Restore the text labels in 'type' only; the nested mapping is keyed by column
# name, so the integer codes in the other columns are left untouched.
diamonds_new_clean_df = diamonds_clean_df.replace(to_replace=cleanup_type)
diamonds_new_clean_df.head()

Unnamed: 0_level_0,shape,price,carat,cut,color,clarity,report,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10086429,10,400,0.3,3,1,1,1,natural
10016334,3,400,0.31,4,2,2,1,natural
9947216,3,400,0.3,4,2,3,1,natural
10083437,10,400,0.3,4,2,1,1,natural
9946136,3,400,0.3,4,2,2,1,natural


In [20]:
# Check the data types: 'type' should now be 'object' (text labels) while the
# remaining columns stay numeric.
diamonds_new_clean_df.dtypes

shape        int64
price        int64
carat      float64
cut          int64
color        int64
clarity      int64
report       int64
type        object
dtype: object

In [21]:
# Save the frame with text 'type' labels, used to build the logistic regression model;
# to_csv writes the index as the first column by default.
diamonds_new_clean_df.to_csv(path_or_buf="data/new_clean_data.csv")