In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np

In [2]:
# Importing CSV file into DataFrame
df_chocolate = pd.read_csv("flavors_of_cacao.csv")
print(df_chocolate.head())

  Company \n(Maker-if known) Specific Bean Origin\nor Bar Name   REF  \
0                   A. Morin                       Agua Grande  1876   
1                   A. Morin                             Kpime  1676   
2                   A. Morin                            Atsane  1676   
3                   A. Morin                             Akata  1680   
4                   A. Morin                            Quilla  1704   

   Review\nDate Cocoa\nPercent Company\nLocation  Rating Bean\nType  \
0          2016            63%            France    3.75              
1          2015            70%            France    2.75              
2          2015            70%            France    3.00              
3          2015            70%            France    3.50              
4          2015            70%            France    3.50              

  Broad Bean\nOrigin  
0           Sao Tome  
1               Togo  
2               Togo  
3               Togo  
4               Peru  


In [3]:
# Cleaning the data
# Renaming columns
df_chocolate.columns = ["Company", "Specific Bean Origin/Bar Name", "REF", "Review Date", "Cocoa Percentage", "Location", "Rating", "Bean Type", "Broad Bean Origin"]
df_chocolate.head()

Unnamed: 0,Company,Specific Bean Origin/Bar Name,REF,Review Date,Cocoa Percentage,Location,Rating,Bean Type,Broad Bean Origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [4]:
# Cleaning the data
# Checking for missing data
df_chocolate.isna().sum()

Company                          0
Specific Bean Origin/Bar Name    0
REF                              0
Review Date                      0
Cocoa Percentage                 0
Location                         0
Rating                           0
Bean Type                        1
Broad Bean Origin                1
dtype: int64

In [5]:
# Looks like there is minimal missing data. According to the above, there is 1 missing values in both the Bean Type column and the Broad Bean Origin Frame column. 
# Print out topline information (head) of both columns to explore this further. 
df_chocolate[['Bean Type', 'Broad Bean Origin']].head()

Unnamed: 0,Bean Type,Broad Bean Origin
0,,Sao Tome
1,,Togo
2,,Togo
3,,Togo
4,,Peru


In [6]:
# Clearly there is more than just 1 missing value in Bean Type
# Work out how many values are actually missing
df_chocolate['Bean Type'].value_counts()

                            887
Trinitario                  419
Criollo                     153
Forastero                    87
Forastero (Nacional)         52
Blend                        41
Criollo, Trinitario          39
Forastero (Arriba)           37
Criollo (Porcelana)          10
Trinitario, Criollo           9
Forastero (Parazinho)         8
Forastero (Arriba) ASS        6
Matina                        3
Nacional (Arriba)             3
Beniano                       3
EET                           3
Criollo (Ocumare 61)          2
Criollo, Forastero            2
Trinitario, Forastero         2
Trinitario (85% Criollo)      2
Amazon mix                    2
Nacional                      2
Criollo (Amarru)              2
Forastero (Catongo)           2
Amazon, ICS                   2
Forastero (Amelonado)         1
Trinitario (Scavina)          1
CCN51                         1
Trinitario (Amelonado)        1
Criollo (Ocumare 67)          1
Criollo (Ocumare 77)          1
Foraster

In [8]:
# According to the above there are 887 missing or blank values in the Bean Type column. 
# Get an understanding blanks in 'Bean Type' column
list(df_chocolate["Bean Type"][0:3])

['\xa0', '\xa0', '\xa0']

In [9]:
# Replace missing data (non-breaking space (\xa0)) with Unknown
# Use function to replace \xa0 

def unknowns(x):
    if x == "\xa0": 
        return "Unknown"
    
df_chocolate['Bean Type'] = df_chocolate['Bean Type'].apply(unknowns)
df_chocolate.head()

Unnamed: 0,Company,Specific Bean Origin/Bar Name,REF,Review Date,Cocoa Percentage,Location,Rating,Bean Type,Broad Bean Origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,Unknown,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,Unknown,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,Unknown,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,Unknown,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,Unknown,Peru


In [10]:
# Understanding Nulls in df_chocolate
df_chocolate.isnull().sum()

Company                            0
Specific Bean Origin/Bar Name      0
REF                                0
Review Date                        0
Cocoa Percentage                   0
Location                           0
Rating                             0
Bean Type                        908
Broad Bean Origin                  1
dtype: int64

In [11]:
# Getting percentage of Bean Type that is missing 
# First get number of missing values in Bean Type 
missing_values_BT = df_chocolate['Bean Type'].isnull().sum()
missing_values_BT

908

In [12]:
total_rows = df_chocolate.shape[0]

In [13]:
missing_BT = (missing_values_BT / total_rows) 
print(missing_BT)

0.5058495821727019


In [14]:
percentage_missing_BT = "{:.0%}".format(missing_BT)
print(percentage_missing_BT)

51%


In [15]:
# Remove 'Bean Type' column due to high percentage of missing data
df_chocolate.drop(["Bean Type"], axis=1)

Unnamed: 0,Company,Specific Bean Origin/Bar Name,REF,Review Date,Cocoa Percentage,Location,Rating,Broad Bean Origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.00,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.50,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.50,Peru
...,...,...,...,...,...,...,...,...
1790,Zotter,Peru,647,2011,70%,Austria,3.75,Peru
1791,Zotter,Congo,749,2011,65%,Austria,3.00,Congo
1792,Zotter,Kerala State,749,2011,65%,Austria,3.50,India
1793,Zotter,Kerala State,781,2011,62%,Austria,3.25,India
