In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np

In [2]:
# Importing CSV file into DataFrame
df_chocolate = pd.read_csv("flavors_of_cacao.csv")
print(df_chocolate.head())

  Company \n(Maker-if known) Specific Bean Origin\nor Bar Name   REF  \
0                   A. Morin                       Agua Grande  1876   
1                   A. Morin                             Kpime  1676   
2                   A. Morin                            Atsane  1676   
3                   A. Morin                             Akata  1680   
4                   A. Morin                            Quilla  1704   

   Review\nDate Cocoa\nPercent Company\nLocation  Rating Bean\nType  \
0          2016            63%            France    3.75              
1          2015            70%            France    2.75              
2          2015            70%            France    3.00              
3          2015            70%            France    3.50              
4          2015            70%            France    3.50              

  Broad Bean\nOrigin  
0           Sao Tome  
1               Togo  
2               Togo  
3               Togo  
4               Peru  


In [3]:
# Cleaning the data
# Checking for missing data
df_chocolate.isna().sum()

Company \n(Maker-if known)           0
Specific Bean Origin\nor Bar Name    0
REF                                  0
Review\nDate                         0
Cocoa\nPercent                       0
Company\nLocation                    0
Rating                               0
Bean\nType                           1
Broad Bean\nOrigin                   1
dtype: int64

In [4]:
# Renaming columns
df_chocolate.columns = ["Company", "Specific Bean Origin/Bar Name", "REF", "Review Date", "Cocoa Percentage", "Location", "Rating", "Bean Type", "Broad Bean Origin"]
df_chocolate.head()

Unnamed: 0,Company,Specific Bean Origin/Bar Name,REF,Review Date,Cocoa Percentage,Location,Rating,Bean Type,Broad Bean Origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [5]:
# Understanding blanks in 'Bean Type' column
list(df_chocolate["Bean Type"][0:3])

['\xa0', '\xa0', '\xa0']

In [17]:
# Removing non-breaking space in 'Bean Type' column
df_chocolate['Bean Type'].str.strip("\xa0")
print(df_chocolate.head())

    Company Specific Bean Origin/Bar Name   REF  Review Date Cocoa Percentage  \
0  A. Morin                   Agua Grande  1876         2016              63%   
1  A. Morin                         Kpime  1676         2015              70%   
2  A. Morin                        Atsane  1676         2015              70%   
3  A. Morin                         Akata  1680         2015              70%   
4  A. Morin                        Quilla  1704         2015              70%   

  Location  Rating Bean Type Broad Bean Origin  
0   France    3.75       NaN          Sao Tome  
1   France    2.75       NaN              Togo  
2   France    3.00       NaN              Togo  
3   France    3.50       NaN              Togo  
4   France    3.50       NaN              Peru  


In [18]:
# Understanding Nulls in df_chocolate
df_chocolate.isnull().sum()

Company                            0
Specific Bean Origin/Bar Name      0
REF                                0
Review Date                        0
Cocoa Percentage                   0
Location                           0
Rating                             0
Bean Type                        888
Broad Bean Origin                 74
dtype: int64

In [21]:
# Getting percentage of Bean Type that is missing 
missing_values_BT = df_chocolate['Bean Type'].isnull().sum()
missing_values_BT

888

In [22]:
total_rows = df_chocolate.shape[0]

In [35]:
missing_BT = (missing_values_BT / total_rows) 
print(missing_BT)

0.4947075208913649


In [36]:
percentage_missing_BT = "{:.0%}".format(missing_BT)
print(percentage_missing_BT)

49%


In [37]:
# Remove 'Bean Type' column due to high percentage of missing data
df_chocolate.drop(["Bean Type"], axis=1)

Unnamed: 0,Company,Specific Bean Origin/Bar Name,REF,Review Date,Cocoa Percentage,Location,Rating,Broad Bean Origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.00,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.50,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.50,Peru
...,...,...,...,...,...,...,...,...
1790,Zotter,Peru,647,2011,70%,Austria,3.75,Peru
1791,Zotter,Congo,749,2011,65%,Austria,3.00,Congo
1792,Zotter,Kerala State,749,2011,65%,Austria,3.50,India
1793,Zotter,Kerala State,781,2011,62%,Austria,3.25,India


In [38]:
# Indexing the DataFrame
df_chocolate.index

RangeIndex(start=0, stop=1795, step=1)

In [39]:
df_chocolate.set_index('Company').head()

Unnamed: 0_level_0,Specific Bean Origin/Bar Name,REF,Review Date,Cocoa Percentage,Location,Rating,Bean Type,Broad Bean Origin
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru
