## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os

## Importing Data

In [2]:
# Establishing a path
path = r'/Users/Kate/Documents/Chocolate Bar Ratings Analysis'

In [7]:
# Importing chcolate bar data
df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'flavors_of_cacao.csv'))

In [8]:
df.head()

Unnamed: 0,Company \n(Maker-if known),Specific Bean Origin\nor Bar Name,REF,Review\nDate,Cocoa\nPercent,Company\nLocation,Rating,Bean\nType,Broad Bean\nOrigin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


### Re-naming Columns

In [29]:
# Rename the 'Company \n(Maker-if known)' column to 'Company'
df = df.rename(columns={'Company \n(Maker-if known)': 'Company'})

In [12]:
# Rename the 'Specific Bean Origin\nor Bar Name' column to 'Specific Origin'
df = df.rename(columns={'Specific Bean Origin\nor Bar Name': 'Specific Origin'})

In [13]:
# Rename the 'Review\nDate' column to 'Review Date'
df = df.rename(columns={'Review\nDate': 'Review Date'})

In [14]:
# Rename the 'Cocoa\nPercent' column to 'Cocoa Percent'
df = df.rename(columns={'Cocoa\nPercent': 'Cocoa Percent'})

In [15]:
# Rename the 'Company\nLocation' column to 'Company Location'
df = df.rename(columns={'Company\nLocation': 'Company Location'})

In [16]:
# Rename the 'Bean\nType' column to 'Bean Variety'
df = df.rename(columns={'Bean\nType': 'Bean Variety'})

In [17]:
# Rename the 'Broad Bean\nOrigin' column to 'Broad Origin'
df = df.rename(columns={'Broad Bean\nOrigin': 'Broad Origin'})

In [52]:
# Changing names of columns
df.columns = ['Company', 'Specific Origin', 'Review Date', 'Cocoa Percent', 'Company Location', 'Rating', 'Bean Variety', 'Broad Origin']

In [53]:
df.head()

Unnamed: 0,Company,Specific Origin,Review Date,Cocoa Percent,Company Location,Rating,Bean Variety,Broad Origin
0,A. Morin,Agua Grande,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,2015,70%,France,3.5,,Peru


### Dropping unnecessary columns

In [31]:
# Dropping the "REF" column
df.drop('REF', axis=1, inplace=True)

In [60]:
# Dropping the "Specific Origin" column
df.drop('Specific Origin', axis=1, inplace=True)

In [61]:
df.head()

Unnamed: 0,Company,Review Date,Cocoa Percent,Company Location,Rating,Bean Variety,Broad Origin
0,A. Morin,2016,63%,France,3.75,,Sao Tome
1,A. Morin,2015,70%,France,2.75,,Togo
2,A. Morin,2015,70%,France,3.0,,Togo
3,A. Morin,2015,70%,France,3.5,,Togo
4,A. Morin,2015,70%,France,3.5,,Peru


### Checking for missing values

In [46]:
# Check for missing values
missing_values = df.isna().sum()

In [47]:
print(missing_values)

Company             0
Specific Origin     0
Date                0
Cocoa Percent       0
Company Location    0
Rating              0
Bean Variety        1
Broad Origin        1
dtype: int64


## Descriptive Statistics

In [40]:
df.shape

(1795, 8)

In [49]:
df.describe()

Unnamed: 0,Date,Rating
count,1795.0,1795.0
mean,2012.325348,3.185933
std,2.92721,0.478062
min,2006.0,1.0
25%,2010.0,2.875
50%,2013.0,3.25
75%,2015.0,3.5
max,2017.0,5.0


In [50]:
df.info

<bound method DataFrame.info of        Company     Specific Origin  Date Cocoa Percent Company Location  \
0     A. Morin         Agua Grande  2016           63%           France   
1     A. Morin               Kpime  2015           70%           France   
2     A. Morin              Atsane  2015           70%           France   
3     A. Morin               Akata  2015           70%           France   
4     A. Morin              Quilla  2015           70%           France   
...        ...                 ...   ...           ...              ...   
1790    Zotter                Peru  2011           70%          Austria   
1791    Zotter               Congo  2011           65%          Austria   
1792    Zotter        Kerala State  2011           65%          Austria   
1793    Zotter        Kerala State  2011           62%          Austria   
1794    Zotter  Brazil, Mitzi Blue  2010           65%          Austria   

      Rating Bean Variety Broad Origin  
0       3.75              

In [51]:
df.dtypes

Company              object
Specific Origin      object
Date                  int64
Cocoa Percent        object
Company Location     object
Rating              float64
Bean Variety         object
Broad Origin         object
dtype: object

In [55]:
df['Company Location'].value_counts()

U.S.A.               764
France               156
Canada               125
U.K.                  96
Italy                 63
Ecuador               54
Australia             49
Belgium               40
Switzerland           38
Germany               35
Austria               26
Spain                 25
Colombia              23
Hungary               22
Venezuela             20
Japan                 17
Brazil                17
Peru                  17
Madagascar            17
New Zealand           17
Denmark               15
Vietnam               11
Scotland              10
Guatemala             10
Israel                 9
Costa Rica             9
Argentina              9
Poland                 8
Lithuania              6
Honduras               6
South Korea            5
Nicaragua              5
Sweden                 5
Domincan Republic      5
Ireland                4
Netherlands            4
Fiji                   4
Sao Tome               4
Puerto Rico            4
Mexico                 4


There are two countries that have been misspelled, "Eucador" should be Ecuador, and "Niacragua" should be Nicaragua

In [57]:
# Replace the misspelled country name with the correct name
df["Company Location"] = df["Company Location"].replace("Eucador", "Ecuador")

In [58]:
# Replace the misspelled country name with the correct name
df["Company Location"] = df["Company Location"].replace("Niacragua", "Nicaragua")

In [59]:
# Checking
df['Company Location'].value_counts()

U.S.A.               764
France               156
Canada               125
U.K.                  96
Italy                 63
Ecuador               55
Australia             49
Belgium               40
Switzerland           38
Germany               35
Austria               26
Spain                 25
Colombia              23
Hungary               22
Venezuela             20
Brazil                17
Madagascar            17
Peru                  17
Japan                 17
New Zealand           17
Denmark               15
Vietnam               11
Scotland              10
Guatemala             10
Israel                 9
Argentina              9
Costa Rica             9
Poland                 8
Lithuania              6
Nicaragua              6
Honduras               6
Domincan Republic      5
South Korea            5
Sweden                 5
Netherlands            4
Fiji                   4
Mexico                 4
Amsterdam              4
Ireland                4
Puerto Rico            4
