# Coffee Quality Visualizations

## Import Packages and Data

In [1]:
# Import Packages
import pandas as pd
import plotly_express as px

In [17]:
# Import Data
coffee = pd.read_excel('/Users/kellyshreeve/Desktop/Data-Sets/df_arabica_clean.xlsx', header=0, index_col=0)

In [18]:
# Print data info 
print(coffee.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207 entries, 0 to 206
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     207 non-null    int64  
 1   Country of Origin      207 non-null    object 
 2   Farm Name              205 non-null    object 
 3   Lot Number             206 non-null    object 
 4   Mill                   204 non-null    object 
 5   ICO Number             75 non-null     object 
 6   Company                207 non-null    object 
 7   Altitude               206 non-null    object 
 8   Region                 205 non-null    object 
 9   Producer               206 non-null    object 
 10  Number of Bags         207 non-null    int64  
 11  Bag Weight             207 non-null    object 
 12  In-Country Partner     207 non-null    object 
 13  Harvest Year           207 non-null    object 
 14  Grading Date           207 non-null    object 
 15  Owner 

In [22]:
# Print data head
print(coffee.head(10).to_string())

   ID             Country of Origin                                                 Farm Name                               Lot Number                           Mill                         ICO Number                   Company   Altitude                Region                      Producer  Number of Bags Bag Weight                              In-Country Partner Harvest Year          Grading Date                                           Owner      Variety     Status        Processing Method  Aroma  Flavor  Aftertaste  Acidity  Body  Balance  Uniformity  Clean Cup  Sweetness  Overall  Defects  Total Cup Points  Moisture Percentage  Category One Defects  Quakers         Color  Category Two Defects            Expiration                              Certification Body                                                                                         Certification Address                                Certification Contact
0   0                      Colombia                          

### Import Data Conclusions

Most variables have complete information. Missing a small number of observations are: Farm Name, Lot Number, Mill, Altitute, Region, Producer, Variety, and Processing Method. ICO Number is missing a large number of observations and only present for 75 of the rows. Altitude, Bag Weight, and Harvest Year can be changed to integer data type, Grading Date and Expiration can be changed to datetime. Column names need to be converted to snake case.

## Prepare Data

### Column Names

In [26]:
# Convert column names to snake case
coffee.columns = coffee.columns.str.lower().str.replace(' ', '_')


print(coffee.columns)

Index(['id', 'country_of_origin', 'farm_name', 'lot_number', 'mill',
       'ico_number', 'company', 'altitude', 'region', 'producer',
       'number_of_bags', 'bag_weight', 'in-country_partner', 'harvest_year',
       'grading_date', 'owner', 'variety', 'status', 'processing_method',
       'aroma', 'flavor', 'aftertaste', 'acidity', 'body', 'balance',
       'uniformity', 'clean_cup', 'sweetness', 'overall', 'defects',
       'total_cup_points', 'moisture_percentage', 'category_one_defects',
       'quakers', 'color', 'category_two_defects', 'expiration',
       'certification_body', 'certification_address', 'certification_contact'],
      dtype='object')


### Data Types

### Missing Values

In [27]:
# Display number missing in each column
missing_values = coffee.isna().sum()

display(missing_values)

id                         0
country_of_origin          0
farm_name                  2
lot_number                 1
mill                       3
ico_number               132
company                    0
altitude                   1
region                     2
producer                   1
number_of_bags             0
bag_weight                 0
in-country_partner         0
harvest_year               0
grading_date               0
owner                      0
variety                    6
status                     0
processing_method          5
aroma                      0
flavor                     0
aftertaste                 0
acidity                    0
body                       0
balance                    0
uniformity                 0
clean_cup                  0
sweetness                  0
overall                    0
defects                    0
total_cup_points           0
moisture_percentage        0
category_one_defects       0
quakers                    0
color         

In [29]:
# Replace all missing string values with 'unkown'
str_cols = ['farm_name', 'lot_number', 'mill', 'ico_number', 'region', 'producer', 'variety', 'processing_method']

for col in str_cols:
    coffee[col] = coffee[col].fillna('unkown')

In [30]:
missing_values_new = coffee.isna().sum()

display(missing_values_new)

id                       0
country_of_origin        0
farm_name                0
lot_number               0
mill                     0
ico_number               0
company                  0
altitude                 1
region                   0
producer                 0
number_of_bags           0
bag_weight               0
in-country_partner       0
harvest_year             0
grading_date             0
owner                    0
variety                  0
status                   0
processing_method        0
aroma                    0
flavor                   0
aftertaste               0
acidity                  0
body                     0
balance                  0
uniformity               0
clean_cup                0
sweetness                0
overall                  0
defects                  0
total_cup_points         0
moisture_percentage      0
category_one_defects     0
quakers                  0
color                    0
category_two_defects     0
expiration               0
c

### Duplicates

In [33]:
# Check for full duplicates
duplicates = coffee.duplicated.sum()

print(f'The number of full duplicates is: {duplicates}')

AttributeError: 'function' object has no attribute 'sum'