## Clean up script for arabica_ratings_raw. 
Decisions around data management will be recorded here.


In [1]:
# Dependencies
import pandas as pd
import numpy as np
import re

In [2]:
# Read the raw ratings CSV into a pandas DataFrame. The path is relative, so the
# file must sit in the notebook's working directory. The later frames in this
# notebook (df1, df2, df3) are derived from `df`.
df = pd.read_csv('arabica_ratings_raw.csv')

In [3]:
# Preview the first rows of the raw frame to sanity-check the load.
df.head()

Unnamed: 0.1,Unnamed: 0,quality_score,view_certificate_1,view_certificate_2,Cupping Protocol and Descriptors,View Green Analysis Details,Request a Sample,Species,Owner,Country of Origin,...,Quakers,Color,Category Two Defects,NA.3,Expiration,Certification Body,Certification Address,Certification Contact,Unnamed: 51,Notes
0,0,90.58,,,,,,Arabica,metad plc,Ethiopia,...,0.0,Green,0 full defects,,"April 3rd, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,
1,1,89.92,,,,,,Arabica,metad plc,Ethiopia,...,0.0,Green,1 full defects,,"April 3rd, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,
2,2,89.75,,,,,,Arabica,Grounds for Health Admin,Guatemala,...,0.0,,0 full defects,,"May 31st, 2011",Specialty Coffee Association,"117 W 4th St, Suite 300 Santa Ana, CA 92701",Chris Buck - (562) 624-4100,,
3,3,89.0,,,,,,Arabica,Yidnekachew Dabessa,Ethiopia,...,0.0,Green,2 full defects,,"March 25th, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,
4,4,88.83,,,,,,Arabica,metad plc,Ethiopia,...,0.0,Green,2 full defects,,"April 3rd, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,


In [4]:
# Show the dtype pandas inferred for every raw column (guides the clean-up below).
df.dtypes

Unnamed: 0                            int64
quality_score                       float64
view_certificate_1                  float64
view_certificate_2                  float64
Cupping Protocol and Descriptors    float64
View Green Analysis Details         float64
Request a Sample                    float64
Species                              object
Owner                                object
Country of Origin                    object
Farm Name                            object
Lot Number                           object
Mill                                 object
ICO Number                           object
Company                              object
Altitude                             object
Region                               object
Producer                             object
Number of Bags                        int64
Bag Weight                           object
In-Country Partner                   object
Harvest Year                         object
Grading Date                    

In [5]:
## Drop the certificate / cupping-protocol / green-analysis / sample-request columns: they carry no data.
# NOTE(review): NA.1, NA.2 and NA.3 are dropped in the same step -- confirm they are
# equally empty, since only the first five columns were recorded as "No data".
unused_columns = [
    'view_certificate_1',
    'view_certificate_2',
    'Cupping Protocol and Descriptors',
    'View Green Analysis Details',
    'Request a Sample',
    'NA.1',
    'NA.2',
    'NA.3',
]
df1 = df.drop(columns=unused_columns)

In [6]:
## Normalise missing values to blanks:
##   1. remove every 'n/a' occurrence from string cells (regex replace, so substrings match too)
##   2. turn NaN into the empty string
# Later cells rely on this: they run `"kg" in value` style checks, which need strings.
# Blank counts per column are not computed in this cell.
df1 = (
    df1
    .replace('n/a', "", regex=True)
    .replace(np.nan, "", regex=True)
)


In [7]:
# Remove the leftover 'Unnamed: 0' column.
# Not re-runnable on its own: a second run raises KeyError because the column is gone.
df1 = df1.drop('Unnamed: 0', axis='columns')

In [8]:
# Name the row index 'ID'.
df1.index.name = 'ID'


In [9]:
## quality score convert to int
# NOTE(review): astype() returns a new frame and the result is not assigned, so this
# cell only DISPLAYS what the dtypes would be -- df1['quality_score'] itself is left
# unchanged. Casting to int32 would also discard the fractional part of the scores;
# confirm that is really wanted before assigning the result back.
df1.astype({'quality_score':'int32'}).dtypes

quality_score              int32
Species                   object
Owner                     object
Country of Origin         object
Farm Name                 object
Lot Number                object
Mill                      object
ICO Number                object
Company                   object
Altitude                  object
Region                    object
Producer                  object
Number of Bags             int64
Bag Weight                object
In-Country Partner        object
Harvest Year              object
Grading Date              object
Owner.1                   object
Variety                   object
Status                    object
Processing Method         object
NA                        object
Aroma                    float64
Flavor                   float64
Aftertaste               float64
Acidity                  float64
Body                     float64
Balance                  float64
Uniformity               float64
Clean Cup                float64
Sweetness 

In [10]:
# Two independent working copies of the cleaned frame. In the cells shown, df2 is the
# frame that gets converted and exported, while df3 is used to locate the rows whose
# Bag Weight is expressed in lbs.
df2, df3 = df1.copy(), df1.copy()

In [11]:
## Blank out Bag Weight entries that mention BOTH kg and lbs, so that a later scan of
## df3 for "lbs" does not pick up these ambiguous rows.
# The column is addressed by label instead of the hard-coded position 13, which would
# silently hit a different column if the column order ever changed.
weights = df3['Bag Weight']
has_both_units = (
    weights.str.contains('kg', regex=False, na=False)
    & weights.str.contains('lbs', regex=False, na=False)
)
df3.loc[has_both_units, 'Bag Weight'] = ""

In [12]:
## Row positions (0-based) of the Bag Weight cells expressed in lbs, kept for the kg conversion later
idx_lst = [pos for pos, weight in enumerate(df3['Bag Weight']) if "lbs" in weight]
# How many lbs rows were found.
counter = len(idx_lst)

In [13]:
# Replace Bag Weight entries that mention BOTH kg and lbs with the sentinel 0: the unit
# is ambiguous, and 0 still survives the later integer cast of the column.
# The column is addressed by label instead of the hard-coded position 13, which would
# silently hit a different column if the column order ever changed.
weights = df2['Bag Weight']
has_both_units = (
    weights.str.contains('kg', regex=False, na=False)
    & weights.str.contains('lbs', regex=False, na=False)
)
df2.loc[has_both_units, 'Bag Weight'] = 0

In [14]:
## Bag Weight: strip the unit letters so that only the number is left in each cell.
# Cells already holding the sentinel 0 (entries that listed both kg and lbs) are not
# strings and are skipped untouched.
bag_weight_pos = df2.columns.get_loc('Bag Weight')

# Iterate over a snapshot of the values so the column is not mutated while being walked.
for position, raw in enumerate(df2['Bag Weight'].tolist()):
    if not isinstance(raw, str):
        continue
    if "kg" in raw or "lbs" in raw:
        # `flags` must be passed by keyword: the fourth positional argument of re.sub
        # is `count`, so a positional re.I would limit the substitution to two matches
        # and leave the pattern case-sensitive.
        df2.iloc[position, bag_weight_pos] = re.sub(r'[a-z]+', '', raw, flags=re.I)


In [15]:
# Type conversion: cast the Bag Weight column to integers.
# Raises ValueError if any cell still holds text that int() cannot parse (e.g. a blank).
df2 = df2.astype({'Bag Weight': int})

In [16]:
## convert lbs to kg
# Kilograms per pound.
LBS_TO_KG = .453592

# Cast to float BEFORE writing the converted values: assigning floats item by item into
# an integer column relies on implicit upcasting, which recent pandas versions deprecate.
# The column therefore always ends up float64, even if idx_lst is empty.
weights_kg = df2['Bag Weight'].astype(float)
# idx_lst holds the 0-based row positions of the entries that were given in lbs.
weights_kg.iloc[idx_lst] = weights_kg.iloc[idx_lst] * LBS_TO_KG
df2['Bag Weight'] = weights_kg
# NOTE: not idempotent -- running this cell twice applies the conversion factor twice.



In [17]:
# Write the current state of df2 to 'testing.csv' in the working directory
# (presumably a scratch file for checking the Bag Weight conversion -- confirm).
df2.to_csv('testing.csv')

In [18]:
# Display the dtypes of df2 to check the result of the conversions above.
df2.dtypes

quality_score            float64
Species                   object
Owner                     object
Country of Origin         object
Farm Name                 object
Lot Number                object
Mill                      object
ICO Number                object
Company                   object
Altitude                  object
Region                    object
Producer                  object
Number of Bags             int64
Bag Weight               float64
In-Country Partner        object
Harvest Year              object
Grading Date              object
Owner.1                   object
Variety                   object
Status                    object
Processing Method         object
NA                        object
Aroma                    float64
Flavor                   float64
Aftertaste               float64
Acidity                  float64
Body                     float64
Balance                  float64
Uniformity               float64
Clean Cup                float64
Sweetness 

## Altitude String Observations

### Observed
1. Since this data was collected from multilingual participants, the unit abbreviations differ.
    - MSNM: Spanish — meters above sea level
    - MASL: English — meters above sea level
    - F: English — feet
    - 公尺: Chinese — meter
2. Some values are ranges rather than single numbers (e.g. 640m-1400m).
    - This is difficult because it requires making a decision about the data. Maybe we set the data for ranges in separate columns.
    
### Conclusions
The majority of the data is in meters, so it will be our goal to convert all values to meters, therefore:
1. First, the data must be cleaned of any debris (non-values).
2. Second, the data must be sorted to find values that are not meters and convert them.
3. Third, the data must be split by lower and upper ranges.

In [39]:
##Fix altitude data

# Helper: does a string contain any digit?
def hasNumbers(inputString):
    """Return True if at least one character of `inputString` satisfies str.isdigit().

    An empty string yields False.
    """
    for char in inputString:
        if char.isdigit():
            return True
    return False

# Assign alt column to variable (as series)
alt = df2['Altitude']

# Tallies reported in the summary printed at the end.
count_y = 0       # entries carrying a meter marker
count_y_num = 0   # entries without a meter marker that have digits and an 'f' (feet)
count_n = 0       # every entry without a meter marker (feet entries included),
                  # so count_y + count_n equals the number of rows

# Raw altitude strings bucketed by category.
str_meter = []
str_feet = []
str_garb = []
str_other = []

for elt in alt:
    # Lower-case once so every marker test is case-insensitive.
    lowered = elt.lower()
    # The meter test runs first: any entry containing an 'm' (m, msnm, masl,
    # "meters ...") or the Chinese 公尺 is treated as meters.
    if 'm' in lowered or '公尺' in lowered:
        print(f'yes at {elt}')
        count_y += 1
        str_meter.append(elt)
    else:
        count_n += 1
        if hasNumbers(elt) and 'f' in lowered:
            print(f'#--f no at {elt}')
            count_y_num += 1
            str_feet.append(elt)
        else:
            print(f'### no at {elt}')
            # NOTE(review): str_other / str_garb were declared but never filled. Here
            # entries with digits but no recognised unit go to str_other and entries
            # without any digit (blanks, text) go to str_garb -- confirm this split
            # matches the intended meaning of the two lists.
            if hasNumbers(elt):
                str_other.append(elt)
            else:
                str_garb.append(elt)
print(f"""
M is present {count_y} times.
Number present {count_y_num} times.
M is NOT present {count_n} times.
""")

### no at 1950-2200
### no at 1950-2200
yes at 1600 - 1800 m
### no at 1800-2200
### no at 1950-2200
### no at 
### no at 
### no at 1570-1700
### no at 1570-1700
### no at 1795-1850
### no at 1855-1955
yes at Meters above sea level: 1.872
yes at Meters above sea level: 1.943
#--f no at 2000 ft
### no at 1570-1700
yes at Meters above sea level: 2.080
yes at 1200-1800m
### no at 
### no at 1450
yes at 1700-2000m
yes at Meters above sea level: 2.019
yes at 1300 MSNM
### no at 1320
yes at Meters above sea level: 2.112
### no at 
yes at 1250m
### no at 
### no at 1950
### no at 1400
### no at 1200
### no at 
### no at 
### no at 1300
### no at 1300
### no at 1750-1800
### no at 1800
### no at 
### no at 
### no at 
yes at Meters above sea level: 1.941
### no at 1300
### no at 1.2
### no at 1.2
yes at 1000 M
### no at 1754
yes at 1250m
### no at 
yes at 900-1,500m
yes at 1520m-2200m
yes at 1400-1900M
yes at 1500-2000 m
#--f no at 1400ft
### no at 
### no at 1400-1900
yes at 1800 msnm
### no

In [None]:
## harvest year convert to YYYY format for latest year