## Clean up script for arabica_ratings_raw. 
Decisions around data management will be recorded here.


In [1]:
# Dependencies
import pandas as pd
import numpy as np
import re

In [2]:
# Load the raw arabica ratings export into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='arabica_ratings_raw.csv')

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,quality_score,view_certificate_1,view_certificate_2,Cupping Protocol and Descriptors,View Green Analysis Details,Request a Sample,Species,Owner,Country of Origin,...,Quakers,Color,Category Two Defects,NA.3,Expiration,Certification Body,Certification Address,Certification Contact,Unnamed: 51,Notes
0,0,90.58,,,,,,Arabica,metad plc,Ethiopia,...,0.0,Green,0 full defects,,"April 3rd, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,
1,1,89.92,,,,,,Arabica,metad plc,Ethiopia,...,0.0,Green,1 full defects,,"April 3rd, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,
2,2,89.75,,,,,,Arabica,Grounds for Health Admin,Guatemala,...,0.0,,0 full defects,,"May 31st, 2011",Specialty Coffee Association,"117 W 4th St, Suite 300 Santa Ana, CA 92701",Chris Buck - (562) 624-4100,,
3,3,89.0,,,,,,Arabica,Yidnekachew Dabessa,Ethiopia,...,0.0,Green,2 full defects,,"March 25th, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,
4,4,88.83,,,,,,Arabica,metad plc,Ethiopia,...,0.0,Green,2 full defects,,"April 3rd, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,


In [4]:
# Show the dtype pandas inferred for every raw column.
df.dtypes

Unnamed: 0                            int64
quality_score                       float64
view_certificate_1                  float64
view_certificate_2                  float64
Cupping Protocol and Descriptors    float64
View Green Analysis Details         float64
Request a Sample                    float64
Species                              object
Owner                                object
Country of Origin                    object
Farm Name                            object
Lot Number                           object
Mill                                 object
ICO Number                           object
Company                              object
Altitude                             object
Region                               object
Producer                             object
Number of Bags                        int64
Bag Weight                           object
In-Country Partner                   object
Harvest Year                         object
Grading Date                    

In [5]:
## Drop the five scraped link columns (no data, per the original note) together
## with the NA.1, NA.2 and NA.3 columns. The result is stored as df1; df is left as loaded.

df1 = df.drop(
    columns=[
        'view_certificate_1',
        'view_certificate_2',
        'Cupping Protocol and Descriptors',
        'View Green Analysis Details',
        'Request a Sample',
        'NA.1',
        'NA.2',
        'NA.3',
    ]
)

In [6]:
## Normalise missing values: 'n/a' text (matched as a regex, so also inside longer
## strings) and NaN cells both become empty strings.
## NOTE: no per-column blank count is computed in this cell.
df1 = (
    df1
    .replace('n/a', "", regex=True)
    .replace(np.nan, "", regex=True)
)


In [7]:
# Remove the leftover 'Unnamed: 0' column carried over from the CSV.
df1 = df1.drop('Unnamed: 0', axis='columns')

In [8]:
# Label the row index 'ID' (sets the name on df1's existing index object).
df1.index.name = 'ID'


In [9]:
## quality score convert to int
# NOTE(review): astype returns a new DataFrame and the result is not assigned,
# so this cell only displays what the dtypes would be; df1['quality_score']
# itself is left unchanged. Assigning the result would also drop the decimal
# part of the scores (e.g. 90.58 -> 90) — confirm whether the conversion is
# actually wanted before making it permanent.
df1.astype({'quality_score':'int32'}).dtypes

quality_score              int32
Species                   object
Owner                     object
Country of Origin         object
Farm Name                 object
Lot Number                object
Mill                      object
ICO Number                object
Company                   object
Altitude                  object
Region                    object
Producer                  object
Number of Bags             int64
Bag Weight                object
In-Country Partner        object
Harvest Year              object
Grading Date              object
Owner.1                   object
Variety                   object
Status                    object
Processing Method         object
NA                        object
Aroma                    float64
Flavor                   float64
Aftertaste               float64
Acidity                  float64
Body                     float64
Balance                  float64
Uniformity               float64
Clean Cup                float64
Sweetness 

In [10]:
# Two independent copies of df1: df2 and df3 are modified separately in the
# Bag Weight cells that follow.
df2, df3 = df1.copy(), df1.copy()

In [11]:
## Blank out Bag Weight cells in df3 that mention both kg and lbs.
# The column is addressed by label rather than by the hard-coded position 13,
# so the cell keeps working if columns are added, dropped or reordered.
bag_weight = df3['Bag Weight']
has_both_units = (
    bag_weight.str.contains('kg', regex=False, na=False)
    & bag_weight.str.contains('lbs', regex=False, na=False)
)
df3.loc[has_both_units, 'Bag Weight'] = ""

In [12]:
## Row positions of the Bag Weight cells given in lbs, kept for the kg conversion later.
# Positions (not index labels) are stored because they are used with .iloc afterwards.
idx_lst = [pos for pos, cell in enumerate(df3['Bag Weight']) if "lbs" in cell]

In [13]:
## Set Bag Weight to 0 in df2 wherever the cell mentions both kg and lbs.
# The column is addressed by label rather than by the hard-coded position 13.
bag_weight = df2['Bag Weight']
has_both_units = (
    bag_weight.str.contains('kg', regex=False, na=False)
    & bag_weight.str.contains('lbs', regex=False, na=False)
)
df2.loc[has_both_units, 'Bag Weight'] = 0

In [14]:
## Strip the unit text from Bag Weight in df2, leaving only the number (still a string).
# Cells equal to 0 are skipped untouched.
# The lbs values are NOT converted to kg here; only the letters are removed.

# Look the column position up by name instead of hard-coding 13.
bag_weight_pos = df2.columns.get_loc('Bag Weight')

for pos, cell in enumerate(df2['Bag Weight']):
    if cell == 0:
        continue
    if "kg" in cell or "lbs" in cell:
        # flags must be passed by keyword: the fourth positional argument of
        # re.sub is `count`, so a positional re.I would cap the number of
        # substitutions instead of making the match case-insensitive.
        df2.iloc[pos, bag_weight_pos] = re.sub(r'[a-z]+', '', cell, flags=re.I).strip()


In [15]:
# Type conversion: cast Bag Weight to integers. Every cell must be
# int-convertible at this point, otherwise astype raises.

df2['Bag Weight'] = df2['Bag Weight'].astype(int)

In [16]:
## convert lbs to kg
# Only the row positions collected in idx_lst are converted.
LBS_TO_KG = 0.453592  # kilograms per pound

# Look the column position up by name instead of hard-coding 13.
bag_weight_pos = df2.columns.get_loc('Bag Weight')

if idx_lst:
    # Cast the column to float first so the converted values are not written
    # one by one into an integer column (which relies on implicit upcasting).
    df2['Bag Weight'] = df2['Bag Weight'].astype(float)
    df2.iloc[idx_lst, bag_weight_pos] = (
        df2.iloc[idx_lst, bag_weight_pos].to_numpy() * LBS_TO_KG
    )



In [17]:
# Write the current state of df2 to testing.csv for inspection.
df2.to_csv(path_or_buf='testing.csv')

In [18]:
# Show the column dtypes of df2 after the Bag Weight clean-up.
df2.dtypes

quality_score            float64
Species                   object
Owner                     object
Country of Origin         object
Farm Name                 object
Lot Number                object
Mill                      object
ICO Number                object
Company                   object
Altitude                  object
Region                    object
Producer                  object
Number of Bags             int64
Bag Weight               float64
In-Country Partner        object
Harvest Year              object
Grading Date              object
Owner.1                   object
Variety                   object
Status                    object
Processing Method         object
NA                        object
Aroma                    float64
Flavor                   float64
Aftertaste               float64
Acidity                  float64
Body                     float64
Balance                  float64
Uniformity               float64
Clean Cup                float64
Sweetness 

## Altitude String Observations

### Observed
1. Since this data is collected from multilingual participants, abbreviations differ. 
    - MSNM: Spanish — meters above sea level
    - MASL: English — meters above sea level
    - F: English — feet
    - 公尺: Chinese — meter
2. There is the appearance of ranges (i.e. 640m-1400m).
    - This is difficult because it requires making a decision about the data. Maybe we set the data for ranges in separate columns.
    
### Conclusions
The majority of the data is in meters, so it will be our goal to convert all values to meters, therefore:
1. First, the data must be cleaned of any debris (non-values).
2. Second, the data must be sorted to find values that are not meters and convert them.
3. Third, the data must be split by lower and upper ranges.

### Results
After printing using a for loop and if statements, it was discovered:
<br>
 - 'm' is present 360 times. <br>
 - 'f' is present 26 times. <br>
 - Otherwise, a number is present 700 times. <br>
 - Garbage present 226 times. <br> <br>

Total count: 1312 <br>
Expected count: 1312

In [19]:
# Number of rows in df2, used later as the expected count.
len_index = df2.shape[0]

## Cleaning Altitude Code
The following code iterates over each element in 'Altitude' and checks it against some rules. Does it contain 'm' for meter, 'f' for foot, neither? Once checked, it appends a corrected string into a corresponding new column for replacement.

In [20]:
##Fix altitude data

# Helper: does a string contain at least one digit?
def hasNumbers(inputString):
    """Return True if any character of inputString is a digit, else False."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

# Assign alt column to variable (as series)
alt = df2['Altitude']

# Tallies per category; they are summed and checked against len_index below.
count_y = 0       # entries containing 'm' or '公尺' (labelled meters)
count_y_num = 0   # entries with digits but neither a meter marker nor 'f'
count_y_f = 0     # entries with digits and 'f' (labelled feet)
count_garb = 0    # entries with no digits and no meter marker

str_meter = []
str_feet = []     # raw strings that were labelled as feet
str_garb = []
str_num = []

# New column for appending
canon_col = []

# Unit-less numbers above this value are labelled feet, otherwise meters.
# NOTE(review): the reason for choosing 3937 is not recorded here — confirm.
FEET_CUTOFF = 3937


def _split_digits(text):
    """Return (kept, dropped): the digits and '-' of text, and every other character."""
    kept, dropped = [], []
    for char in text:
        (kept if char.isdigit() or char == "-" else dropped).append(char)
    return "".join(kept), "".join(dropped)


def _log_start(text):
    """Print the banner announcing which raw string is being converted."""
    print(f"\n---(String Conversion)---\nWorking on [{text}]\n-------------------------\n")


def _log_result(digits, garbage):
    """Print the canonicalised value and the characters that were discarded."""
    print(f"\nDigit result: {digits}\nNon-digit result: {garbage}\n")


for elt in alt:
    lowered = elt.lower()
    is_meter = 'm' in lowered or '公尺' in lowered

    # Neither a meter marker nor any digit: keep the raw value untouched.
    if not is_meter and not hasNumbers(elt):
        count_garb += 1
        canon_col.append(elt)
        continue

    _log_start(elt)
    elt_new, elt_garb = _split_digits(elt)

    if is_meter:
        count_y += 1
        elt_new += " meters"
        _log_result(elt_new, elt_garb)
    elif 'f' in lowered:
        count_y_f += 1
        str_feet.append(elt)
        elt_new += " feet"
        _log_result(elt_new, elt_garb)
    else:
        count_y_num += 1
        # A plain number gets a unit from the cut-off; anything int() cannot
        # parse (e.g. '1950-2200') is labelled as a range instead.
        try:
            unit = " feet" if int(elt_new) > FEET_CUTOFF else " meters"
        except ValueError:
            elt_new += " range"
            print(f"!!! — Was not able to convert range: [{elt_new}]")
        else:
            elt_new += unit
            _log_result(elt_new, elt_garb)

    # Append canon_col with new string
    canon_col.append(elt_new)

q_sum = count_y+count_y_f+count_y_num+count_garb

# Every row must have been counted once and produced exactly one output value.
if q_sum != len_index or len(canon_col) != len_index:
    print("ERROR")
else:
    print("----- SUCCESS -----")
            
print(f"""
Meters is present {count_y} times.
Feet is present {count_y_f} times.
Otherwise, number is present {count_y_num} times.
Garbage present {count_garb} times.
---
Process count: {count_y+count_y_f+count_y_num+count_garb}
Canon count (canon_col): {len(canon_col)}
Expected count: {len_index}
""")


                ---(String Conversion)---
                Working on [1950-2200]
                -------------------------
                
!!! — Was not able to convert range: [1950-2200 range]

                ---(String Conversion)---
                Working on [1950-2200]
                -------------------------
                
!!! — Was not able to convert range: [1950-2200 range]

        ---(String Conversion)---
        Working on [1600 - 1800 m]
        -------------------------
        

        Digit result: 1600-1800 meters
        Non-digit result:    m
        

                ---(String Conversion)---
                Working on [1800-2200]
                -------------------------
                
!!! — Was not able to convert range: [1800-2200 range]

                ---(String Conversion)---
                Working on [1950-2200]
                -------------------------
                
!!! — Was not able to convert range: [1950-2200 range]

                ---(St


        Digit result: 250 meters
        Non-digit result:  M
        

                ---(String Conversion)---
                Working on [1300]
                -------------------------
                

                    Digit result: 1300 meters
                    Non-digit result: 
                    

                ---(String Conversion)---
                Working on [1300]
                -------------------------
                

                    Digit result: 1300 meters
                    Non-digit result: 
                    

                ---(String Conversion)---
                Working on [1250]
                -------------------------
                

                    Digit result: 1250 meters
                    Non-digit result: 
                    

                ---(String Conversion)---
                Working on [3845]
                -------------------------
                

                    Digit result: 3845 meters
                

In [21]:
# Display the canonicalised altitude strings (this shows the entire list).
canon_col

['1950-2200 range',
 '1950-2200 range',
 '1600-1800 meters',
 '1800-2200 range',
 '1950-2200 range',
 '',
 '',
 '1570-1700 range',
 '1570-1700 range',
 '1795-1850 range',
 '1855-1955 range',
 '1872 meters',
 '1943 meters',
 '2000 feet',
 '1570-1700 range',
 '2080 meters',
 '1200-1800 meters',
 '',
 '1450 meters',
 '1700-2000 meters',
 '2019 meters',
 '1300 meters',
 '1320 meters',
 '2112 meters',
 '',
 '1250 meters',
 '',
 '1950 meters',
 '1400 meters',
 '1200 meters',
 '',
 '',
 '1300 meters',
 '1300 meters',
 '1750-1800 range',
 '1800 meters',
 '',
 '',
 '',
 '1941 meters',
 '1300 meters',
 '12 meters',
 '12 meters',
 '1000 meters',
 '1754 meters',
 '1250 meters',
 '',
 '900-1500 meters',
 '1520-2200 meters',
 '1400-1900 meters',
 '1500-2000 meters',
 '1400 feet',
 '',
 '1400-1900 range',
 '1800 meters',
 '1600 meters',
 '1800-2000 range',
 '5000 feet',
 '4650 feet',
 '',
 '',
 '1700 meters',
 '1500 meters',
 '13001400 meters',
 '',
 '1680 meters',
 '1900 meters',
 '1800-2000 range',

In [22]:
# test = ['1,000 meters', '999-2000', 'Nothing really']
# canon_col = []

# # Iterate over each element in the list

    
#     for string in list:
        
#         print(string)
#         # Assign empty variables for new values and garbage
#         elt_new = ""
#         elt_garb = ""

#         # Print for user
#         print(f"""Working on {string}""")

#         # Itertate over each character in a string
#         for char in string:

#             # If character is numeric, add to new string in elt
#             if char.isdigit() or "-" in char:
#                 elt_new += char

#             # If not, add to garbage variable
#             else:
#                 elt_garb += char

#         print(f"""
#         Digit result: {elt_new}
#         Non-digit result: {elt_garb}
#         """)

        
            
# str_to_digit(test)


# # res = [int(i) for i in test_string.split() if i.isdigit()] 
# # res

In [23]:
## harvest year convert to YYYY format for latest year