In [1]:
# Dependencies
import pandas as pd
import numpy as np

In [2]:
# Load the grower price table from its CSV file into a DataFrame.
growers_csv = 'trans_prices_to_growers.csv'
df = pd.read_csv(growers_csv)

In [3]:
# Load the country code data into its own DataFrame.
country_codes_csv = '../../../DataHub/country_codes/data.csv'
df_countries = pd.read_csv(country_codes_csv)

## Peek Analysis
It is immediately evident that the transpose in Excel created some minor problems:

(a) the data set contains columns that serve only as group headings for the columns that follow them (e.g. Colombia, Kenya, and Tanzania all belong to 'Colombian Milds').

(b) there are unnamed columns (`Unnamed: 1`, `Unnamed: 6`, ...) with nothing in them.

(c) there are columns whose names end in `.1` (e.g. `Papua New Guinea.1`), which represent a special type of coffee (soluble).

In [4]:
# Display the first five rows of df
df.head(n=5)

Unnamed: 0,Calendar years,Unnamed: 1,Colombian Milds,Colombia,Kenya,Tanzania,Unnamed: 6,Other Milds,Bolivia (Plurinational State of),Burundi,...,Papua New Guinea.1,Philippines.1,Sierra Leone,Sri Lanka.1,Tanzania.1,Thailand.1,Togo,Trinidad & Tobago,Uganda.1,Viet Nam.1
0,1990,,,69.52,60.57,40.82,,,51.78,58.02,...,26.6,34.23,26.17,26.9,17.01,37.72,29.23,62.59,7.54,36.92
1,1991,,,67.13,50.68,49.5,,,52.42,55.02,...,29.48,34.61,,25.29,17.11,18.37,28.64,62.59,11.84,30.73
2,1992,,,54.57,28.95,49.16,,,43.53,59.23,...,39.1,42.48,,31.85,14.84,20.3,29.83,69.29,8.95,30.71
3,1993,,,50.12,47.64,38.28,,,27.33,58.27,...,54.8,46.45,,28.89,11.88,18.29,22.64,72.2,11.77,34.53
4,1994,,,85.99,152.61,62.16,,,79.92,60.68,...,95.64,94.66,,71.99,24.0,18.79,25.99,64.14,41.66,86.01


In [5]:
# Show the dtype of every column in df (they look clean)
df.dtypes

Calendar years         int64
Unnamed: 1           float64
Colombian Milds      float64
Colombia             float64
Kenya                float64
                      ...   
Thailand.1           float64
Togo                 float64
Trinidad & Tobago    float64
Uganda.1             float64
Viet Nam.1           float64
Length: 74, dtype: object

## Cleaning Data
### Investigating columns
#### Observations
1. Unnamed and other problem columns have dtype float64 but actually contain NaNs.
    
    a. We will want to drop Unnamed columns but keep columns referencing coffee type.

In [6]:
# Names of every column in df, as a plain list
cols = list(df.columns)

# Unwanted columns are those whose name contains "Unnamed"
lst_garb = [name for name in cols if "Unnamed" in name]
count = len(lst_garb)

# Report how many were found, then which ones
print(count)
print(lst_garb)

3
['Unnamed: 1', 'Unnamed: 6', 'Unnamed: 35']


In [7]:
# df1 is df without the "Unnamed" columns collected in lst_garb.
# drop() returns a new frame, so df itself is left untouched.
df1 = df.drop(columns=lst_garb)

# Number of columns remaining after the drop. (A mid-cell df1.head() was
# removed: only the last expression of a cell is displayed, so its result
# was computed and thrown away.)
len(df1.columns)

71

In [8]:
# Drop the blank column, whose header is a single space.
# df1 is rebound to its own result here, so running this cell a second time
# would raise KeyError because ' ' is already gone; errors='ignore' keeps the
# cell re-runnable.
df1 = df1.drop(columns=' ', errors='ignore')

# Number of columns remaining
len(df1.columns)

70

In [9]:
# Classify every column of df1 by how many of its rows are missing, printing
# one banner per column and a summary at the end.
cols = df1.columns

# Number of rows in df1 (29 rows of data expected)
exp_row = len(df1.index)

# Column-name buckets filled by the loop below
lst_col_drop = []                    # every row missing
lst_col_miss = ['Calendar years']    # some rows missing; seeded with the year column
lst_col_good = []                    # no rows missing

# Parallel lists (column name, missing-row count) used to build a df of
# the partially missing columns
lst_key_miss = []
lst_val_miss = []

# Collect columns with missing data for a dataframe (not filled in this cell)
lst_dict_miss = []

for col in cols:
    # A cell counts as missing when it is NaN/None or an empty string.
    # The count is vectorised instead of looping over each cell in Python.
    column_values = df1[col]
    count_na = int((column_values.isna() | column_values.eq('')).sum())
    key, val = col, count_na

    # The three categories are mutually exclusive, so a column is reported
    # exactly once (an empty frame no longer lands in both "drop" and "good").
    if count_na == 0:
        # Country has no missing values
        lst_col_good.append(col)

        print(f"""
        ${'-'*4}(Good Series){'-'*4}$
        {key} : {val}
        """)

    elif count_na < exp_row:
        # Country has some, but not all, values missing
        lst_key_miss.append(key)
        lst_val_miss.append(val)

        # The list is pre-seeded with 'Calendar years'; never add a name twice,
        # otherwise selecting df1[lst_col_miss] would duplicate that column.
        if col not in lst_col_miss:
            lst_col_miss.append(col)

        print(f"""
        x{'-'*4}(Missing Values){'-'*4}x
        {key} : {val}
        """)

    else:
        # Column only has missing values
        lst_col_drop.append(col)

        print(f"""
        #{'-'*4}(Region Category){'-'*4}#
        {key} : {val}
        """)


print(f"""
=====
Summary
-----
{len(cols)} total columns.

{len(lst_col_drop)} columns to drop.
{len(lst_key_miss)} columns with missing data.
{len(lst_col_good)} columns with complete data.
""")


        $----(Good Series)----$
        Calendar years : 0
        

        #----(Region Category)----#
        Colombian Milds : 29
        

        $----(Good Series)----$
        Colombia : 0
        

        x----(Missing Values)----x
        Kenya : 14
        

        x----(Missing Values)----x
        Tanzania : 10
        

        #----(Region Category)----#
        Other Milds : 29
        

        x----(Missing Values)----x
        Bolivia (Plurinational State of) : 11
        

        x----(Missing Values)----x
        Burundi : 8
        

        x----(Missing Values)----x
        Cameroon : 10
        

        x----(Missing Values)----x
        Costa Rica : 1
        

        x----(Missing Values)----x
        Cuba : 1
        

        x----(Missing Values)----x
        Democratic Republic of Congo : 23
        

        $----(Good Series)----$
        Dominican Republic : 0
        

        x----(Missing Values)----x
        Ecuador : 4
        

        $---

=====
Summary
-----
67 total columns.

4 columns to drop.
49 columns with missing data.
13 columns with complete data.

In [10]:
# df1 = df1.copy()
# list = df1.columns
                  
# res = [sub.replace(' ', '_') for sub in list] 
# df1.columns = res
# df1.columns

In [11]:
# Build a two-column frame from the parallel lists: the name of each
# partially missing column and its number of missing rows.
df_missing_list = pd.DataFrame(
    {'Country': lst_key_miss, 'Missing_Rows': lst_val_miss}
)

In [12]:
# Split df1 into separate frames using the column-name lists.

# Missing-row counts, largest first, with a fresh 0..n-1 index
df_missing_list = (
    df_missing_list
    .sort_values(by='Missing_Rows', ascending=False)
    .reset_index(drop=True)
)

# Columns with complete data
df_good = df1.loc[:, lst_col_good]

# Columns with some missing data
df_missing = df1.loc[:, lst_col_miss]

# Columns with coffee type or header information
df_drop = df1.loc[:, lst_col_drop]

In [13]:
# Soluble extraction for complete data
# Get column names and turn into a list for iteration

def extract_sol(df, string):
    """
    Return ``df`` without the columns whose name contains ``string``.

    Prints the number of matching columns, then the list of their names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    string : str
        Substring looked for in every column name.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``df`` that did not match.
    """
    # Non-string labels (e.g. integer column names) cannot contain the
    # substring; skipping them avoids a TypeError from the `in` test.
    lst = [col for col in df.columns if isinstance(col, str) and string in col]

    print(len(lst))
    print(lst)

    # drop() returns a new frame, so the caller's df is left untouched
    return df.drop(columns=lst)

def store_sol(df, string):
    """
    Return only the columns of ``df`` whose name contains ``string``.

    Prints the number of matching columns, then the list of their names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    string : str
        Substring looked for in every column name.

    Returns
    -------
    pandas.DataFrame
        A frame holding just the matching columns, in their original order
        (no columns at all when nothing matches).
    """
    # Non-string labels (e.g. integer column names) cannot contain the
    # substring; skipping them avoids a TypeError from the `in` test.
    lst = [col for col in df.columns if isinstance(col, str) and string in col]

    print(len(lst))
    print(lst)

    return df[lst]

In [14]:
# Remove the ".1" (soluble) columns from the complete-data frame
df_good = extract_sol(df=df_good, string=".1")

3
['Brazil.1', 'India.1', 'Uganda.1']


In [15]:
# Remove the ".1" (soluble) columns from the missing-data frame
df_missing = extract_sol(df=df_missing, string=".1")

13
['Angola.1', 'Burundi.1', 'Cameroon.1', 'Democratic Republic of Congo.1', 'Ecuador.1', 'Indonesia.1', 'Madagascar.1', 'Papua New Guinea.1', 'Philippines.1', 'Sri Lanka.1', 'Tanzania.1', 'Thailand.1', 'Viet Nam.1']


In [16]:
# Combine the complete and the partially missing series into one frame.
# Both frames carry 'Calendar years', so columns already present in df_good
# are left out of the right-hand side. Joining the overlapping frames with a
# suffix instead put a spurious 'Calendar years_dup' entry in the name list.
extra_cols = [col for col in df_missing.columns if col not in df_good.columns]
df_join = df_good.join(df_missing[extra_cols])

# Save the combined column names as a single-column CSV (no index column)
df_join_col = pd.DataFrame(list(df_join.columns), columns=['Country'])
df_join_col.to_csv('grow_col_names.csv', index=False)

In [17]:
# Write the missing-row counts to CSV; the row index is written as well
df_missing_list.to_csv('grow_missing_data_list.csv', index=True)

In [18]:
# Write the complete-data columns to CSV; the row index is written as well
df_good.to_csv('grow_complete_data.csv', index=True)

In [19]:
# Write the partially missing columns to CSV; the row index is written as well
df_missing.to_csv('grow_missing_data.csv', index=True)

In [20]:
# Write the all-missing (category/header) columns to CSV; the row index is written as well
df_drop.to_csv('grow_category_data.csv', index=True)