In [24]:
# Dependencies
import pandas as pd
import numpy as np

In [27]:
# dfr is a complete list of countries and continent codes
# NOTE(review): the value returned by read_csv is not assigned, so this cell does
# not create a `dfr` variable. The path also ends in a directory rather than a CSV
# file, and the output saved with this cell is a FileNotFoundError.
# TODO: confirm the intended file path and assign the result to `dfr`.
pd.read_csv('../data_sources/DataHub/')

FileNotFoundError: [Errno 2] File DataHub/country-and-continent-codes-list_zip/data.csv does not exist: 'DataHub/country-and-continent-codes-list_zip/data.csv'

In [2]:
# Load the transposed grower-price table from disk into the working DataFrame `df`.
prices_csv = 'trans_prices_to_growers.csv'
df = pd.read_csv(prices_csv)

## Peek Analysis
It is immediately evident that the transpose in Excel created some minor problems:

(a) the data set contains columns that exist only to name a group of the columns that follow them (e.g. Colombia, Kenya, and Tanzania all belong to 'Colombian Milds').

(b) there are unnamed columns with nothing in them.

(c) there are columns whose names have .1 appended (e.g. Papua New Guinea.1), which represent a special type of coffee (soluble).

In [3]:
# Show the first five rows for a first look at the loaded table.
df.head(n=5)

Unnamed: 0,Calendar years,Unnamed: 1,Colombian Milds,Colombia,Kenya,Tanzania,Unnamed: 6,Other Milds,Bolivia (Plurinational State of),Burundi,...,Papua New Guinea.1,Philippines.1,Sierra Leone,Sri Lanka.1,Tanzania.1,Thailand.1,Togo,Trinidad & Tobago,Uganda.1,Viet Nam.1
0,1990,,,69.52,60.57,40.82,,,51.78,58.02,...,26.6,34.23,26.17,26.9,17.01,37.72,29.23,62.59,7.54,36.92
1,1991,,,67.13,50.68,49.5,,,52.42,55.02,...,29.48,34.61,,25.29,17.11,18.37,28.64,62.59,11.84,30.73
2,1992,,,54.57,28.95,49.16,,,43.53,59.23,...,39.1,42.48,,31.85,14.84,20.3,29.83,69.29,8.95,30.71
3,1993,,,50.12,47.64,38.28,,,27.33,58.27,...,54.8,46.45,,28.89,11.88,18.29,22.64,72.2,11.77,34.53
4,1994,,,85.99,152.61,62.16,,,79.92,60.68,...,95.64,94.66,,71.99,24.0,18.79,25.99,64.14,41.66,86.01


In [4]:
# List the dtype pandas inferred for each column. In the recorded output the year
# column is int64 and the other columns shown are float64.
# NOTE(review): float64 alone does not mean a column holds data; the recorded
# df.head() output shows NaN in the 'Unnamed' columns.
df.dtypes

Calendar years         int64
Unnamed: 1           float64
Colombian Milds      float64
Colombia             float64
Kenya                float64
                      ...   
Thailand.1           float64
Togo                 float64
Trinidad & Tobago    float64
Uganda.1             float64
Viet Nam.1           float64
Length: 74, dtype: object

## Cleaning Data
### Investigating columns
#### Observations
1. Unnamed and other problem columns have dtype float64 but actually contain NaNs.
    
    a. We will want to drop Unnamed columns but keep columns referencing coffee type.

In [5]:
# Column labels as plain strings, so the substring test below works for any label type.
cols = df.columns.astype(str).tolist()

# Collect every label that contains "Unnamed"; these are the unwanted columns.
lst_garb = [label for label in cols if "Unnamed" in label]
count = len(lst_garb)

# Report how many unwanted columns were found, then list them.
print(count)
print(lst_garb)



3
['Unnamed: 1', 'Unnamed: 6', 'Unnamed: 35']


In [6]:
# Drop three country COLUMNS (not rows), based on an observation made further
# down in the notebook.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so without
# it a second run would raise KeyError because the labels are already gone.
df = df.drop(columns=['Venezuela', 'Zimbabwe', 'Liberia'], errors='ignore')

In [7]:
# df1 is df without the columns whose labels were collected in lst_garb.
df1 = df.drop(lst_garb, axis=1)

# Preview the result.
df1.head()

Unnamed: 0,Calendar years,Colombian Milds,Colombia,Kenya,Tanzania,Other Milds,Bolivia (Plurinational State of),Burundi,Cameroon,Costa Rica,...,Papua New Guinea.1,Philippines.1,Sierra Leone,Sri Lanka.1,Tanzania.1,Thailand.1,Togo,Trinidad & Tobago,Uganda.1,Viet Nam.1
0,1990,,69.52,60.57,40.82,,51.78,58.02,32.29,58.49,...,26.6,34.23,26.17,26.9,17.01,37.72,29.23,62.59,7.54,36.92
1,1991,,67.13,50.68,49.5,,52.42,55.02,28.23,52.73,...,29.48,34.61,,25.29,17.11,18.37,28.64,62.59,11.84,30.73
2,1992,,54.57,28.95,49.16,,43.53,59.23,28.76,49.91,...,39.1,42.48,,31.85,14.84,20.3,29.83,69.29,8.95,30.71
3,1993,,50.12,47.64,38.28,,27.33,58.27,23.25,54.27,...,54.8,46.45,,28.89,11.88,18.29,22.64,72.2,11.77,34.53
4,1994,,85.99,152.61,62.16,,79.92,60.68,88.32,81.36,...,95.64,94.66,,71.99,24.0,18.79,25.99,64.14,41.66,86.01


In [8]:
# Remove the column whose label is a single space character
# (motivated by the code further down in the notebook).
df1 = df1.drop(columns=[' '])

In [9]:
# Scan every df1 column except the first one for missing values.
cols = df1.columns[1:]

# Number of rows in df1 (29 in the recorded run). A column with this many
# missing cells contains no data at all.
exp_row = len(df1.index)

# Result containers. The loop below fills lst_col_drop, lst_key_miss and
# lst_val_miss; the remaining lists are only initialised in this cell.
lst_col_drop, lst_col_miss, lst_col_good, col_miss = [], [], [], []
lst_key_miss, lst_val_miss = [], []

# Collect columns with missing data for a dataframe.
lst_dict_miss = []

for col in cols:
    # A cell counts as missing when it is an empty string or a null/NaN value.
    count_na = sum(1 for cell in df1[col] if cell == '' or pd.isnull(cell))

    if count_na == exp_row:
        # Completely empty column: reported as a region/category heading and
        # remembered in lst_col_drop.
        key = col
        val = count_na

        lst_col_drop.append(col)

        print(f"""
        #{'-'*4}(Region Category){'-'*4}#
        {key} : {val}
        """)
    elif count_na > 0:
        # Partially filled column: record its name and its number of gaps.
        key = col
        val = count_na

        lst_key_miss.append(key)
        lst_val_miss.append(val)

        print(f"""
        {'-'*5}(Missing Values){'-'*5}
        {key} : {val}
        """)


        #----(Region Category)----#
        Colombian Milds : 29
        

        -----(Missing Values)-----
        Kenya : 14
        

        -----(Missing Values)-----
        Tanzania : 10
        

        #----(Region Category)----#
        Other Milds : 29
        

        -----(Missing Values)-----
        Bolivia (Plurinational State of) : 11
        

        -----(Missing Values)-----
        Burundi : 8
        

        -----(Missing Values)-----
        Cameroon : 10
        

        -----(Missing Values)-----
        Costa Rica : 1
        

        -----(Missing Values)-----
        Cuba : 1
        

        -----(Missing Values)-----
        Democratic Republic of Congo : 23
        

        -----(Missing Values)-----
        Ecuador : 4
        

        -----(Missing Values)-----
        Haiti : 18
        

        -----(Missing Values)-----
        Jamaica : 6
        

        -----(Missing Values)-----
        Madagascar : 18
        

        -----(Missi

In [10]:
# Tabulate the partially filled columns: one row per column name with its
# count of missing rows.
missing_summary = {
    'Country': lst_key_miss,
    'Missing_Rows': lst_val_miss,
}
df_missing = pd.DataFrame(data=missing_summary)

# Preview the summary table.
df_missing.head()

Unnamed: 0,Country,Missing_Rows
0,Kenya,14
1,Tanzania,10
2,Bolivia (Plurinational State of),11
3,Burundi,8
4,Cameroon,10
