In [17]:
# Dependencies
import pandas as pd
import numpy as np

In [18]:
# # dfr is a complete list of countries and continent codes
# pd.read_csv('../data_sources/DataHub/')

In [19]:
# Read CSV into pandas dataframe.
df = pd.read_csv('trans_prices_to_growers.csv')

## Peek Analysis
It is immediately evident the transpose in excel created some minor problems: 

(a) the data set contains columns intended for naming elements in sets (i.e. Colombia, Kenya, and Tanzania all belong with 'Colombian Milds.').

(b) there are unnamed rows with nothing in them.

(c) there are columns with names appended .1 (i.e. Papua New Guinea.1) which represent special types of coffee (soluble).

In [20]:
# Preview df
df.head()

Unnamed: 0,Calendar years,Unnamed: 1,Colombian Milds,Colombia,Kenya,Tanzania,Unnamed: 6,Other Milds,Bolivia (Plurinational State of),Burundi,...,Papua New Guinea.1,Philippines.1,Sierra Leone,Sri Lanka.1,Tanzania.1,Thailand.1,Togo,Trinidad & Tobago,Uganda.1,Viet Nam.1
0,1990,,,69.52,60.57,40.82,,,51.78,58.02,...,26.6,34.23,26.17,26.9,17.01,37.72,29.23,62.59,7.54,36.92
1,1991,,,67.13,50.68,49.5,,,52.42,55.02,...,29.48,34.61,,25.29,17.11,18.37,28.64,62.59,11.84,30.73
2,1992,,,54.57,28.95,49.16,,,43.53,59.23,...,39.1,42.48,,31.85,14.84,20.3,29.83,69.29,8.95,30.71
3,1993,,,50.12,47.64,38.28,,,27.33,58.27,...,54.8,46.45,,28.89,11.88,18.29,22.64,72.2,11.77,34.53
4,1994,,,85.99,152.61,62.16,,,79.92,60.68,...,95.64,94.66,,71.99,24.0,18.79,25.99,64.14,41.66,86.01


In [21]:
# Dtypes look clean
df.dtypes

Calendar years         int64
Unnamed: 1           float64
Colombian Milds      float64
Colombia             float64
Kenya                float64
                      ...   
Thailand.1           float64
Togo                 float64
Trinidad & Tobago    float64
Uganda.1             float64
Viet Nam.1           float64
Length: 74, dtype: object

## Cleaning Data
### Investigating columns
#### Observations
1. Unnamed and other problem columns have dtype int64 but actually contain NaNs.
    
    a. We will want to drop Unnamed columns but keep columns referencing coffee type.

In [24]:
# Find the values that belong in a soluble coffee category
# And clean up label.
col = df.columns
col

res = [sub.replace('.1', '') for sub in col] 
df.columns = res
len(df.columns)

74

In [25]:
# Get column names and turn into a list for iteration
cols = list(res)

count = 0
lst_garb = []

# Count and print list of unwanted column names
for col in cols:
    if "Unnamed" in col:
        count += 1
        lst_garb.append(col)
        
print(count)
print(lst_garb)

len(res)

3
['Unnamed: 1', 'Unnamed: 6', 'Unnamed: 35']


74

In [8]:
# df1 dropped df cols
df1 = df.drop(columns=lst_garb)
df1.head()
len(df1.columns)

71

In [9]:
# Observed a blank column
df1 = df1.drop(columns=(' '))
len(df1.columns)

70

In [27]:
# assign cols with new df1 columns
cols = df1.columns

# 29 rows of data expected
exp_row = len(df1.index)

# Empty lists for iteration and appending
lst_col_drop = []
lst_col_miss = ['Calendar years']
lst_col_good = []

# Set of lists to create a df for missing
lst_key_miss = []
lst_val_miss = []

# Collect columns with missing data for a dataframe
lst_dict_miss = []

for idx, col in enumerate(cols):
    count_na = 0
    for row in df1[col]:
        if row == '' or pd.isnull(row):
            count_na += 1
            
    # Country has any missing values append
    if count_na > 0 and count_na < exp_row: 
        key = col
        val = count_na
        
        lst_key_miss.append(key)
        lst_val_miss.append(val)
        
        lst_col_miss.append(col)
        
        print(f"""
        x{'-'*4}(Missing Values){'-'*4}x
        {key} : {val}
        """)
        
    # Column only has missing values append
    if count_na == exp_row:
        key = col
        val = count_na
        
        lst_col_drop.append(col)
        
        print(f"""
        #{'-'*4}(Region Category){'-'*4}#
        {key} : {val}
        """)
    
    # Country has no missing values append
    if count_na == 0:
        key = col
        val = count_na
        
        lst_col_good.append(col)
        
        print(f"""
        ${'-'*4}(Good Series){'-'*4}$
        {key} : {val}
        """)

    
print(f"""
=====
Summary
-----
{len(cols)} total columns.

{len(lst_col_drop)} columns to drop.
{len(lst_key_miss)} columns with missing data.
{len(lst_col_good)} columns with complete data.
""")


        $----(Good Series)----$
        Calendar years : 0
        

        #----(Region Category)----#
        Colombian Milds : 29
        

        $----(Good Series)----$
        Colombia : 0
        

        x----(Missing Values)----x
        Kenya : 14
        

        $----(Good Series)----$
        Tanzania : 0
        

        #----(Region Category)----#
        Other Milds : 29
        

        x----(Missing Values)----x
        Bolivia (Plurinational State of) : 11
        

        $----(Good Series)----$
        Burundi : 0
        

        $----(Good Series)----$
        Cameroon : 0
        

        x----(Missing Values)----x
        Costa Rica : 1
        

        x----(Missing Values)----x
        Cuba : 1
        

        $----(Good Series)----$
        Democratic Republic of Congo : 0
        

        $----(Good Series)----$
        Dominican Republic : 0
        

        $----(Good Series)----$
        Ecuador : 0
        

        $----(Good Series)----

=====
Summary
-----
67 total columns.

4 columns to drop.
49 columns with missing data.
13 columns with complete data.

In [11]:
# Assign regions with missing values to df
df_missing_list = pd.DataFrame(data=({
    'Country': lst_key_miss, 
    'Missing_Rows': lst_val_miss
}))


df_missing_list.to_csv('prod_missing_data_list.csv')

In [28]:
# Save to csv
df_good = df1[lst_col_good]
df_good.to_csv('prod_complete_data.csv')

In [29]:
df_missing = df1[lst_col_miss]
df_missing.to_csv('prod_missing_data.csv')

In [30]:
df_drop = df1[lst_col_drop]
df_drop.to_csv('orginal_category_data.csv')

In [31]:
yes = 0
for col in lst_col_good:
    if col in df.columns:
        yes += 1
    print(df1[col])
        
df1.Tan

0     1990
1     1991
2     1992
3     1993
4     1994
5     1995
6     1996
7     1997
8     1998
9     1999
10    2000
11    2001
12    2002
13    2003
14    2004
15    2005
16    2006
17    2007
18    2008
19    2009
20    2010
21    2011
22    2012
23    2013
24    2014
25    2015
26    2016
27    2017
28    2018
Name: Calendar years, dtype: int64
0      69.52
1      67.13
2      54.57
3      50.12
4      85.99
5      99.62
6      93.56
7     133.17
8     102.08
9      86.38
10     74.96
11     57.95
12     52.57
13     48.34
14     60.83
15     89.22
16     89.81
17    100.05
18    114.22
19    138.96
20    180.55
21    239.68
22    166.69
23    113.91
24    159.71
25    119.40
26    123.56
27    125.76
28    113.52
Name: Colombia, dtype: float64
    Tanzania  Tanzania
0      40.82     17.01
1      49.50     17.11
2      49.16     14.84
3      38.28     11.88
4      62.16     24.00
5      86.76     44.61
6      62.53     32.20
7     118.52     92.48
8      90.70     28.19
9      6