In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read CSV into pandas dataframe.
df = pd.read_csv('trans_retail_prices.csv')

# read region id data into pandas
df_countries = pd.read_csv('../../../DataHub/country_codes/data.csv')
df_countries.head()

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0
2,Antarctica,AN,Antarctica (the territory South of 60 deg S),AQ,ATA,10.0
3,Africa,AF,"Algeria, People's Democratic Republic of",DZ,DZA,12.0
4,Oceania,OC,American Samoa,AS,ASM,16.0


In [3]:
# Preview df
df.columns

Index(['Calendar years', 'Unnamed: 1', 'European Union', '   Austria',
       '   Belgium', '   Bulgaria', '   Cyprus', '   Czech Republic',
       '   Denmark', '   Finland', '   France', '   Germany', '   Hungary',
       '   Italy', '   Latvia', '   Lithuania', '   Luxembourg', '   Malta 1',
       '   Netherlands', '   Poland', '   Portugal', '   Slovakia',
       '   Slovenia', '   Spain', '   Sweden', '   United Kingdom 1',
       'Unnamed: 26', 'Japan', 'Norway', 'Russian Federation', 'Switzerland',
       'USA'],
      dtype='object')

In [4]:
df = df.rename(columns={'USA': 'United States'})

In [5]:
df.head()

Unnamed: 0,Calendar years,Unnamed: 1,European Union,Austria,Belgium,Bulgaria,Cyprus,Czech Republic,Denmark,Finland,...,Slovenia,Spain,Sweden,United Kingdom 1,Unnamed: 26,Japan,Norway,Russian Federation,Switzerland,United States
0,1990,,,4.9,3.27,,2.83,,3.81,2.98,...,,3.62,3.43,10.55,,10.26,3.31,,4.83,2.97
1,1991,,,4.57,2.92,,2.8,,3.67,2.72,...,,3.5,3.29,10.41,,11.88,3.16,,4.36,2.81
2,1992,,,4.99,3.05,,2.87,,3.79,2.46,...,,3.5,3.11,10.09,,12.62,3.12,,4.52,2.58
3,1993,,,4.97,2.78,,2.6,,3.48,1.94,...,,2.79,2.65,8.44,,14.57,2.46,,4.27,2.47
4,1994,,,4.58,3.42,,3.18,,4.45,2.73,...,,2.76,3.79,11.36,,14.69,3.29,,4.5,3.4


## Peek Analysis
It is immediately evident the transpose in excel created some minor problems: 

(a) the data set contains columns intended for naming elements in sets (i.e. 'European Union').

(b) there are unnamed rows with nothing in them.

(c) there are columns with names appended 1 (i.e. Malta 1) which represent special types of coffee (soluble).

In [6]:
# Dtypes look clean
df.dtypes

Calendar years           int64
Unnamed: 1             float64
European Union         float64
   Austria             float64
   Belgium             float64
   Bulgaria            float64
   Cyprus              float64
   Czech Republic      float64
   Denmark             float64
   Finland             float64
   France              float64
   Germany             float64
   Hungary             float64
   Italy               float64
   Latvia              float64
   Lithuania           float64
   Luxembourg          float64
   Malta 1             float64
   Netherlands         float64
   Poland              float64
   Portugal            float64
   Slovakia            float64
   Slovenia            float64
   Spain               float64
   Sweden              float64
   United Kingdom 1    float64
Unnamed: 26            float64
Japan                  float64
Norway                 float64
Russian Federation     float64
Switzerland            float64
United States          float64
dtype: o

In [7]:
# Clean up col names
df.columns = df.columns.str.replace("   ","")
df.columns = df.columns.str.replace(" 1",".1")
df.columns

Index(['Calendar years', 'Unnamed:.1', 'European Union', 'Austria', 'Belgium',
       'Bulgaria', 'Cyprus', 'Czech Republic', 'Denmark', 'Finland', 'France',
       'Germany', 'Hungary', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg',
       'Malta.1', 'Netherlands', 'Poland', 'Portugal', 'Slovakia', 'Slovenia',
       'Spain', 'Sweden', 'United Kingdom.1', 'Unnamed: 26', 'Japan', 'Norway',
       'Russian Federation', 'Switzerland', 'United States'],
      dtype='object')

## Cleaning Data
### Investigating columns
#### Observations
1. Unnamed and other problem columns have dtype int64 but actually contain NaNs.
    
    a. We will want to drop Unnamed columns but keep columns referencing coffee type.

In [8]:
# Get column names and turn into a list for iteration
cols = df.columns

count = 0
lst_garb = []

# Count and print list of unwanted column names
for col in cols:
    if "Unnamed" in col:
        count += 1
        lst_garb.append(col)
        
print(count)
print(lst_garb)

2
['Unnamed:.1', 'Unnamed: 26']


In [9]:
# df1 dropped df cols
df1 = df.drop(columns=lst_garb)
df1.head()
len(df1.columns)

30

In [10]:
# assign cols with new df1 columns
cols = df1.columns

# 29 rows of data expected
exp_row = len(df1.index)

# Empty lists for iteration and appending
lst_col_drop = []
lst_col_miss = ['Calendar years']
lst_col_good = []

# Set of lists to create a df for missing
lst_key_miss = []
lst_val_miss = []

# Collect columns with missing data for a dataframe
lst_dict_miss = []

for idx, col in enumerate(cols):
    count_na = 0
    for row in df1[col]:
        if row == '' or pd.isnull(row):
            count_na += 1
            
    # Country has any missing values append
    if count_na > 0 and count_na < exp_row: 
        key = col
        val = count_na
        
        lst_key_miss.append(key)
        lst_val_miss.append(val)
        
        lst_col_miss.append(col)
        
        print(f"""
        x{'-'*4}(Missing Values){'-'*4}x
        {key} : {val}
        """)
        
    # Column only has missing values append
    if count_na == exp_row:
        key = col
        val = count_na
        
        lst_col_drop.append(col)
        
        print(f"""
        #{'-'*4}(Region Category){'-'*4}#
        {key} : {val}
        """)
    
    # Country has no missing values append
    if count_na == 0:
        key = col
        val = count_na
        
        lst_col_good.append(col)
        
        print(f"""
        ${'-'*4}(Good Series){'-'*4}$
        {key} : {val}
        """)

    
print(f"""
=====
Summary
-----
{len(cols)} total columns.

{len(lst_col_drop)} columns to drop.
{len(lst_key_miss)} columns with missing data.
{len(lst_col_good)} columns with complete data.
""")


        $----(Good Series)----$
        Calendar years : 0
        

        #----(Region Category)----#
        European Union : 29
        

        $----(Good Series)----$
        Austria : 0
        

        x----(Missing Values)----x
        Belgium : 5
        

        x----(Missing Values)----x
        Bulgaria : 12
        

        $----(Good Series)----$
        Cyprus : 0
        

        x----(Missing Values)----x
        Czech Republic : 7
        

        $----(Good Series)----$
        Denmark : 0
        

        $----(Good Series)----$
        Finland : 0
        

        $----(Good Series)----$
        France : 0
        

        $----(Good Series)----$
        Germany : 0
        

        x----(Missing Values)----x
        Hungary : 4
        

        $----(Good Series)----$
        Italy : 0
        

        x----(Missing Values)----x
        Latvia : 3
        

        x----(Missing Values)----x
        Lithuania : 7
        

        x----(Missing Valu

=====
Summary
-----
30 total columns.

1 columns to drop.
14 columns with missing data.
15 columns with complete data.

In [11]:
# Assign regions with missing values to df
df_missing_list = pd.DataFrame(data=({
    'Country': lst_key_miss, 
    'Missing_Rows': lst_val_miss
}))

In [12]:
# df record of missing rows with counts by region
df_missing_list = df_missing_list.sort_values('Missing_Rows', ascending=False).reset_index(drop=True)

# df of complete data 
df_good = df1[lst_col_good]

# df of columns with missing data
df_missing = df1[lst_col_miss]

# df of columns dropped that contained header info
df_drop = df1[lst_col_drop]

In [13]:
# Soluble extraction for complete data
# Get column names and turn into a list for iteration

def extract_sol(df,string):
    f"""
    Takes in dataframe, checks column names against substring,
    and filters and drops.
    """
    
    cols = df.columns

    count = 0
    lst = []

    # Count and print list of unwanted column names
    for col in cols:
        if string in col:
            count += 1
            lst.append(col)

    print(count)
    print(lst)

    # Drop soluble
    df = df.drop(columns=(lst))
    return df

def store_sol(df,string):
    f"""
    Takes in dataframe, checks column names against substring,
    and filters and drops.
    """
    
    cols = df.columns

    count = 0
    lst = []

    # Count and print list of unwanted column names
    for col in cols:
        if string in col:
            count += 1
            lst.append(col)

    print(count)
    print(lst)
    
    return df[lst]

def underscoreCol(df):
    """
    Takes in a dataframe, replaces spaces with '_'
    """
    
    print(f"Changing {df.columns}...")
    list = df.columns

    res = [sub.replace(' ', '_') for sub in list] 
    df.columns = res 
    df.columns
    print(f"{df.columns}")
        
def region_conversion(df):
    
    f"""
    This function takes in dataframe columns
    and changes them to corresponding
    strings in an authoratative table.
    """
    
    lst_country_renamed = []
    lst_id = df_countries['Country_Name']
    new_name = ['Calendar years']

    for country in df.columns[1:]:
        # Keep track of each element appended
        last_elt = ""
        count = 0
        # For each element in countries...
        for id in df_countries['Country_Name']:
            # If the string country is not in the id string...
            # No duplicates
            if country not in new_name and country in id:
                print(f"""Match! | [{country}] and [{id}])""")
                print(f"""...marking last entry as [{country}]
                ---""")
                last_elt = country
                new_name.append(id)

                # Break loop when above if is true
                break
            if id not in new_name:
                count += 1
                if count == len(df_countries['Country_Name']) - 100:
                    print(f"""{country} in df.columns but not id.""")
                


    df.columns = new_name
    return df.columns

        
        

In [14]:
df_good = extract_sol(df_good,".1")
df_good.head()

1
['United Kingdom.1']


Unnamed: 0,Calendar years,Austria,Cyprus,Denmark,Finland,France,Germany,Italy,Netherlands,Portugal,Spain,Sweden,Japan,Norway
0,1990,4.9,2.83,3.81,2.98,3.73,3.99,5.31,3.03,4.12,3.62,3.43,10.26,3.31
1,1991,4.57,2.8,3.67,2.72,3.43,3.84,5.62,2.95,4.27,3.5,3.29,11.88,3.16
2,1992,4.99,2.87,3.79,2.46,2.31,4.0,5.86,3.13,4.95,3.5,3.11,12.62,3.12
3,1993,4.97,2.6,3.48,1.94,2.03,3.62,4.59,2.8,4.27,2.79,2.65,14.57,2.46
4,1994,4.58,3.18,4.45,2.73,2.4,4.18,4.68,3.26,4.38,2.76,3.79,14.69,3.29


In [15]:
df_missing = extract_sol(df_missing,".1")

1
['Malta.1']


In [16]:
# Clean up columns for DB
underscoreCol(df_missing_list)
underscoreCol(df_good)
underscoreCol(df_missing)
underscoreCol(df_drop)

Changing Index(['Country', 'Missing_Rows'], dtype='object')...
Index(['Country', 'Missing_Rows'], dtype='object')
Changing Index(['Calendar years', 'Austria', 'Cyprus', 'Denmark', 'Finland', 'France',
       'Germany', 'Italy', 'Netherlands', 'Portugal', 'Spain', 'Sweden',
       'Japan', 'Norway'],
      dtype='object')...
Index(['Calendar_years', 'Austria', 'Cyprus', 'Denmark', 'Finland', 'France',
       'Germany', 'Italy', 'Netherlands', 'Portugal', 'Spain', 'Sweden',
       'Japan', 'Norway'],
      dtype='object')
Changing Index(['Calendar years', 'Belgium', 'Bulgaria', 'Czech Republic', 'Hungary',
       'Latvia', 'Lithuania', 'Luxembourg', 'Poland', 'Slovakia', 'Slovenia',
       'Russian Federation', 'Switzerland', 'United States'],
      dtype='object')...
Index(['Calendar_years', 'Belgium', 'Bulgaria', 'Czech_Republic', 'Hungary',
       'Latvia', 'Lithuania', 'Luxembourg', 'Poland', 'Slovakia', 'Slovenia',
       'Russian_Federation', 'Switzerland', 'United_States'],
      

In [17]:
# Save to csv
df_missing_list.to_csv('retail_missing_data_list.csv')

In [18]:
list = df_good.columns

res = [sub.replace(' ', '_') for sub in list] 
df_good.columns = res 
df_good.columns

Index(['Calendar_years', 'Austria', 'Cyprus', 'Denmark', 'Finland', 'France',
       'Germany', 'Italy', 'Netherlands', 'Portugal', 'Spain', 'Sweden',
       'Japan', 'Norway'],
      dtype='object')

In [19]:
df_good.to_csv('retail_complete_data.csv', index=False)

In [20]:
df_missing.to_csv('retail_missing_data.csv')

In [21]:
df_drop.to_csv('ret_orginal_category_data.csv')