In [5]:
import pandas as pd
import numpy as np

In [6]:
# Load the transposed retail-price table from disk into a DataFrame named `df`.
df = pd.read_csv(filepath_or_buffer='trans_retail_prices.csv')

In [7]:
# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,Calendar years,Unnamed: 1,European Union,Austria,Belgium,Bulgaria,Cyprus,Czech Republic,Denmark,Finland,...,Slovenia,Spain,Sweden,United Kingdom 1,Unnamed: 26,Japan,Norway,Russian Federation,Switzerland,USA
0,1990,,,4.9,3.27,,2.83,,3.81,2.98,...,,3.62,3.43,10.55,,10.26,3.31,,4.83,2.97
1,1991,,,4.57,2.92,,2.8,,3.67,2.72,...,,3.5,3.29,10.41,,11.88,3.16,,4.36,2.81
2,1992,,,4.99,3.05,,2.87,,3.79,2.46,...,,3.5,3.11,10.09,,12.62,3.12,,4.52,2.58
3,1993,,,4.97,2.78,,2.6,,3.48,1.94,...,,2.79,2.65,8.44,,14.57,2.46,,4.27,2.47
4,1994,,,4.58,3.42,,3.18,,4.45,2.73,...,,2.76,3.79,11.36,,14.69,3.29,,4.5,3.4


## Peek Analysis
It is immediately evident that transposing the sheet in Excel created some minor problems:

(a) the data set contains columns that were only intended as headings for groups of countries (e.g. 'European Union').

(b) there are unnamed columns (e.g. 'Unnamed: 1') with nothing in them.

(c) there are columns with a '1' appended to the name (e.g. 'Malta 1'), which marks a special type of coffee (soluble).

In [11]:
# Dtypes look clean: the year column is int64 and every other column is float64,
# so no type coercion is needed before counting missing values.
# NOTE(review): several labels in the output below (e.g. '   Austria') appear to
# carry leading spaces from the spreadsheet layout - confirm whether the column
# names should be stripped before they are used as keys or written to CSV.
df.dtypes

Calendar years           int64
Unnamed: 1             float64
European Union         float64
   Austria             float64
   Belgium             float64
   Bulgaria            float64
   Cyprus              float64
   Czech Republic      float64
   Denmark             float64
   Finland             float64
   France              float64
   Germany             float64
   Hungary             float64
   Italy               float64
   Latvia              float64
   Lithuania           float64
   Luxembourg          float64
   Malta 1             float64
   Netherlands         float64
   Poland              float64
   Portugal            float64
   Slovakia            float64
   Slovenia            float64
   Spain               float64
   Sweden              float64
   United Kingdom 1    float64
Unnamed: 26            float64
Japan                  float64
Norway                 float64
Russian Federation     float64
Switzerland            float64
USA                    float64
dtype: object

In [12]:
# Column labels as plain strings, so the substring test below works on every label
cols = df.columns.astype(str).tolist()

# Every column whose label contains "Unnamed" is unwanted; keep the labels
# and how many of them there are.
lst_garb = [label for label in cols if "Unnamed" in label]
count = len(lst_garb)

# Report the number of unwanted columns, then their names
print(count)
print(lst_garb)

2
['Unnamed: 1', 'Unnamed: 26']


In [13]:
# df1 is df without the unwanted "Unnamed" columns; df itself is left untouched.
df1 = df.drop(labels=lst_garb, axis='columns')
df1.head(n=5)

Unnamed: 0,Calendar years,European Union,Austria,Belgium,Bulgaria,Cyprus,Czech Republic,Denmark,Finland,France,...,Slovakia,Slovenia,Spain,Sweden,United Kingdom 1,Japan,Norway,Russian Federation,Switzerland,USA
0,1990,,4.9,3.27,,2.83,,3.81,2.98,3.73,...,,,3.62,3.43,10.55,10.26,3.31,,4.83,2.97
1,1991,,4.57,2.92,,2.8,,3.67,2.72,3.43,...,,,3.5,3.29,10.41,11.88,3.16,,4.36,2.81
2,1992,,4.99,3.05,,2.87,,3.79,2.46,2.31,...,,,3.5,3.11,10.09,12.62,3.12,,4.52,2.58
3,1993,,4.97,2.78,,2.6,,3.48,1.94,2.03,...,,,2.79,2.65,8.44,14.57,2.46,,4.27,2.47
4,1994,,4.58,3.42,,3.18,,4.45,2.73,2.4,...,,,2.76,3.79,11.36,14.69,3.29,,4.5,3.4


In [23]:
# Classify every column of df1 except the first one by how many of its cells
# are missing, print one report block per column, then print a summary.
cols = df1.columns[1:]

# Number of rows each column is expected to fill (29 in the recorded run)
exp_row = len(df1.index)

# Buckets filled by the loop below
lst_col_drop = []  # columns in which every cell is missing
lst_col_miss = []  # not populated in this cell; kept so the name stays defined
lst_col_good = []  # columns without a single missing cell

# Parallel lists (column label, missing-cell count) used to create a df for missing
lst_key_miss = []
lst_val_miss = []

# Collect columns with missing data for a dataframe
lst_dict_miss = []  # not populated in this cell; kept so the name stays defined

for col in cols:
    series = df1[col]

    # A cell is missing when it is NaN/None or an empty string. The count is
    # taken in one vectorised pass instead of a Python loop over the cells;
    # int() keeps plain Python integers in the result lists.
    count_na = int((series.isna() | series.eq('')).sum())
    key = col
    val = count_na

    # Exactly one branch runs per column. On a frame with zero rows every
    # column falls into the "only missing values" branch instead of being
    # reported twice (once as empty and once as complete).
    if 0 < count_na < exp_row:
        # Country has some, but not only, missing values
        lst_key_miss.append(key)
        lst_val_miss.append(val)

        print(f"""
        x{'-'*4}(Missing Values){'-'*4}x
        {key} : {val}
        """)
    elif count_na == exp_row:
        # Column only has missing values
        lst_col_drop.append(col)

        print(f"""
        #{'-'*4}(Region Category){'-'*4}#
        {key} : {val}
        """)
    else:
        # Country has no missing values
        lst_col_good.append(col)

        print(f"""
        ${'-'*4}(Good Series){'-'*4}$
        {key} : {val}
        """)

# NOTE: the "total columns" figure counts every column of df1, including the
# first column that the loop above skips, so it is one larger than the sum of
# the three buckets.
print(f"""
=====
Summary
-----
{len(df1.columns)} total columns.

{len(lst_col_drop)} columns to drop.
{len(lst_key_miss)} columns with missing data.
{len(lst_col_good)} columns with complete data.
""")


        #----(Region Category)----#
        European Union : 29
        

        $----(Good Series)----$
           Austria : 0
        

        x----(Missing Values)----x
           Belgium : 5
        

        x----(Missing Values)----x
           Bulgaria : 12
        

        $----(Good Series)----$
           Cyprus : 0
        

        x----(Missing Values)----x
           Czech Republic : 7
        

        $----(Good Series)----$
           Denmark : 0
        

        $----(Good Series)----$
           Finland : 0
        

        $----(Good Series)----$
           France : 0
        

        $----(Good Series)----$
           Germany : 0
        

        x----(Missing Values)----x
           Hungary : 4
        

        $----(Good Series)----$
           Italy : 0
        

        x----(Missing Values)----x
           Latvia : 3
        

        x----(Missing Values)----x
           Lithuania : 7
        

        x----(Missing Values)----x
           Luxembourg

=====
Summary
-----
30 total columns.

1 columns to drop.
14 columns with missing data.
14 columns with complete data.

In [19]:
# One row per partially-filled column: its label and its count of missing rows.
df_missing_list = pd.DataFrame({
    'Country': lst_key_miss,
    'Missing_Rows': lst_val_miss,
})

# Write the summary table next to the notebook.
df_missing_list.to_csv(path_or_buf='retail_missing_data_list.csv')

In [20]:
# Keep only the columns without any missing values and write them to CSV.
# NOTE(review): the first column of df1 (the year) is never added to
# lst_col_good, so the saved file has no year key - confirm this is intended.
df_good = df1.loc[:, lst_col_good]
df_good.to_csv(path_or_buf='retail_complete_data.csv')

In [21]:
# Keep only the columns that have some, but not only, missing values and save them.
df_missing = df1.loc[:, lst_key_miss]
df_missing.to_csv(path_or_buf='retail_missing_data.csv')

In [22]:
# Keep only the columns in which every cell is missing and save them.
# The output file name is reproduced exactly as the original cell wrote it.
df_drop = df1.loc[:, lst_col_drop]
df_drop.to_csv(path_or_buf='ret_orginal_category_data.csv')