# Data Processing
In this notebook, I load the data needed for this project, combine the data sources, and perform the initial data processing steps. I also carry out some exploratory data analysis to identify missing values and outliers, and then apply the appropriate processing steps to handle them.

In [19]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import os
import re
import pandas as pd
import pdpipe as pdp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from functools import partial
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
from data.util.paths import DATA_PATH
from data.scripts.project_data import DataLoader
from scripts.processing import load_geoscheme_df, get_country_region_superregion, get_country_to_dict_mapping, encode_country_column
from data.util.environment_variables import COUNTRIES, SUPERREGIONS, REGIONS

In [21]:
# Load the raw API data through the project's DataLoader.
# The preview of api_df in the next cell shows one row per release.
data_loader = DataLoader()
api_df = data_loader.load_api_data()

In [22]:
# Preview the first rows of the raw API data.
api_df.head()

Unnamed: 0,id,release_id,title,year,country,genre,style,label,community_have,community_want,formats,master_id,thumb_url,release_url
0,1,11918321,Bing Crosby - Crosbyana,1934,Canada,b'\x80\x03]q\x00(X\x04\x00\x00\x00Jazzq\x01X\x...,b'\x80\x03]q\x00(X\x08\x00\x00\x00Big Bandq\x0...,b'\x80\x03]q\x00(X\x05\x00\x00\x00Deccaq\x01X\...,1,2,b'\x80\x03]q\x00}q\x01(X\x03\x00\x00\x00qtyq\x...,1354381,https://img.discogs.com/J7vwmOhWMdUJ5vYuaYZvIj...,https://api.discogs.com/releases/11918321
1,2,10550056,Bing Crosby - Crosbyana,1934,US,b'\x80\x03]q\x00(X\x04\x00\x00\x00Jazzq\x01X\x...,b'\x80\x03]q\x00(X\x08\x00\x00\x00Big Bandq\x0...,b'\x80\x03]q\x00X\x05\x00\x00\x00Deccaq\x01a.',4,4,b'\x80\x03]q\x00}q\x01(X\x03\x00\x00\x00qtyq\x...,1354381,https://img.discogs.com/aLpqYUso3yY53XDHwiqqB-...,https://api.discogs.com/releases/10550056
2,3,6910984,Tommy Dorsey And His Clambake Seven - Tommy Do...,1935,US,b'\x80\x03]q\x00X\x04\x00\x00\x00Jazzq\x01a.',b'\x80\x03]q\x00X\x05\x00\x00\x00Swingq\x01a.',b'\x80\x03]q\x00(X\r\x00\x00\x00Swing Classicq...,4,2,b'\x80\x03]q\x00}q\x01(X\x0c\x00\x00\x00descri...,0,https://img.discogs.com/FkuaZ7cqdBt5-TvS2ck0hP...,https://api.discogs.com/releases/6910984
3,4,12959431,"Paul Whiteman And His Orchestra, Bix Beiderbec...",1936,US,b'\x80\x03]q\x00X\x04\x00\x00\x00Jazzq\x01a.',b'\x80\x03]q\x00X\x05\x00\x00\x00Swingq\x01a.',b'\x80\x03]q\x00X\x06\x00\x00\x00Victorq\x01a.',3,3,b'\x80\x03]q\x00}q\x01(X\x0c\x00\x00\x00descri...,0,https://img.discogs.com/Wg31rg_x6TtOMz-jIDC-3l...,https://api.discogs.com/releases/12959431
4,5,4453491,Jimmie Lunceford And His Orchestra - For Dance...,1937,Canada,b'\x80\x03]q\x00X\x04\x00\x00\x00Jazzq\x01a.',b'\x80\x03]q\x00.',b'\x80\x03]q\x00(X\x05\x00\x00\x00Deccaq\x01X\...,5,2,b'\x80\x03]q\x00}q\x01(X\x0c\x00\x00\x00descri...,1519538,,https://api.discogs.com/releases/4453491


In [23]:
# Split the combined title on the first ' - ' only (n=1) and append the two
# resulting columns to api_df. With expand=True the new columns get the
# default integer labels 0 and 1; they are renamed to 'artist' and 'title'
# by the processing pipeline defined further down.
api_df = pd.concat(
    [
        api_df,
        api_df['title'].str.split(' - ', n=1, expand=True),
    ],
    axis=1,
)

In [24]:
# Load the geoscheme table via the project helper.
geoscheme_df = load_geoscheme_df()
# Bind geoscheme_df as the first positional argument so the lookup can be
# called with only its remaining argument(s).
country_region_continent = partial(get_country_region_superregion,geoscheme_df)

In [25]:
# Mapping applied to the 'country' column by the processing pipeline below.
# NOTE(review): presumably maps each raw country name to a dict of
# country/region/superregion values — confirm in scripts.processing.
country_to_dict_mapping = get_country_to_dict_mapping()

In [26]:
# First-pass cleaning pipeline, composed as a single chain of pdpipe stages.
# Stage order matters: the original 'title' column must be dropped before
# column 1 is renamed to 'title'.
pipeline_1 = (
    # Drop rows that repeat a release_id.
    pdp.DropDuplicates('release_id')
    # Remove the combined "artist - title" column and the 'id' column.
    + pdp.ColDrop(['title', 'id'])
    # Name the two integer-labelled columns produced by the title split.
    + pdp.ColRename({0: 'artist', 1: 'title'})
    # These columns hold pickled bytes; unpickle them into Python objects.
    # SECURITY: pickle.loads can execute arbitrary code, so this is only safe
    # while the stored API data comes from a trusted source.
    + pdp.ApplyByCols(['genre', 'style', 'label', 'formats'], pickle.loads)
    # Replace each country value using the prepared mapping.
    + pdp.MapColVals('country', country_to_dict_mapping)
)

In [27]:
# Run the cleaning pipeline and renumber the rows from 0, discarding the old
# index (dropping duplicate rows leaves gaps in it).
# NOTE(review): presumably the fresh index is also what lets the later
# column-wise concat with the encoded country frame line up — confirm.
api_df = pipeline_1.apply(api_df).reset_index(drop=True)

In [10]:
# Encode the mapped 'country' column with the project helper; the column
# listing in the next cell shows superregion_*, region_* and country_* columns.
encoded_country_df = encode_country_column(api_df['country'])

100%|██████████| 29/29 [00:00<00:00, 96.75it/s]


In [13]:
# Inspect the names of the encoded columns.
encoded_country_df.columns

Index(['superregion_africa', 'superregion_americas', 'superregion_asia',
       'superregion_europe', 'superregion_oceania',
       'region_australia and new zealand', 'region_caribbean',
       'region_central america', 'region_central asia', 'region_eastern asia',
       ...
       'country_zambia', 'country_zimbabwe', 'country_aruba',
       'country_curacao', 'country_sint maarten (dutch part)',
       'country_bahrain', 'country_oman', 'country_qatar',
       'country_virgin islands (u.s.)', 'country_virgin islands (british)'],
      dtype='object', length=178)

In [16]:
# Append the encoded country columns to the release data.
#
# Columns left over from a previous run of this cell are dropped first, so
# re-running the cell does not append a second copy of every encoded column
# (which would leave api_df with duplicate column labels). On a fresh kernel
# nothing overlaps and the result is identical to a plain concat.
#
# NOTE(review): the outputs saved with this notebook (a KeyError for
# 'country' and a preview without that column) suggest an earlier version of
# this cell also dropped the raw 'country' column — confirm whether it should
# still be removed here.
already_encoded = api_df.columns.intersection(encoded_country_df.columns)
api_df = pd.concat(
    [api_df.drop(columns=already_encoded), encoded_country_df],
    axis=1,
)

KeyError: "['country'] not found in axis"

In [18]:
# Preview the processed data together with the encoded country columns.
api_df.head()

Unnamed: 0,release_id,year,genre,style,label,community_have,community_want,formats,master_id,thumb_url,...,country_zambia,country_zimbabwe,country_aruba,country_curacao,country_sint maarten (dutch part),country_bahrain,country_oman,country_qatar,country_virgin islands (u.s.),country_virgin islands (british)
0,11918321,1934,"[Jazz, Pop]","[Big Band, Vocal]","[Decca, The Compo Company Ltd.]",1,2,"[{'qty': '6', 'descriptions': ['10""', '78 RPM'...",1354381,https://img.discogs.com/J7vwmOhWMdUJ5vYuaYZvIj...,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10550056,1934,"[Jazz, Pop]","[Big Band, Vocal]",[Decca],4,4,"[{'qty': '6', 'descriptions': ['10""', '78 RPM'...",1354381,https://img.discogs.com/aLpqYUso3yY53XDHwiqqB-...,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6910984,1935,[Jazz],[Swing],"[Swing Classic, RCA Victor]",4,2,"[{'descriptions': ['10""', '78 RPM', 'Album', '...",0,https://img.discogs.com/FkuaZ7cqdBt5-TvS2ck0hP...,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12959431,1936,[Jazz],[Swing],[Victor],3,3,"[{'descriptions': ['10""', '78 RPM', 'Album', '...",0,https://img.discogs.com/Wg31rg_x6TtOMz-jIDC-3l...,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4453491,1937,[Jazz],[],"[Decca, Decca Records, Inc.]",5,2,"[{'descriptions': ['10""', '78 RPM', 'Album'], ...",1519538,,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Test Data Specific Transformations
- Any imputations 