# Data Processing
In this notebook, I load the data needed for this project, combine the data sources, and perform the initial data processing steps. I also carry out some exploratory data analysis to identify missing values and outliers, and then apply the appropriate processing steps to handle them.

In [121]:
import pdpipe as pdp
import pycountry as pyc

In [8]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
from data.util.paths import DATA_PATH
from data.scripts.proposal_data import *

In [95]:
# Load the Discogs API release records into a DataFrame.
# NOTE(review): load_api_data is presumably provided by the star import from
# data.scripts.proposal_data — confirm.
api_df = load_api_data()

In [96]:
# Peek at the first five rows to check the columns and the raw (still pickled) values.
api_df.head(n=5)

Unnamed: 0,id,release_id,title,year,country,genre,style,label,community_have,community_want,formats,master_id,thumb_url,release_url
0,1,11918321,Bing Crosby - Crosbyana,1934,Canada,b'\x80\x03]q\x00(X\x04\x00\x00\x00Jazzq\x01X\x...,b'\x80\x03]q\x00(X\x08\x00\x00\x00Big Bandq\x0...,b'\x80\x03]q\x00(X\x05\x00\x00\x00Deccaq\x01X\...,1,2,b'\x80\x03]q\x00}q\x01(X\x03\x00\x00\x00qtyq\x...,1354381,https://img.discogs.com/J7vwmOhWMdUJ5vYuaYZvIj...,https://api.discogs.com/releases/11918321
1,2,10550056,Bing Crosby - Crosbyana,1934,US,b'\x80\x03]q\x00(X\x04\x00\x00\x00Jazzq\x01X\x...,b'\x80\x03]q\x00(X\x08\x00\x00\x00Big Bandq\x0...,b'\x80\x03]q\x00X\x05\x00\x00\x00Deccaq\x01a.',4,4,b'\x80\x03]q\x00}q\x01(X\x03\x00\x00\x00qtyq\x...,1354381,https://img.discogs.com/aLpqYUso3yY53XDHwiqqB-...,https://api.discogs.com/releases/10550056
2,3,6910984,Tommy Dorsey And His Clambake Seven - Tommy Do...,1935,US,b'\x80\x03]q\x00X\x04\x00\x00\x00Jazzq\x01a.',b'\x80\x03]q\x00X\x05\x00\x00\x00Swingq\x01a.',b'\x80\x03]q\x00(X\r\x00\x00\x00Swing Classicq...,4,2,b'\x80\x03]q\x00}q\x01(X\x0c\x00\x00\x00descri...,0,https://img.discogs.com/FkuaZ7cqdBt5-TvS2ck0hP...,https://api.discogs.com/releases/6910984
3,4,12959431,"Paul Whiteman And His Orchestra, Bix Beiderbec...",1936,US,b'\x80\x03]q\x00X\x04\x00\x00\x00Jazzq\x01a.',b'\x80\x03]q\x00X\x05\x00\x00\x00Swingq\x01a.',b'\x80\x03]q\x00X\x06\x00\x00\x00Victorq\x01a.',3,3,b'\x80\x03]q\x00}q\x01(X\x0c\x00\x00\x00descri...,0,https://img.discogs.com/Wg31rg_x6TtOMz-jIDC-3l...,https://api.discogs.com/releases/12959431
4,5,4453491,Jimmie Lunceford And His Orchestra - For Dance...,1937,Canada,b'\x80\x03]q\x00X\x04\x00\x00\x00Jazzq\x01a.',b'\x80\x03]q\x00.',b'\x80\x03]q\x00(X\x05\x00\x00\x00Deccaq\x01X\...,5,2,b'\x80\x03]q\x00}q\x01(X\x0c\x00\x00\x00descri...,1519538,,https://api.discogs.com/releases/4453491


In [97]:
# Sanity check: the genre column holds pickled bytes; decode each entry to see the lists.
# NOTE(review): pickle.loads can execute arbitrary code — only safe on trusted data.
api_df['genre'].map(pickle.loads)

0                                               [Jazz, Pop]
1                                               [Jazz, Pop]
2                                                    [Jazz]
3                                                    [Jazz]
4                                                    [Jazz]
                                ...                        
448897           [Jazz, Latin, Pop, Folk, World, & Country]
448898    [Electronic, Hip Hop, Jazz, Latin, Pop, Folk, ...
448899    [Electronic, Hip Hop, Jazz, Rock, Funk / Soul,...
448900    [Jazz, Pop, Folk, World, & Country, Stage & Sc...
448901    [Electronic, Hip Hop, Jazz, Rock, Funk / Soul,...
Name: genre, Length: 448902, dtype: object

In [103]:
# Split "Artist - Title" into two new integer-labelled columns, 0 and 1.
# n=1 splits only on the first ' - ', so a title that itself contains the
# separator stays in one piece.
# Columns 0/1 left over from a previous run of this cell are dropped first, so
# re-running the cell does not keep appending duplicate columns.
api_df = pd.concat(
    [
        api_df.drop(columns=[0, 1], errors='ignore'),
        api_df['title'].str.split(' - ', n=1, expand=True),
    ],
    axis=1,
)

In [118]:
# Preprocessing stages, applied in the order listed.
# NOTE(review): pickle.loads can execute arbitrary code, so this is only safe
# while the pickled columns come from a trusted source.
pipeline = pdp.PdPipeline([
    pdp.DropDuplicates('release_id'),
    pdp.ColDrop(['title', 'id']),
    # Columns 0 and 1 are the two halves of the ' - ' split of the original title.
    pdp.ColRename({0: 'artist', 1: 'title'}),
    pdp.ApplyByCols(['genre', 'style', 'label', 'formats'], pickle.loads),
    pdp.OneHotEncode("country"),
])

In [119]:
# Preview the pipeline output; the returned frame is displayed but not stored.
pipeline(api_df)

Unnamed: 0,release_id,year,genre,style,label,community_have,community_want,formats,master_id,thumb_url,...,country_Unknown,country_Uruguay,country_Uzbekistan,country_Venezuela,country_Vietnam,country_Virgin Islands,country_West Bank,country_Yugoslavia,country_Zambia,country_Zimbabwe
0,11918321,1934,"[Jazz, Pop]","[Big Band, Vocal]","[Decca, The Compo Company Ltd.]",1,2,"[{'qty': '6', 'descriptions': ['10""', '78 RPM'...",1354381,https://img.discogs.com/J7vwmOhWMdUJ5vYuaYZvIj...,...,0,0,0,0,0,0,0,0,0,0
1,10550056,1934,"[Jazz, Pop]","[Big Band, Vocal]",[Decca],4,4,"[{'qty': '6', 'descriptions': ['10""', '78 RPM'...",1354381,https://img.discogs.com/aLpqYUso3yY53XDHwiqqB-...,...,0,0,0,0,0,0,0,0,0,0
2,6910984,1935,[Jazz],[Swing],"[Swing Classic, RCA Victor]",4,2,"[{'descriptions': ['10""', '78 RPM', 'Album', '...",0,https://img.discogs.com/FkuaZ7cqdBt5-TvS2ck0hP...,...,0,0,0,0,0,0,0,0,0,0
3,12959431,1936,[Jazz],[Swing],[Victor],3,3,"[{'descriptions': ['10""', '78 RPM', 'Album', '...",0,https://img.discogs.com/Wg31rg_x6TtOMz-jIDC-3l...,...,0,0,0,0,0,0,0,0,0,0
4,4453491,1937,[Jazz],[],"[Decca, Decca Records, Inc.]",5,2,"[{'descriptions': ['10""', '78 RPM', 'Album'], ...",1519538,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448897,14600801,2020,"[Jazz, Latin, Pop, Folk, World, & Country]","[Contemporary Jazz, Bolero, Cubano, Vocal]","[Production Dessinée, Disques Dessinee Distrib...",4,3,"[{'descriptions': ['LP', 'Album', 'Reissue', '...",1046728,https://img.discogs.com/WTkTarvjjDZPXjU9Mt7Czq...,...,0,0,0,0,0,0,0,0,0,0
448898,14609482,2020,"[Electronic, Hip Hop, Jazz, Latin, Pop, Folk, ...","[Ambient, Downtempo, Hip Hop, Easy Listening, ...","[Nailuj Music, Rebel Road Studio, San Diego, R...",1,0,"[{'descriptions': ['MP3', 'Album'], 'text': '3...",0,https://img.discogs.com/qO9PXbXugSQPdCq6xzkMHc...,...,0,0,0,0,0,0,0,0,0,0
448899,14613847,2020,"[Electronic, Hip Hop, Jazz, Rock, Funk / Soul,...","[IDM, Downtempo, K-pop, Disco, Synth-pop, Tech...",[Rubber Frog Records],2,0,"[{'descriptions': ['MP3', 'Album'], 'text': '8...",0,https://img.discogs.com/alhwY1Cr1_DCL6Lc5s3QnN...,...,0,0,0,0,0,0,0,0,0,0
448900,14658017,2020,"[Jazz, Pop, Folk, World, & Country, Stage & Sc...","[Ballad, Vocal, Swing, Novelty, Musical, Music...",[Not On Label],0,0,"[{'descriptions': ['AAC', 'AIFF', 'ALAC', 'FLA...",0,https://img.discogs.com/2GclY7QQGhupGixMXK3Zkz...,...,0,0,0,0,0,0,0,0,0,0


In [123]:
# List every distinct country label alphabetically to spot regions, unions and
# defunct states mixed in with real countries.
sorted(api_df['country'].drop_duplicates())

['Africa',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Argentina',
 'Armenia',
 'Asia',
 'Australasia',
 'Australia',
 'Australia & New Zealand',
 'Austria',
 'Azerbaijan',
 'Bahamas, The',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benelux',
 'Bermuda',
 'Bolivia',
 'Bosnia & Herzegovina',
 'Brazil',
 'Bulgaria',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Central America',
 'Chile',
 'China',
 'Colombia',
 'Congo, Democratic Republic of the',
 'Congo, Republic of the',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Czech Republic & Slovakia',
 'Czechoslovakia',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Europe',
 'Faroe Islands',
 'Finland',
 'France',
 'France & Benelux',
 'French Guiana',
 'French Polynesia',
 'Gabon',
 'Georgia',
 'German Democratic Republic (GDR)',
 'Germany',
 'Germany & Switzerland',
 'Germany, Austria, & Switzerland',
 'Ghana',
 'Greece',
 'Guadel