# Music Recommender Data Collection

### Import Libraries and Data

In [None]:
import numpy as np
import pandas as pd
import sqlite3 as sql

Note: The data comes from the Pitchfork Reviews dataset on Kaggle:
https://www.kaggle.com/nolanbconaway/pitchfork-data

### Import data (SQLite)

In [None]:
#open the raw Pitchfork SQLite database in read-only mode
#NOTE: a plain sql.connect(path) silently creates a new, empty database file
#when the path is wrong, and the mistake only shows up later as a
#"no such table" error. Opening through a URI with mode=ro raises right here
#if the file is missing and guarantees this notebook never modifies raw data.
db = sql.connect('file:./data/raw/pitchfork_reviews.sqlite?mode=ro', uri=True)

In [None]:
#load score, title, artist and url for every review from the `reviews` table
#(reviewid is selected too because the merge further down joins on it)
scores = pd.read_sql(
    'SELECT reviewid, score, title, artist, url FROM reviews',
    con=db,
)
#show the first row as a quick sanity check of the columns
scores.head(n=1)

In [None]:
#load the review text from the `content` table, keyed by reviewid
content = pd.read_sql(
    'SELECT reviewid, content FROM content',
    con=db,
)
#show the first row as a quick sanity check of the columns
content.head(n=1)

In [None]:
#load the genre label(s) from the `genres` table, keyed by reviewid
genres = pd.read_sql(
    'SELECT reviewid, genre FROM genres',
    con=db,
)
#show the first row as a quick sanity check of the columns
genres.head(n=1)

In [None]:
#import year data from sql
year = pd.read_sql('SELECT reviewid, year FROM years', db)
#this is the last query against the database, so release the connection
#instead of leaving it open for the rest of the session
#(re-run the connect cell before re-running any of the import cells)
db.close()
#show the first row as a quick sanity check of the columns
year.head(1)

### Create merged dataframe with all music review data

In [None]:
#join the four tables one after another on their shared reviewid key
#how='inner' is the pandas default, spelled out here: only reviews that
#appear in every table survive the chain
#NOTE(review): if genres or years hold several rows per review, each join
#multiplies that review's rows -- confirm against the raw tables
merged = scores.merge(content, how='inner', on='reviewid')
merged2 = merged.merge(genres, how='inner', on='reviewid')
merged3 = merged2.merge(year, how='inner', on='reviewid')

In [None]:
#size of the merged dataframe as (rows, columns)
merged3.shape

In [None]:
#preview the first merged row to confirm all columns came through
merged3.head(n=1)

In [None]:
#count the missing values in each column (isna is the same check as isnull)
merged3.isna().sum()

In [None]:
#shape again, immediately before rows are dropped in the cells below
merged3.shape

In [None]:
#drop any null values or duplicates

In [None]:
#discard every row that is missing an artist, a title or the review text
#a single call with the default how='any' removes exactly the rows that
#three consecutive single-column drops would remove
merged3.dropna(subset=['artist', 'title', 'content'], inplace=True)

In [None]:
#keep one row per distinct review text (the first occurrence wins)
#NOTE(review): if a review ended up with several genre/year rows after the
#merge, only the first of them survives this step -- confirm that is intended
#no extra dropna on 'content' is needed here: rows with null content are
#already removed in the previous cell and drop_duplicates cannot add nulls
merged3.drop_duplicates(subset=['content'], inplace=True)

### Export merged dataframe for EDA, modeling, etc.

In [None]:
#write the cleaned reviews to disk for the later notebooks
#index=False leaves the pandas row index out of the file
merged3.to_csv(path_or_buf='./data/clean/pitchfork.csv', index=False)