In [None]:
%load_ext autoreload
%autoreload 2
from infovis21.datamodel.Track import Track
from infovis21.datamodel.User import User
from infovis21.mongodb import MongoAccess as ma
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

In [None]:
# Sanity check: count the documents that do / do not carry genre information.
# The "without" count is redundant (it could be derived from the total collection size,
# which would be faster), but running it also checks the edge-case values of `genres`.


def _genre_count_pipeline(comparison):
    """Build the aggregation pipeline that filters documents by their number of genres.

    The first stage stores the length of `genres` in `n_of_genres`; `$ifNull` substitutes
    an empty array when `genres` is null or missing, so such documents get a count of 0.
    The second stage keeps the documents whose count satisfies `{comparison: 0}`.
    """
    return [
        {'$set': {'n_of_genres': {'$size': {'$ifNull': ["$genres", []]}}}},
        {'$match': {'n_of_genres': {comparison: 0}}},
    ]


pipeline_with_genre = _genre_count_pipeline('$gt')
pipeline_without_genre = _genre_count_pipeline('$eq')

# Artists
res_with_genre = list(ma.coll_artists.aggregate(pipeline_with_genre))
print(f'With genre info in artists {len(res_with_genre)}')
res_without_genre = list(ma.coll_artists.aggregate(pipeline_without_genre))
print(f'Without genre info in artists {len(res_without_genre)}')

# Tracks -- note that the two result lists are overwritten with the track documents.
res_with_genre = list(ma.coll_tracks.aggregate(pipeline_with_genre))
print(f'With genre info in tracks {len(res_with_genre)}')
res_without_genre = list(ma.coll_tracks.aggregate(pipeline_without_genre))
print(f'Without genre info in tracks {len(res_without_genre)}')

In [None]:
# pipeline = [
#     {'$unwind' : '$genres'},
#     { '$unset': ['_id'] },
# ]
# features = [
#     "danceability",
#     "duration_ms",
#     "energy",
#     "instrumentalness",
#     "liveness",
#     "loudness",
#     "speechiness",
#     "tempo",
#     "valence",
#     "popularity",
#     "key",
#     "mode",
#     "acousticness",
#     'explicit',
#     'year',
#   ]
# target = ['genres']

# Load the artists collection into a DataFrame.
# NOTE(review): presumably `ma.get_collection` returns one record per artist document --
# confirm against infovis21.mongodb.MongoAccess.
df_artists = pd.DataFrame(data=ma.get_collection(ma.coll_artists))



In [None]:
# NOTE: a large share of artists have no genre associated with them in the
# data_w_genres.csv file. The data loaded here, however, does not appear to have any
# missing genres for artists.

# `d_i` is the working frame for the imputation steps that follow. This binds a second
# name to the same DataFrame object as `df_artists`; it is not a copy.
d_i = df_artists

In [None]:
# Drop the columns that are not passed to the imputer (`name` and `popularity` are
# concatenated back in after imputation).
# The frame is derived from `df_artists` rather than from `d_i` itself so that this cell
# is idempotent: re-running it no longer raises a KeyError for already-removed columns.
d_i = df_artists.drop(columns=['name', 'popularity', 'id', 'labels', 'preview_url'])

In [None]:
# from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

# Encode every artist's `genres` value as an integer label. The value is stringified
# first, so the encoder's classes are the string form of whatever `genres` holds.
#
# The labels are always computed from the untouched `df_artists.genres` instead of from
# `d_i.genres`. Re-running this cell therefore cannot re-fit the encoder on integers that
# were already encoded, which would make the later `enc.inverse_transform` return digit
# strings instead of genres. `d_i` has the same rows as `df_artists` (only columns were
# dropped), so the positional assignment below lines up row for row.
enc = LabelEncoder()
d_i['genres'] = enc.fit_transform(df_artists['genres'].astype(str))

In [None]:
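# TODO: replace the encoded label that stands for a missing genre with NaN.
# Until this is done the encoded `genres` column contains no NaN (every value, including
# an empty one, received an integer label), so KNNImputer has nothing to fill in there.
# d_i.genres = d_i.genres.replace( ### LABEL WHICH EQUALS MISSING VALUE #### , np.NaN)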


# parameter tuning
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def rmse(y, yhat):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y: Ground-truth values.
        yhat: Predicted values, passed together with `y` to `mean_squared_error`.

    Returns:
        The square root of `mean_squared_error(y, yhat)`.
    """
    return np.sqrt(mean_squared_error(y, yhat))


def optimize_k(data, target, k_values=range(1, 20, 2), random_state=42):
    """Score candidate KNNImputer neighbour counts by downstream prediction error.

    For every candidate ``k`` the frame is imputed with ``KNNImputer(n_neighbors=k)``,
    split 80/20 into train and test sets, and a ``RandomForestRegressor`` is fitted to
    predict ``target`` from the remaining columns. The RMSE on the test split is recorded.

    Args:
        data: DataFrame to impute. Its own column labels are re-applied to the imputed
            array, so it must expose ``.columns`` and contain ``target``.
        target: Name of the column used as the regression target.
        k_values: Iterable of neighbour counts to evaluate. Defaults to the odd numbers
            from 1 to 19, which is the grid the notebook originally hard-coded.
        random_state: Seed used for both the train/test split and the random forest so
            that repeated runs give the same ranking of ``k``.

    Returns:
        A list of ``{'K': k, 'RMSE': error}`` dicts, one per evaluated ``k``, in the
        order the values were tried.
    """
    errors = []
    for k in k_values:
        imputer = KNNImputer(n_neighbors=k)
        imputed = imputer.fit_transform(data)
        # Label the imputed array with the columns of the frame that was passed in,
        # not with those of a notebook-level global.
        d_imputed = pd.DataFrame(imputed, columns=data.columns)

        X = d_imputed.drop(columns=target)
        y = d_imputed[target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state
        )

        # Seeding the forest makes the per-k RMSE (and hence the "best" k) reproducible.
        model = RandomForestRegressor(random_state=random_state)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        errors.append({'K': k, 'RMSE': rmse(y_test, preds)})

    return errors

In [None]:
# Evaluate the candidate neighbour counts on the working frame, using the encoded
# `genres` column as the regression target. `k_errors` is a list of {'K', 'RMSE'} dicts.
k_errors = optimize_k(data=d_i, target='genres')

# Observed on a previous run with k in range(1, 20, 2): the lowest error was at K = 9.


In [None]:
## imputation

# n_neighbors=9 is the value the notebook reports as having the lowest RMSE in the
# optimize_k run.
# NOTE(review): the step that maps the "missing genre" label to NaN is still commented
# out in this notebook, so `genres` may contain nothing for the imputer to fill -- confirm.
imputer = KNNImputer(n_neighbors=9)

# Rebuild the DataFrame around the imputed values, restoring both the column labels and
# the row index. Keeping the index guarantees that the later column-wise concat with
# `df_artists` aligns rows correctly even if the index is not the default 0..n-1 range.
d_i = pd.DataFrame(imputer.fit_transform(d_i), columns=d_i.columns, index=d_i.index)

In [None]:
# The imputed values are floats; cast the genre labels back to int so the label encoder
# can decode them.
d_i['genres'] = d_i['genres'].astype(int)

# Re-attach the artist name and popularity columns that were dropped before imputation.
d_i = pd.concat([d_i, df_artists[['name', 'popularity']]], axis=1)

In [None]:
# Translate the integer labels back into the genre strings the encoder was fitted on.
d_i['genres'] = enc.inverse_transform(d_i['genres'])

# d_i.genres.head()