In [1]:
import pandas as pd
import numpy as np
import pickle
import csv
from operator import itemgetter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

def create_train_validation_test(df, genre, n_non_genre=4000, test_size=0.2, random_state=None):
    """Build a binary "genre vs. rest" train/validation/test split.

    Every row whose ``genre`` column differs from ``genre`` is relabelled
    ``'no'``. All rows of the target genre are kept, and up to ``n_non_genre``
    of the remaining rows are sampled at random. The combined subset is split
    into train/test, and the train part is split again into train/validation.

    The caller's ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'genre'`` and ``'track_id'``; every other
        column is treated as a feature.
    genre : str
        Label of the positive class.
    n_non_genre : int, default 4000
        Maximum number of non-``genre`` rows to sample. Clamped to the number
        of such rows actually available.
    test_size : float, default 0.2
        Fraction passed to ``train_test_split`` for both the train/test and
        the train/validation split.
    random_state : int or None, default None
        Seed forwarded to the sampling and to both splits; ``None`` keeps the
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_validation, y_validation, X_test, y_test)``
        where each ``X_*`` is a DataFrame without ``'genre'``/``'track_id'``
        and each ``y_*`` is a plain list of labels (``genre`` or ``'no'``).
    """
    # Relabel on a new frame so the caller's DataFrame keeps its original labels.
    binary_labels = df['genre'].where(df['genre'] == genre, 'no')
    labelled = df.assign(genre=binary_labels)

    is_genre = labelled['genre'] == genre
    df_genre = labelled.loc[is_genre]
    df_rest = labelled.loc[~is_genre]

    # Never ask for more rows than exist; DataFrame.sample would raise otherwise.
    df_non_genre = df_rest.sample(n=min(n_non_genre, len(df_rest)), random_state=random_state)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    subset = pd.concat([df_genre, df_non_genre])

    train, test = train_test_split(subset, test_size=test_size, random_state=random_state)
    train, validation = train_test_split(train, test_size=test_size, random_state=random_state)

    def _features_and_labels(part):
        # Identifier and target columns must not leak into the feature matrix.
        return part.drop(['genre', 'track_id'], axis=1), list(part['genre'])

    X_train, y_train = _features_and_labels(train)
    X_validation, y_validation = _features_and_labels(validation)
    X_test, y_test = _features_and_labels(test)

    return X_train, y_train, X_validation, y_validation, X_test, y_test

In [2]:
def evaluate(model, X_test, y_test):
    """Score a fitted model on one labelled data set.

    Calls ``model.predict(X_test)`` and compares the predictions with
    ``y_test``.

    Returns a 2-tuple: the confusion matrix flattened with ``ravel()`` and
    the accuracy score.
    """
    predicted = model.predict(X_test)
    matrix = confusion_matrix(y_true=y_test, y_pred=predicted)
    accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
    return matrix.ravel(), accuracy

In [3]:
def _load_records(path):
    """Read the tab-separated track table at ``path`` and one-hot encode its ``key`` column."""
    data = pd.read_csv(path, sep='\t')
    # Drop the last character of every genre label.
    # NOTE(review): presumably a stray trailing character in the TSV -- confirm against the file.
    data['genre'] = data['genre'].str[:-1]

    one_hot_key = pd.get_dummies(data['key'])
    # Name the dummy columns key_0, key_1, ... by position. Sizing the list from
    # the frame avoids a length-mismatch error when the file does not contain
    # exactly 12 distinct keys.
    one_hot_key.columns = ['key_' + str(k) for k in range(one_hot_key.shape[1])]

    data = pd.concat([data.drop(['key'], axis=1), one_hot_key], axis=1)
    return data.fillna(0)


def test_models(genre, path='./records_4000.tsv'):
    """Train and score an L1-regularised logistic regression for one genre.

    Loads the track table from ``path``, builds a "genre vs. rest" split with
    ``create_train_validation_test``, fits the model on the training part and
    prints the genre name followed by ``[test_accuracy, train_accuracy]``.

    Parameters
    ----------
    genre : str
        Genre label to use as the target class.
    path : str, default './records_4000.tsv'
        Tab-separated input file; it must provide ``genre``, ``key`` and
        ``track_id`` columns.

    Returns
    -------
    None
        Results are only printed.
    """
    data = _load_records(path)

    # The validation part is produced by the split but not used here.
    X_train, y_train, _, _, X_test, y_test = create_train_validation_test(data, genre)

    # penalty='l1' needs a solver that supports it. 'liblinear' was the default
    # when this notebook was written; the current default ('lbfgs') rejects L1.
    models = [LogisticRegression(penalty='l1', solver='liblinear')]
    for model in models:
        model.fit(X_train, y_train)

    print(genre)

    results = []
    for model in models:
        # Test score first, then train score, for each model.
        results.append(evaluate(model, X_test, y_test))
        results.append(evaluate(model, X_train, y_train))
    print([accuracy for _, accuracy in results])
        

In [None]:
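# evaluate() returns the confusion matrix flattened in this order: tn, fp, fn, tp.
# NOTE: scikit-learn orders the labels by sorted value, so 'no' (lowercase) sorts after
# the capitalised genre names and is the "positive" class in that ordering.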

In [None]:
# Run the genre-vs-rest experiment once for every genre label.
genres = [
    "Pop_Rock",
    "Electronic",
    "Rap",
    "Blues",
    "Country",
    "Jazz",
    "RnB",
    "Reggae",
    "Latin",
    "New Age",
    "International",
    "Folk",
    "Vocal",
]
for genre_name in genres:
    test_models(genre_name)

Pop_Rock
[0.72375, 0.75039062499999998]
Electronic
[0.74312500000000004, 0.73007812500000002]
Rap
[0.78625, 0.80449218749999996]
Blues
[0.66062500000000002, 0.65468749999999998]
