In [1]:
# Load DATA

import json
import pandas as pd


def load_data(file_name):
    with open(file_name + "/labels.json") as f:
        data_json = json.load(f)
    data = list()
    for i in data_json.keys():
        with open(file_name + "/texts/" + i) as f:
            data.append([data_json[i]["artist"], f.read().replace("\n", " ")])    
    return pd.DataFrame(data, columns = ['artist', 'song']) 


train_df = load_data("train")
test_df = load_data("test")

In [2]:
# Preprocessing

artist_map = {"Тартак": 0, "Океан Ельзи": 1}

train_df.artist = train_df.artist.map(artist_map)
test_df.artist = test_df.artist.map(artist_map)

X, Y = train_df["song"], train_df["artist"]

In [3]:
# Modeling

In [4]:
# Logistic regression

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer()
tfid = TfidfTransformer()
classifier = LogisticRegression()

model1 = Pipeline([
    ('vectorizer', vectorizer),
    ('tfid', tfid),
    ('classifier', classifier)
])

model1.fit(X, Y)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                ('tfid',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [5]:
# Decision tree

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer()
tfid = TfidfTransformer()
classifier = DecisionTreeClassifier()

model2 = Pipeline([
    ('vectorizer', vectorizer),
    ('tfid', tfid),
    ('classifier', classifier)
])

model2.fit(X, Y)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
       

In [6]:
# SVM

from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer()
tfid = TfidfTransformer()
classifier = SGDClassifier(random_state=42)

model3 = Pipeline([
    ('vectorizer', vectorizer),
    ("tfid", tfid),
    ('classifier', classifier)
])

model3.fit(X, Y)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                ('classifier',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
              

In [7]:
# Random Forest

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer()
tfid = TfidfTransformer()
classifier = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')

model4 = Pipeline([
    ('vectorizer', vectorizer),
    ('tfid', tfid),
    ('classifier', classifier)
])

model4.fit(X, Y)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='sqrt',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [8]:
# Estimation
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

def score(model):
    pred = model.predict(test_df["song"])
    print("AUC score", roc_auc_score(test_df["artist"], pred))
    print("Accuracy", accuracy_score(test_df["artist"], pred))

In [9]:
models = [model1, model2, model3, model4]
for i in range(len(models)): 
    print("Model " + str(i + 1))
    score(models[i])
    print("####")

Model 1
AUC score 0.875
Accuracy 0.8809523809523809
####
Model 2
AUC score 0.7818181818181817
Accuracy 0.7857142857142857
####
Model 3
AUC score 0.9500000000000001
Accuracy 0.9523809523809523
####
Model 4
AUC score 0.775
Accuracy 0.7857142857142857
####
