In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import progressbar
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

In [2]:
train = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')
songs = pd.read_csv('../Data/songs.csv').astype(str)

In [3]:
cols = ['source_system_tab', 'source_screen_name', 'source_type']

for col in tqdm(cols):
    if train[col].dtype == 'object':
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

100%|██████████| 3/3 [00:44<00:00, 14.47s/it]


In [4]:
train = train.merge(songs, on='song_id', how='left')
test = test.merge(songs, on='song_id', how='left')

In [5]:
def binning(x):
    if x == 0:
        return 0
    elif x <= 100:
        return 1
    elif x <= 500:
        return 2
    elif x <= 1000:
        return 3
    elif x <= 1500:
        return 4
    return 5

In [6]:
repeated_songs_dict = train[train.target == 1].groupby(['song_id'])['target'].count().to_dict()
def repeated_songs(x):
    try:
        count = repeated_songs_dict[x]
        return count
    except KeyError:
        return 0

#has anyone listened to the artist again?
repeated_artists_dict = train[train.target == 1].groupby(['artist_name'])['target'].count().to_dict()
def repeated_artists(x):
    try:
        count = repeated_artists_dict[x]
        return count
    except KeyError:
        return 0
    
#how frequently have people repeatedly listened to the language
repeated_lang_dict = train[train.target == 1].groupby(['language'])['target'].count().to_dict()
def repeated_lang(x):
    try:
        count = repeated_lang_dict[x]
        return count
    except KeyError:
        return 0 
    
# number of times an artist has been played 
_dict_count_artist_played_train = {k: v for k, v in train['artist_name'].value_counts().iteritems()}
def count_artist_played(x):
    try:
        return _dict_count_artist_played_train[x]
    except KeyError:
        return 0
        
# number of times a song has been played
_dict_count_song_played_train = {k: v for k, v in train['song_id'].value_counts().iteritems()}
def count_song_played(x):
    try:
        return _dict_count_song_played_train[x]
    except KeyError:
        return 0

In [7]:
train['repeated_song'] = train['song_id'].map(str).apply(repeated_songs).astype(np.int32)
train['count_artist_played'] = train['artist_name'].map(str).apply(count_artist_played).astype(np.int64)
train['count_song_played'] = train['song_id'].map(str).apply(count_song_played).astype(np.int64)
train['repeated_artist'] = train['artist_name'].map(str).apply(repeated_artists).astype(np.int32)
train['repeated_lang'] = train['language'].map(str).apply(repeated_lang).astype(np.int32)

In [8]:
repeated_dict = train[train.target == 1].groupby(['msno'])['target'].count().to_dict()
def user_repeated_songs(x):
    try:
        count = repeated_dict[x]
        return count
    except KeyError:
        return 0
    
source_system_dict = train[train.target == 1].groupby(['source_system_tab'])['target'].count().to_dict()
def source_system_repeated(x):
    try:
        count = source_system_dict[x]
        return binning(count)
    except KeyError:
        return 0    

# number of times source_screen_name has contributed to target=1
source_screen_dict = train[train.target == 1].groupby(['source_screen_name'])['target'].count().to_dict()
def source_screen_repeated(x):
    try:
        count = source_screen_dict[x]
        return binning(count)
    except KeyError:
        return 0

# number of times source_type has contributed to target=1
source_type_dict = train[train.target == 1].groupby(['source_type'])['target'].count().to_dict()
def source_type_repeated(x):
    try:
        count = source_type_dict[x]
        return binning(count)
    except KeyError:
        return 0

In [9]:
train['user_repeated'] = train['msno'].map(str).apply(user_repeated_songs).astype(np.int32)
train['s_system_repeat'] = train['source_system_tab'].map(str).apply(source_system_repeated).astype(np.int32)
train['s_screen_repeat'] = train['source_screen_name'].map(str).apply(source_screen_repeated).astype(np.int32)
train['s_type_repeat'] = train['source_type'].map(str).apply(source_type_repeated).astype(np.int32)

In [10]:
def genre_id_count(x):
    if x == 'no_genre_id':
        return 0
    else:
        return x.count('|') + 1

def lyricist_count(x):
    if x == 'no_lyricist':
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

def composer_count(x):
    if x == 'no_composer':
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

def artist_count(x):
    if x == 'no_artist':
        return 0
    else:
        return x.count('and') + x.count(',') + x.count('feat') + x.count('&')
    
# is song language 17 or 45. 
def song_lang_boolean(x):
    if '17.0' in str(x) or '45.0' in str(x):
        return 1
    return 0

In [11]:
train['genre_ids'].fillna('no_genre_id',inplace=True)
train['genre_ids_count'] = train['genre_ids'].apply(genre_id_count).astype(np.int8)

train['lyricist'].fillna('no_lyricist',inplace=True)
train['lyricists_count'] = train['lyricist'].apply(lyricist_count).astype(np.int8)

train['composer'].fillna('no_composer',inplace=True)
train['composer_count'] = train['composer'].apply(composer_count).astype(np.int8)

train['artist_name'].fillna('no_artist',inplace=True)
train['artist_count'] = train['artist_name'].apply(artist_count).astype(np.int8)
train['song_lang_boolean'] = train['language'].apply(song_lang_boolean).astype(np.int8)

In [12]:
print train.columns

Index([u'msno', u'song_id', u'source_system_tab', u'source_screen_name',
       u'source_type', u'target', u'song_length', u'genre_ids', u'artist_name',
       u'composer', u'lyricist', u'language', u'repeated_song',
       u'count_artist_played', u'count_song_played', u'repeated_artist',
       u'repeated_lang', u'user_repeated', u's_system_repeat',
       u's_screen_repeat', u's_type_repeat', u'genre_ids_count',
       u'lyricists_count', u'composer_count', u'artist_count',
       u'song_lang_boolean'],
      dtype='object')


In [15]:
X = np.array(train.drop(['target', 'msno', 'song_id', 'artist_name', 'genre_ids', 'composer', 'lyricist', 'language', 'song_length'], axis=1), dtype=np.int32)

In [16]:
clf = ExtraTreesClassifier()
clf = clf.fit(X, train.target)
clf.feature_importances_

array([ 0.03058996,  0.03644548,  0.05053757,  0.10365024,  0.05402262,
        0.08960584,  0.05357351,  0.00835636,  0.54474298,  0.        ,
        0.        ,  0.        ,  0.00321956,  0.00813599,  0.0135738 ,
        0.00253208,  0.00101401])