In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import progressbar
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

In [2]:
train = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')
songs = pd.read_csv('../Data/songs.csv', usecols = ['song_id', 'artist_name', 'language']).astype(str)

In [3]:
#positives = len(train[train.target == 1])
#negatives = len(train[train.target == 0])

In [4]:
cols = ['source_system_tab', 'source_screen_name', 'source_type']

for col in tqdm(cols):
    if train[col].dtype == 'object':
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

100%|██████████| 3/3 [00:45<00:00, 14.72s/it]


In [5]:
#Case 1: User missing in training but song is present (in training)
msno_song = test[~test.msno.isin(train.msno) & test.song_id.isin(train.song_id)]
msno_song = msno_song.merge(songs, on='song_id', how='left')

#Case 2: song missing in training but User exists
song_msno = test[~test.song_id.isin(train.song_id) & test.msno.isin(train.msno)]

#Case 3: both user and song is new
both_missing = test[~test.msno.isin(train.msno) & ~test.song_id.isin(train.song_id)]

#print msno_song.shape (163181, 6)
#print song_msno.shape (299288, 6)
#print both_missing.shape (20837, 6)

In [6]:
train = train.merge(songs, on='song_id', how='left')
test = test.merge(songs, on='song_id', how='left')

In [7]:
def binning(x):
    if x == 0:
        return 0
    elif x <= 100:
        return 1
    elif x <= 500:
        return 2
    elif x <= 1000:
        return 3
    elif x <= 1500:
        return 4
    return 5

In [8]:
#Case 1 solution: Use only song statistics when user is new
#has anyone listened to the song repeatedly
repeated_songs_dict = train[train.target == 1].groupby(['song_id'])['target'].count().to_dict()
def repeated_songs(x):
    try:
        count = repeated_songs_dict[x]
        return count
    except KeyError:
        return 0

#has anyone listened to the artist again?
repeated_artists_dict = train[train.target == 1].groupby(['artist_name'])['target'].count().to_dict()
def repeated_artists(x):
    try:
        count = repeated_artists_dict[x]
        return count
    except KeyError:
        return 0
    
#how frequently have people repeatedly listened to the language
repeated_lang_dict = train[train.target == 1].groupby(['language'])['target'].count().to_dict()
def repeated_lang(x):
    try:
        count = repeated_lang_dict[x]
        return count
    except KeyError:
        return 0 
    
# number of times an artist has been played 
_dict_count_artist_played_train = {k: v for k, v in train['artist_name'].value_counts().iteritems()}
def count_artist_played(x):
    try:
        return _dict_count_artist_played_train[x]
    except KeyError:
        return 0
        
# number of times a song has been played
_dict_count_song_played_train = {k: v for k, v in train['song_id'].value_counts().iteritems()}
def count_song_played(x):
    try:
        return _dict_count_song_played_train[x]
    except KeyError:
        return 0

In [9]:
msno_song['repeated_song'] = msno_song['song_id'].map(str).apply(repeated_songs).astype(np.int32)
msno_song['count_artist_played'] = msno_song['artist_name'].map(str).apply(count_artist_played).astype(np.int64)
msno_song['count_song_played'] = msno_song['song_id'].map(str).apply(count_song_played).astype(np.int64)
msno_song['repeated_artist'] = test['artist_name'].map(str).apply(repeated_artists).astype(np.int32)
msno_song['repeated_lang'] = test['language'].map(str).apply(repeated_lang).astype(np.int32)

In [10]:
#Case 2: Use only User Statistics when song is new
#how frequently has the user repeated songs?
repeated_dict = train[train.target == 1].groupby(['msno'])['target'].count().to_dict()
def user_repeated_songs(x):
    try:
        count = repeated_dict[x]
        return count
    except KeyError:
        return 0

In [11]:
song_msno['user_repeated'] = song_msno['msno'].map(str).apply(user_repeated_songs).astype(np.int32)

In [12]:
# General Source statistics
# number of times a source_system_tab has helped in song repetition
source_system_dict = train[train.target == 1].groupby(['source_system_tab'])['target'].count().to_dict()
def source_system_repeated(x):
    try:
        count = source_system_dict[x]
        return binning(count)
    except KeyError:
        return 0    

# number of times source_screen_name has contributed to target=1
source_screen_dict = train[train.target == 1].groupby(['source_screen_name'])['target'].count().to_dict()
def source_screen_repeated(x):
    try:
        count = source_screen_dict[x]
        return binning(count)
    except KeyError:
        return 0

# number of times source_type has contributed to target=1
source_type_dict = train[train.target == 1].groupby(['source_type'])['target'].count().to_dict()
def source_type_repeated(x):
    try:
        count = source_type_dict[x]
        return binning(count)
    except KeyError:
        return 0

In [13]:
msno_song['s_system_repeat'] = msno_song['source_system_tab'].map(str).apply(source_system_repeated).astype(np.int32)
song_msno['s_system_repeat'] = song_msno['source_system_tab'].map(str).apply(source_system_repeated).astype(np.int32)
both_missing['s_system_repeat'] = both_missing['source_system_tab'].map(str).apply(source_system_repeated).astype(np.int32)

msno_song['s_screen_repeat'] = msno_song['source_screen_name'].map(str).apply(source_screen_repeated).astype(np.int32)
song_msno['s_screen_repeat'] = song_msno['source_screen_name'].map(str).apply(source_screen_repeated).astype(np.int32)
both_missing['s_screen_repeat'] = both_missing['source_screen_name'].map(str).apply(source_screen_repeated).astype(np.int32)

msno_song['s_type_repeat'] = msno_song['source_type'].map(str).apply(source_type_repeated).astype(np.int32)
song_msno['s_type_repeat'] = song_msno['source_type'].map(str).apply(source_type_repeated).astype(np.int32)
both_missing['s_type_repeat'] = both_missing['source_type'].map(str).apply(source_type_repeated).astype(np.int32)

In [14]:
train['s_system_repeat'] = train['source_system_tab'].map(str).apply(source_system_repeated).astype(np.int32)
train['s_screen_repeat'] = train['source_screen_name'].map(str).apply(source_screen_repeated).astype(np.int32)
train['s_type_repeat'] = train['source_type'].map(str).apply(source_type_repeated).astype(np.int32)

In [15]:
predicted = []
Y_list = []
Y_list.extend(both_missing.id)
Y_list.extend(song_msno.id)
Y_list.extend(msno_song.id)

In [16]:
X_prev = np.array(train.drop(['target', 'msno', 'song_id', 'artist_name', 'language'], axis=1), dtype=np.int32)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_prev, train.target)
model = SelectFromModel(lsvc, prefit=True)
X = model.transform(X_prev)

X_train, X_val, Y_train, Y_val = train_test_split(X, train.target, test_size = 0.2, shuffle = False)
X_test_prev = np.array(both_missing.drop(['id', 'msno', 'song_id'], axis=1), dtype=np.int32)
X_test = model.transform(X_test_prev)

#Train for Case 3: both missing
gnb = MultinomialNB()
y_pred = gnb.fit(X_train, Y_train).predict(X_val)
print "Number of mislabeled points out of a total %d points : %d" % (len(X_val),(Y_val != y_pred).sum())
print "ROC AUC Score: %f" %roc_auc_score(Y_val, y_pred)
predicted.extend(gnb.predict_proba(X_test))

Number of mislabeled points out of a total 1475484 points : 741590
ROC AUC Score: 0.517279


In [17]:
# For Case 2
train['user_repeated'] = train['msno'].map(str).apply(user_repeated_songs).astype(np.int32)
# Drop string attributes
X_prev = np.array(train.drop(['target', 'msno', 'song_id', 'artist_name', 'language'], axis=1), dtype=np.int32)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_prev, train.target)
model = SelectFromModel(lsvc, prefit=True)
X = model.transform(X_prev)

X_train, X_val, Y_train, Y_val = train_test_split(X, train.target, test_size = 0.2, shuffle = False)
X_test_prev = np.array(song_msno.drop(['id', 'msno', 'song_id'], axis=1), dtype=np.int32)
X_test = model.transform(X_test_prev)

gnb = MultinomialNB()
y_pred = gnb.fit(X_train, Y_train).predict(X_val)
print "Number of mislabeled points out of a total %d points : %d" % (len(X_val),(Y_val != y_pred).sum())
print "ROC AUC Score: %f" %roc_auc_score(Y_val, y_pred)
predicted.extend(gnb.predict_proba(X_test))

# Drop this attribute since not used for Case 1
train = train.drop(['user_repeated'], axis=1)

Number of mislabeled points out of a total 1475484 points : 637240
ROC AUC Score: 0.542848


In [18]:
# For Case 1
train['repeated_song'] = train['song_id'].map(str).apply(repeated_songs).astype(np.int32)
train['count_artist_played'] = train['artist_name'].map(str).apply(count_artist_played).astype(np.int64)
train['count_song_played'] = train['song_id'].map(str).apply(count_song_played).astype(np.int64)
train['repeated_artist'] = train['artist_name'].map(str).apply(repeated_artists).astype(np.int32)
train['repeated_lang'] = train['language'].map(str).apply(repeated_lang).astype(np.int32)

X_prev = np.array(train.drop(['target', 'msno', 'song_id', 'artist_name', 'language'], axis=1), dtype=np.int32)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_prev, train.target)
model = SelectFromModel(lsvc, prefit=True)
X = model.transform(X_prev)

X_train, X_val, Y_train, Y_val = train_test_split(X, train.target, test_size = 0.2, shuffle = False) 
X_test_prev = np.array(msno_song.drop(['id', 'msno', 'song_id', 'artist_name', 'language'], axis=1), dtype=np.int32)
X_test = model.transform(X_test_prev)

gnb = MultinomialNB()
y_pred = gnb.fit(X_train, Y_train).predict(X_val)
print "Number of mislabeled points out of a total %d points : %d" % (len(X_val),(Y_val != y_pred).sum())
print "ROC AUC Score: %f" %roc_auc_score(Y_val, y_pred)
predicted.extend(gnb.predict_proba(X_test))

Number of mislabeled points out of a total 1475484 points : 585152
ROC AUC Score: 0.552688


In [21]:
lst2 = []
for x in range(len(predicted)):
    lst2.append(predicted[x][1])

In [22]:
headers = ['id', 'target']
new_test = pd.DataFrame(columns=headers)
new_test['id'] = Y_list
new_test['target'] = lst2
new_test.to_csv('../Test/cold_start_pred_2.csv', index=False, header=headers)