In [1]:
from __future__ import division

import os

import numpy as np
import pandas as pd

In [2]:
# Training interactions (the preview below shows the columns
# user_sequence, user_id, challenge_sequence and challenge).
train_csv_path = "./data/train.csv"
data_train = pd.read_csv(train_csv_path)
data_train.head(n=5)

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [3]:
# Per-challenge metadata (language, series, submissions, author, category).
challenge_csv_path = "./data/challenge_data.csv"
data_challenge = pd.read_csv(challenge_csv_path)
data_challenge.head(n=5)

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [4]:
# Test interactions; the preview below shows the same columns as the training file.
test_csv_path = "./data/test.csv"
data_test = pd.read_csv(test_csv_path)
data_test.head(n=5)

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4577_1,4577,1,CI23855
1,4577_2,4577,2,CI23933
2,4577_3,4577,3,CI24917
3,4577_4,4577,4,CI24915
4,4577_5,4577,5,CI23714


In [5]:
# User ids are labels, not quantities: store them as strings before summarising.
data_train['user_id'] = data_train['user_id'].map(str)
data_train['user_id'].describe()

count     903916
unique     69532
top       102167
freq          13
Name: user_id, dtype: object

In [6]:
# Same string conversion for the test users, so ids compare equal across both frames.
data_test['user_id'] = data_test['user_id'].map(str)
data_test['user_id'].describe()

count     397320
unique     39732
top        31599
freq          10
Name: user_id, dtype: object

In [7]:
# Stack train and test interactions into one frame and order the rows by
# user id (ids are strings at this point, so the order is lexicographic).
data_all = pd.concat([data_train, data_test], axis=0).sort_values(by='user_id')

data_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1301236 entries, 44675 to 347408
Data columns (total 4 columns):
user_sequence         1301236 non-null object
user_id               1301236 non-null object
challenge_sequence    1301236 non-null int64
challenge             1301236 non-null object
dtypes: int64(1), object(3)
memory usage: 49.6+ MB


In [8]:
# Summary of user ids across the combined train + test interactions.
data_all.loc[:, 'user_id'].describe()

count     1301236
unique     109264
top        102167
freq           13
Name: user_id, dtype: object

In [9]:
# Count missing values in each column of the combined frame.
data_all.isnull().sum(axis=0)

user_sequence         0
user_id               0
challenge_sequence    0
challenge             0
dtype: int64

In [10]:
# Mirror the sequence position (p -> 14 - p); the mirrored value is used as
# the rating column further down.
# NOTE: this overwrites the column in place, so running the cell a second
# time flips the values back to the original positions.
data_all['challenge_sequence'] = 14 - data_all['challenge_sequence']
data_all.head(20)

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
44675,10000_8,10000,6,CI24230
44669,10000_2,10000,12,CI24530
44670,10000_3,10000,11,CI26052
44680,10000_13,10000,1,CI25126
44679,10000_12,10000,2,CI26050
44678,10000_11,10000,3,CI23880
44677,10000_10,10000,4,CI26051
44676,10000_9,10000,5,CI25135
44668,10000_1,10000,13,CI23663
44674,10000_7,10000,7,CI23691


In [11]:
# Keep only the three columns needed downstream, in (user, item, rating) order.
rating_columns = ['user_id', 'challenge', 'challenge_sequence']
data_all = data_all[rating_columns]

data_all.info()
data_all.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1301236 entries, 44675 to 347408
Data columns (total 3 columns):
user_id               1301236 non-null object
challenge             1301236 non-null object
challenge_sequence    1301236 non-null int64
dtypes: int64(1), object(2)
memory usage: 39.7+ MB


Unnamed: 0,user_id,challenge,challenge_sequence
44675,10000,CI24230,6
44669,10000,CI24530,12
44670,10000,CI26052,11
44680,10000,CI25126,1
44679,10000,CI26050,2


In [12]:
import pandas as pd

from surprise import NMF
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader

In [13]:
# When loading from a DataFrame the Reader is only required for its
# rating_scale parameter.
reader = Reader(rating_scale=(1, 13))

# load_from_df expects the columns in the order: user id, item id, rating.
data = Dataset.load_from_df(data_all, reader)

In [14]:
# NMF starts from randomly initialised factors, so seed NumPy's global RNG
# first; otherwise every Restart & Run All yields different predictions.
np.random.seed(0)

# Build a trainset from every rating in `data` (no hold-out split).
trainset = data.build_full_trainset()

# Fit non-negative matrix factorisation with Surprise's default hyper-parameters.
algo = NMF()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f4bb52ac7f0>

In [None]:
# Users to produce recommendations for: every distinct user in the test file.
pred_user_id_list = data_test['user_id'].unique()

# Candidate items: every distinct challenge seen in the combined interactions.
pred_challenge_id_list = data_all['challenge'].unique()

print(len(pred_user_id_list))
print(len(pred_challenge_id_list))

39732
5502


In [None]:
# Score every (test user, candidate challenge) pair the user has not already
# attempted in the test file, keeping only estimates above 7.0.

# Challenges each user already has in data_test, computed once up front
# instead of re-filtering the whole frame for every (user, challenge) pair.
seen_by_user = {
    user_id: set(challenges)
    for user_id, challenges in data_test.groupby('user_id')['challenge']
}

# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append inside a loop copies the frame on every call (quadratic)
# and no longer exists in pandas 2.0.
prediction_rows = []
for pred_user_id in pred_user_id_list:
    seen_challenges = seen_by_user.get(pred_user_id, set())
    for pred_challenge_id in pred_challenge_id_list:
        if pred_challenge_id in seen_challenges:
            continue
        pred = algo.predict(pred_user_id, pred_challenge_id, r_ui=None, verbose=False)
        # NOTE: a user with fewer than three estimates above the threshold
        # ends up with fewer than three recommendations later on.
        if float(pred.est) > 7.0:
            prediction_rows.append((pred_user_id, pred_challenge_id, pred.est))

predictions = pd.DataFrame(prediction_rows, columns=['user', 'challenge', 'rating'])
predictions.info()
predictions.head(10)

In [None]:
# Rank the candidates within each user; rank 1 is the highest estimated rating.
rating_by_user = predictions.groupby('user')['rating']
predictions['rank'] = rating_by_user.rank(ascending=False)
predictions.head()

In [None]:
# Keep the three best-ranked challenges per user, then order the result by
# user and rank with a fresh 0..n-1 index.
predictions = (
    predictions
    .sort_values(by=['rank'])
    .groupby('user')
    .head(3)
    .sort_values(by=['user', 'rank'])
    .reset_index(drop=True)
)

predictions.head(20)

In [None]:
# Build the submission rows: each user's kept challenges are labelled
# "<user>_11", "<user>_12", ... in the order they appear in `predictions`.
# The per-user counter comes from cumcount, which numbers rows within each
# user regardless of whether that user's rows are adjacent, and avoids the
# row-by-row DataFrame.append (quadratic; removed in pandas 2.0).
seq_numbers = predictions.groupby('user').cumcount() + 11

results = pd.DataFrame(
    {
        'user_sequence': predictions['user'].astype(str) + '_' + seq_numbers.astype(str),
        'challenge': predictions['challenge'],
    },
    columns=['user_sequence', 'challenge'],
).reset_index(drop=True)

results.head(9)

In [None]:
# Write the submission file. The output directory is created first so that a
# fresh checkout without ./submission does not fail at the very last step.
submission_path = './submission/submission_nmf.csv'
submission_dir = os.path.dirname(submission_path)
if not os.path.isdir(submission_dir):
    os.makedirs(submission_dir)
results.to_csv(submission_path, encoding='utf-8', index=False)