# Baseline1: Extract deterministic data according to the problem description
For more details, see the full project at https://github.com/KID-22/PCIC2021-Baselines. You are welcome to watch, star, and fork it! Note that some new baselines will be added soon.

### Load Data

In [3]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm

In [4]:
# Load the bigtag training records as an integer array with three columns.
# Later cells use column 0 as a row index into `mat`, column 1 as an index
# into `movie`, and column 2 as a tag id or -1
# (presumably user id / movie id / tag id — confirm against the data description).
bigtag = np.loadtxt('./data/train/bigtag.txt',dtype=int)
bigtag

array([[  0,  26,  -1],
       [  0, 100,  12],
       [  0, 100,   2],
       ...,
       [999, 736,   4],
       [999, 776,   4],
       [999, 776,  14]])

In [5]:
# Load the choicetag training records as an integer array with three columns.
# Later cells use column 0 as a row index into `mat`, column 1 as an index
# into `movie`, and column 2 as a tag id or -1
# (presumably user id / movie id / tag id — confirm against the data description).
choicetag = np.loadtxt('./data/train/choicetag.txt',dtype=int)
choicetag

array([[  4,  83,  45],
       [  4, 125,   4],
       [  4, 345,  12],
       ...,
       [998, 952,  43],
       [998, 989,   4],
       [998, 989,  35]])

In [6]:
# Read the movie table and keep, for every row, all columns after the first.
# The position of an entry in `movie` is what later cells use as the movie index,
# and the kept values are used as tag ids.
movie_data = np.loadtxt('./data/train/movie.txt', dtype=int)
movie = [record[1:] for record in movie_data]
movie[:5]

[array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([ 8,  9,  0, 10, 11, 12,  7, 13]),
 array([14, 15,  4, 16, 17, 18, 19, 20]),
 array([ 2,  0,  4, 21,  3,  7, 22,  6]),
 array([23,  0, 24, 25, 21, 26, 27,  7])]

In [7]:
# Largest tag id that occurs in any movie. Tag ids are used as column indices
# of `mat`, which is why it is later allocated with `tag_num + 1` columns.
tag_num = np.max(movie)
tag_num

1719

### Extract data

In [20]:
# Label matrix indexed as mat[row][tag]: 0 = nothing extracted yet,
# 1 = positive, -1 = negative. The 1000 rows are hard-coded
# (presumably the number of users — confirm).
mat = np.zeros((1000,tag_num+1))
# [row, tag, label] triples with label 1 (positive) or 0 (negative):
# one list for both sources combined, and one list per source.
all_data_array = []
bigtag_array = []
choicetag_array = []

In [21]:
# Extract the deterministic labels from bigtag.
#
# Pass 1: a record that carries a tag marks that (user, tag) pair as positive;
# a record whose tag is -1 marks every tag of the record's movie as negative,
# overwriting whatever was stored in `mat` for those pairs.
count = 0  # number of records that carry a tag
for idx in tqdm(range(bigtag.shape[0])):
    record = bigtag[idx]
    user, item, picked = record[0], record[1], record[2]
    if picked == -1:
        for tag in movie[item]:
            mat[user][tag] = -1
            all_data_array.append([user, tag, 0])
            bigtag_array.append([user, tag, 0])
    else:
        mat[user][picked] = 1
        all_data_array.append([user, picked, 1])
        bigtag_array.append([user, picked, 1])
        count += 1

# Pass 2: for records that carry a tag, every other tag of the same movie that
# still has no label in `mat` becomes a negative.
for idx in tqdm(range(bigtag.shape[0])):
    record = bigtag[idx]
    user, item, picked = record[0], record[1], record[2]
    if picked == -1:
        continue
    for tag in movie[item]:
        if mat[user][tag] != 0:
            continue
        mat[user][tag] = -1
        all_data_array.append([user, tag, 0])
        bigtag_array.append([user, tag, 0])

100%|██████████| 8612/8612 [00:00<00:00, 162796.76it/s]
100%|██████████| 8612/8612 [00:00<00:00, 36743.30it/s]


In [22]:
# Number of bigtag records that carried a tag, and the number of triples collected so far.
count, len(all_data_array)

(7206, 47054)

In [23]:
# Sanity check on the counts above: 8612 bigtag records minus 7206 tagged ones
# leaves 1406 records with -1, each presumably expanding to 8 tags.
# NOTE(review): 18454 is not derived anywhere in this notebook — confirm its source.
18454 - (8612-7206)*8

7206

In [24]:
# Extract the deterministic labels from choicetag.
#
# Pass 1: a record that carries a tag marks that (user, tag) pair as positive;
# a record whose tag is -1 marks every tag of the record's movie as negative,
# overwriting whatever was stored in `mat` for those pairs.
for idx in tqdm(range(choicetag.shape[0])):
    record = choicetag[idx]
    user, item, picked = record[0], record[1], record[2]
    if picked == -1:
        for tag in movie[item]:
            mat[user][tag] = -1
            all_data_array.append([user, tag, 0])
            choicetag_array.append([user, tag, 0])
    else:
        mat[user][picked] = 1
        all_data_array.append([user, picked, 1])
        choicetag_array.append([user, picked, 1])

# Pass 2: for records that carry a tag, every other tag of the same movie that
# still has no label in `mat` becomes a negative.
for idx in tqdm(range(choicetag.shape[0])):
    record = choicetag[idx]
    user, item, picked = record[0], record[1], record[2]
    if picked == -1:
        continue
    for tag in movie[item]:
        if mat[user][tag] != 0:
            continue
        mat[user][tag] = -1
        all_data_array.append([user, tag, 0])
        choicetag_array.append([user, tag, 0])

100%|██████████| 1540/1540 [00:00<00:00, 107695.09it/s]
100%|██████████| 1540/1540 [00:00<00:00, 85850.61it/s]


In [25]:
# Total number of triples collected from bigtag and choicetag (duplicates included).
len(all_data_array)

53310

In [26]:
# Unique
# De-duplicate the combined triples. Prints the row count and the number of
# rows with a non-zero label, first before and then after removing duplicates.
all_data_array = np.asarray(all_data_array)
print(len(all_data_array))
print(np.count_nonzero(all_data_array[:, 2]))
# np.unique with axis=0 compares whole rows, so the array can be passed as is.
all_data_array = np.unique(all_data_array, axis=0)
print(len(all_data_array))
print(np.count_nonzero(all_data_array[:, 2]))

53310
8510
47338
4141


In [27]:
# Unique
# De-duplicate the bigtag triples. Prints the row count and the number of
# rows with a non-zero label, first before and then after removing duplicates.
bigtag_array = np.asarray(bigtag_array)
print(len(bigtag_array))
print(np.count_nonzero(bigtag_array[:, 2]))
# np.unique with axis=0 compares whole rows, so the array can be passed as is.
bigtag_array = np.unique(bigtag_array, axis=0)
print(len(bigtag_array))
print(np.count_nonzero(bigtag_array[:, 2]))

47054
7206
42733
3889


In [28]:
# Unique
# De-duplicate the choicetag triples. Prints the row count and the number of
# rows with a non-zero label, first before and then after removing duplicates.
choicetag_array = np.asarray(choicetag_array)
print(len(choicetag_array))
print(np.count_nonzero(choicetag_array[:, 2]))
# np.unique with axis=0 compares whole rows, so the array can be passed as is.
choicetag_array = np.unique(choicetag_array, axis=0)
print(len(choicetag_array))
print(np.count_nonzero(choicetag_array[:, 2]))

6256
1304
5287
558


In [31]:
# Write the three label tables to text files, formatting every value as an integer.
for out_path, table in (
    ("./data/train/extract_bigtag.txt", bigtag_array),
    ("./data/train/extract_choicetag.txt", choicetag_array),
    ("./data/train/extract_alldata.txt", all_data_array),
):
    np.savetxt(out_path, np.array(table), fmt="%d")

### Evaluate

In [13]:
from sklearn.metrics import roc_auc_score,accuracy_score
# Load the validation records. `evaluate` reads columns 0 and 1 as indices
# into `mat` and column 2 as the true label.
# NOTE(review): this path points into the parent directory, while the training
# files are read from ./data/train — confirm the intended working directory.
valid = np.loadtxt('../valid/validation.txt',dtype=int)
valid

array([[  0, 283,   0],
       [  1,  85,   0],
       [  1, 256,   1],
       ...,
       [999,  58,   0],
       [999, 126,   0],
       [999, 280,   0]])

In [14]:
# Number of validation rows, then how many of them have a non-zero label.
print(valid.shape[0])
print(np.count_nonzero(valid[:,2]))

2039
779


In [15]:
def evaluate(val):
    """Print the ROC AUC of the labels stored in ``mat`` against ``val``.

    Each row of ``val`` is read as ``(row index into mat, column index into
    mat, true label)``. The prediction for a row is looked up in the
    module-level ``mat``: ``1`` gives 1, ``-1`` gives 0, and ``0`` (no label
    was extracted for that pair) gives a fair coin flip. Any other stored
    value produces no prediction for that row.

    Args:
        val: 2-D integer array with at least three columns.

    Returns:
        None. The score is printed as ``AUC:  <value>``.

    Note:
        The coin flip draws from the global ``random`` state, so the printed
        AUC varies between runs unless ``random.seed`` is called beforehand.
    """
    val_true = val[:, 2]
    # Accumulate in a list and convert once at the end: np.append copies the
    # whole array on every call, which made the original loop quadratic.
    predictions = []
    for row in val:
        res_tmp = mat[row[0]][row[1]]
        if res_tmp == 1:
            predictions.append(1)
        elif res_tmp == -1:
            predictions.append(0)
        elif res_tmp == 0:
            # No deterministic label: guess, with 51..100 mapping to 1.
            rand = random.randint(1, 100)
            predictions.append(1 if rand > 50 else 0)
    # Keep the float dtype that np.append on an empty array used to produce.
    val_pred = np.array(predictions, dtype=float)
    print("AUC: ", roc_auc_score(val_true, val_pred))

In [18]:
# Score the extracted labels on the validation records (prints the AUC).
evaluate(valid)

AUC:  0.5747804470525908
