In [0]:
import pandas as pd
from pathlib import Path
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging
import joblib
import datetime
import sys
import collections
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import itertools
import time
import multiprocessing
from sklearn.preprocessing import MinMaxScaler
import warnings
from sklearn import metrics
from functools import wraps
import wrapt

import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
import torch


from fastai.tabular import *  # Quick accesss to tabular functionality
from fastai.utils.mem import *

warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)

In [0]:
def configure_logging(filename):
    logger = logging.getLogger(filename)
    log_fmt = "[%(asctime)s] %(funcName)s: %(message)s"
    formatter = logging.Formatter(log_fmt)
    sh = logging.StreamHandler(sys.stdout)
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    return logger

logger = configure_logging('info')

def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))


def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))


time_index_dict = {
    'label0': [3838, 3860, 3861, 3867],  # 3807
    # 'label1': [3840, 3853, 3854, 3860],
    'label2': [3845, 3867, 3868, 3874]

}

def load_ques():
    ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
    ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
    logger.info("ques %s", ques.shape)
    ques['topic_count'] = ques['topic'].apply(lambda x: len(x))
    ques['title_t1_count'] = ques['title_t1'].apply(lambda x: 0 if x == '-1' else len(x.split(',')))
    ques['title_t2_count'] = ques['title_t2'].apply(lambda x: 0 if x == '-1' else len(x.split(',')))
    ques['desc_t1_count'] = ques['desc_t1'].apply(lambda x: 0 if x == '-1' else len(x.split(',')))
    ques['desc_t2_count'] = ques['desc_t2'].apply(lambda x: 0 if x == '-1' else len(x.split(',')))
    return ques


def load_invite(is_test=True):
    if is_test:
        df = pd.read_csv(f'{base_path}/invite_info_evaluate_1_0926.txt', sep='\t', header=None)
        df.columns = ['qid', 'uid', 'dt']
    else:
        df = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
        df.columns = ['qid', 'uid', 'dt', 'label']
    logging.info("invite shape %s, is_test %s", df.shape, is_test)

    df['day'] = extract_day(df['dt'])
    df['hour'] = extract_hour(df['dt'])
    df = df.reset_index()
    train_size = 9489162
    if is_test:
        df['index'] = df['index'] + train_size
    return df

def load_user():
    user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
    user.columns = ['uid', 'gender', 'creat_keyword', 'level', 'hot', 'reg_type', 'reg_plat', 'freq', 'uf_b1', 'uf_b2',
                    'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                    'inter_topic']
    logger.info("user %s", user.shape)

    unq = user.nunique()
    logger.info("user unq %s", unq)

    for x in unq[unq == 1].index:
        del user[x]
        logger.info('del unq==1 %s', x)

    user['follow_topic_count'] = user['follow_topic'].apply(lambda x: 0 if x == '-1' else len(x.split(',')))
    user['inter_topic_count'] = user['inter_topic'].apply(lambda x: 0 if x == '-1' else len(x.split(',')))

    # del user['follow_topic'], user['inter_topic']
    return user


def reduce(df, safe=True, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if safe:
                    df[col] = df[col].astype(np.int32)
                else:
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
            else:
                if safe:
                    df[col] = df[col].astype(np.float32)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (
            start_mem - end_mem) / start_mem))
    return df


def sample_data(data):
    a = data[data['label']==1]
    b = data[(data['label']==0)].sample(frac=0.2)
    return pd.concat((a,b))

base_path = Path('/content/drive/My Drive/data_set_0926')

In [0]:



user = load_user()

del user['follow_topic'], user['inter_topic']

ques = load_ques()

del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2'], ques['topic']

ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
del ques['q_dt']


data = load_invite(False)
data = sample_data(data)
test = load_invite()
test['is_test'] = 1
data = pd.concat((data, test), sort=False)
data['weekday'] = data['day'] % 7

del data['dt']

data = data.merge(user, on='uid', how='left')
del user

data = data.merge(ques, on='qid', how='left')
del ques

data = reduce(data, False)

data['weekday'] = data['day'] % 7
data['q_weekday'] = data['q_day'] % 7

bins = [94.0, 290.0, 308.0, 337.0, 392.0, 891]
data['score_bin'] = pd.cut(data['score'], bins=bins)

dep_var = 'label'
cat_names = ['uid', 'score_bin', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5','q_hour','q_weekday','hour','weekday']
cont_names = ['uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'score', 'follow_topic_count', 'inter_topic_count', 'topic_count', 'title_t1_count', 'title_t2_count', 'desc_t1_count', 'desc_t2_count']
procs = [FillMissing, Categorify, Normalize]

day_end = 3867
train = data[data['day']<=day_end]
test = data[data['is_test']==1]
print(train.shape, test.shape)
train = train.reset_index(drop=True)
path = Path('.')

test = TabularList.from_df(test, path=path, cat_names=cat_names, cont_names=cont_names)


val_idx = train[(train['day']==day_end)].index.values

train = (TabularList.from_df(train, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(val_idx)
                           .label_from_df(cols=dep_var)
                           .add_test(test)
                           .databunch(bs=64))

train.show_batch(rows=10)


In [28]:
data.head()

Unnamed: 0,index,qid,uid,label,day,hour,is_test,weekday,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic_count,inter_topic_count,topic_count,title_t1_count,title_t2_count,desc_t1_count,desc_t2_count,q_day,q_hour,q_weekday,score_bin
0,7,Q190554387,M1581217469,1.0,3850,8,,0,unknown,monthly,1,0,0,0,0,MD470265,BR470265,PV002320,CT101130,PF470265,520,14,10,27,13,4,0,0,3850,6,0,"(392.0, 891.0]"
1,10,Q110462128,M848334644,1.0,3862,8,,5,female,weekly,1,0,0,0,0,MD842473,BR126824,PV929066,CT929066,PF470265,719,8,10,10,12,3,34,9,3861,23,4,"(392.0, 891.0]"
2,11,Q4265255633,M848334644,1.0,3867,16,,3,female,weekly,1,0,0,0,0,MD842473,BR126824,PV929066,CT929066,PF470265,719,8,10,4,22,5,0,0,3867,16,3,"(392.0, 891.0]"
3,12,Q885275494,M848334644,1.0,3852,8,,2,female,weekly,1,0,0,0,0,MD842473,BR126824,PV929066,CT929066,PF470265,719,8,10,3,22,8,86,16,3852,8,2,"(392.0, 891.0]"
4,16,Q3785192737,M234356600,1.0,3856,9,,6,female,weekly,1,0,0,0,0,MD353013,BR803759,PV596305,CT327012,PF072986,422,1,0,16,32,11,0,0,3850,17,0,"(392.0, 891.0]"


In [0]:
learn = tabular_learner(train, layers=[200,100], emb_szs={'uid': 64}, metrics=auc_roc_score)
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,auc_roc_score,time


In [None]:
val_y = train.valid_ds.y.items
pred = learn.get_preds(ds_type=DatasetType.Valid)[0]
pred = pred.numpy()
pred = pred.flatten()
metrics.roc_auc_score(val_y, pred)

In [0]:
pred = learn.get_preds(ds_type=DatasetType.Test)[0]
score = pred.numpy()[:,1]

# end