In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold
from scipy.stats import pearsonr
from nltk.stem import PorterStemmer
import re
import os
import pickle
from scipy import sparse
import pylab as pl

In [2]:
train=pd.read_csv('data/train.csv')
test=pd.read_csv('data/test.csv')
resources=pd.read_csv('data/resources.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
print(train.dtypes)
print(test.dtypes)
print(resources.dtypes)

In [None]:
df=pd.concat([train,test])
df.loc[df.project_essay_3.isna(),['project_essay_2','project_essay_3']]=df.loc[df.project_essay_4.isna(),['project_essay_3','project_essay_2']].values
df[['project_essay_2','project_essay_4']]=df[['project_essay_2','project_essay_4']].fillna("")
df['project_essay_1']=df.apply(lambda x:x['project_essay_1']+x['project_essay_2'],axis=1)
df['project_essay_2']=df.apply(lambda x:x['project_essay_3']+x['project_essay_4'],axis=1)
df=df.drop(['project_essay_3','project_essay_4'],axis=1)

In [None]:
resources['total_price']=resources['quantity'] * resources['price']

R=resources.groupby('id').agg({'description':'count','quantity':'sum','price':'sum','total_price':'sum'})\
    .rename(columns={'description':'items'})
R['avg_price']=R['total_price']/R['quantity']

for func in ['min','max','mean','std']:
    R=R.join(resources.groupby('id').agg({'quantity':func,'price':func,'total_price':func}).\
           rename(columns={'quantity':'quantity_'+func,'price':'price_'+func,'total_price':'total_price_'+func}))

R=R.join(resources.groupby('id').agg({'description':lambda x:' '.join(x.astype(str))}).rename(
    columns={'description':'resource_description'}))

df=df.join(R,on='id')

df['price_category']=pd.cut(df['total_price'], [0, 50, 100, 250, 500, 1000,np.inf])

for c in ['quantity', 'price', 'total_price']:
    df['max%s_min%s'%(c,c)] = df['%s_max'%c] - df['%s_min'%c]

In [None]:
le = LabelEncoder()
df['teacher_id'] = le.fit_transform(df['teacher_id'])
df['teacher_gender_unknown'] = df.teacher_prefix.apply(lambda x:int(x not in ['Ms.', 'Mrs.', 'Mr.']))

statFeatures = []
for col in ['school_state', 'teacher_id', 'teacher_prefix', 'teacher_gender_unknown', 'project_grade_category', 'project_subject_categories', 'project_subject_subcategories', 'teacher_number_of_previously_posted_projects']:
    Stat = df[['id', col]].groupby(col).agg('count').rename(columns={'id':col+'_stat'})
    Stat /= Stat.sum()
    df = df.join(Stat, on=col)

In [None]:
%%time
numFeatures=[df.columns[i] for i,j in enumerate(df.dtypes) if j == 'float64' and not (df.columns[i]=='project_is_approved') ]
T2 = df[numFeatures+['project_is_approved']].copy()
Ttr = T2[-pd.isna(df.project_is_approved)]
Tar_tr = Ttr['project_is_approved'].values
n = 10
inx = [np.random.randint(0, Ttr.shape[0], int(Ttr.shape[0]/n)) for k in range(n)]
# inx is used for crossvalidation of calculating the correlation and p-value
Corr = {}
for c in numFeatures:
    # since some values might be 0s, I use x+1 to avoid missing some important relations
    C1,P1=np.nanmean([pearsonr(Tar_tr[inx[k]],   (1+Ttr[c].iloc[inx[k]])) for k in range(n)], 0)
    C2,P2=np.nanmean([pearsonr(Tar_tr[inx[k]], 1/(1+Ttr[c].iloc[inx[k]])) for k in range(n)], 0)
    if P2<P1:
        T2[c] = 1/(1+T2[c])
        Corr[c] = [C2,P2]
    else:
        T2[c] = 1+T2[c]
        Corr[c] = [C1,P1]
        
        
polyCol = []
thrP = 0.01
thrC = 0.02
print('columns \t\t\t Corr1 \t\t Corr2 \t\t Corr Combined')
for i, c1 in enumerate(numFeatures[:-1]):
    C1, P1 = Corr[c1]
    for c2 in numFeatures[i+1:]:
        C2, P2 = Corr[c2]
        V = T2[c1] * T2[c2]
        Vtr = V[-pd.isna(T2.project_is_approved)].values
        C, P = np.nanmean([pearsonr(Tar_tr[inx[k]], Vtr[inx[k]]) for k in range(n)], 0)
        if P<thrP and abs(C) - max(abs(C1),abs(C2)) > thrC:
            df[c1+'_'+c2+'_poly'] = V
            
            

In [None]:
dateCol = 'project_submitted_datetime'
def getTimeFeatures(T):
    df['year'] = df[dateCol].apply(lambda x: x.year)
    df['month'] = df[dateCol].apply(lambda x: x.month)
    df['day'] = df[dateCol].apply(lambda x: x.day)
    df['dow'] = df[dateCol].apply(lambda x: x.dayofweek)
    df['hour'] = df[dateCol].apply(lambda x: x.hour)
    df['days'] = (df[dateCol]-df[dateCol].min()).apply(lambda x: x.days)
    return T

df[dateCol] = pd.to_datetime(df[dateCol])
df = getTimeFeatures(df)

timeFeatures = ['year', 'month', 'day', 'dow', 'hour', 'days']
for col in timeFeatures:
    Stat = df[['id', col]].groupby(col).agg('count').rename(columns={'id':col+'_stat'})
    Stat /= Stat.sum()
    df = df.join(Stat, on=col)
    

In [None]:
def getCatFeatures(T, Col):
    vectorizer = CountVectorizer(binary=True,
                                 ngram_range=(1,1),
                                 tokenizer=lambda x:[a.strip() for a in x.split(',')])
    return vectorizer.fit_transform(T[Col].fillna(''))

X_tp = getCatFeatures(df, 'teacher_prefix')
X_ss = getCatFeatures(df, 'school_state')
X_pgc = getCatFeatures(df, 'project_grade_category')
X_psc = getCatFeatures(df, 'project_subject_categories')
X_pssc = getCatFeatures(df, 'project_subject_subcategories')

X_cat = sparse.hstack((X_tp, X_ss, X_pgc, X_psc, X_pssc))

In [None]:
p = PorterStemmer()
def wordPreProcess(sentence):
    return ' '.join([p.stem(x.lower()) for x in re.split('\W', sentence) if len(x) >= 1])



def getTextFeatures(T, Col, max_features=10000, ngrams=(1,2), verbose=True):
    if verbose:
        print('processing: ', Col)
    vectorizer = CountVectorizer(stop_words=None,
                                 preprocessor=wordPreProcess,
                                 max_features=max_features,
                                 binary=True,
                                 ngram_range=ngrams)
    X = vectorizer.fit_transform(T[Col])
    return X, vectorizer.get_feature_names()

n_es1, n_es2, n_prs, n_rd, n_pt = 3000, 8000, 2000, 3000, 1000
X_es1, feat_es1 = getTextFeatures(df, 'project_essay_1', max_features=n_es1)
X_es2, feat_es2 = getTextFeatures(df, 'project_essay_2', max_features=n_es2)
X_prs, feat_prs = getTextFeatures(df, 'project_resource_summary', max_features=n_prs)
X_rd, feat_rd = getTextFeatures(df, 'resource_description', max_features=n_rd, ngrams=(1,3))
X_pt, feat_pt = getTextFeatures(df, 'project_title', max_features=n_pt)

In [None]:
X_txt = sparse.hstack((X_es1, X_es2, X_prs, X_rd, X_pt))
del X_es1, X_es2, X_prs, X_rd, X_pt

In [None]:
def preprocess_str(string):
    string = re.sub(r'(\")', ' ', string)
    string = re.sub(r'(\r)', ' ', string)
    string = re.sub(r'(\n)', ' ', string)
    string = re.sub(r'(\r\n)', ' ', string)
    string = re.sub(r'(\\)', ' ', string)
    string = re.sub(r'\t', ' ', string)
    string = re.sub(r'\:', ' ', string)
    string = re.sub(r'\"\"\"\"', ' ', string)
    string = re.sub(r'_', ' ', string)
    string = re.sub(r'\+', ' ', string)
    string = re.sub(r'\=', ' ', string)
    

In [3]:
df_path='data/df.p'
if os.path.exists(df_path):
    df=pd.read_pickle('data/df.p')
else:
    df.to_pickle('data/df.p')
    
txt_path = 'data/txt.npy'
if os.path.exists(txt_path):
    X_txt = pickle.load(open(txt_path, 'rb'))
else:
    pickle.dump(X_txt, open(txt_path, 'wb'))

cat_path = 'data/cat.npy'
if os.path.exists(cat_path):
    X_cat = pickle.load(open(cat_path, 'rb'))
else:
    pickle.dump(X_cat, open(cat_path, 'wb'))

In [4]:
from sklearn.preprocessing import StandardScaler
numFeatures=[df.columns[i] for i,j in enumerate(df.dtypes) if j == 'float64' and not (df.columns[i]=='project_is_approved') ]
X = sparse.hstack((X_txt, X_cat, StandardScaler().fit_transform(df[numFeatures].fillna(0)))).tocsr()

Xtr = X[np.argwhere(-pd.isna(df.project_is_approved)).reshape(-1,)]
Xts = X[np.argwhere(pd.isna(df.project_is_approved)).reshape(-1,)]
Ttr_tar = df[-pd.isna(df.project_is_approved)]['project_is_approved'].values
Tts = df[-pd.isna(df.project_is_approved)][['id','project_is_approved']]

In [5]:
import os

os.environ['KERAS_BACKEND'] = 'tensorflow'

from keras.layers import Input, Dense, Flatten, concatenate, Dropout, Embedding, SpatialDropout1D
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.models import Model
from keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

n_es1, n_es2, n_prs, n_rd, n_pt = 3000, 8000, 2000, 3000, 1000
def breakInput(X1):
    X2 = []
    i = 0
    for n in [n_es1, n_es2, n_prs, n_rd, n_pt, X_cat.shape[1], len(numFeatures)]:
        X2.append(X1[:,i:i+n])
        i += n
    return X2

def getModel(HLs, Drop=0.25, OP=optimizers.Adam()):
    temp = []
    inputs_txt = []
    for n in [n_es1, n_es2, n_prs, n_rd, n_pt]:
        input_txt = Input((n, ))
        X_feat = Dropout(Drop)(input_txt)
        X_feat = Dense(int(n/100), activation="linear")(X_feat)
        X_feat = Dropout(Drop)(X_feat)
        temp.append(X_feat)
        inputs_txt.append(input_txt)

    x_1 = concatenate(temp)
#     x_1 = Dense(20, activation="relu")(x_1)
    x_1 = Dense(50, activation="relu")(x_1)
    x_1 = Dropout(Drop)(x_1)

    input_cat = Input((X_cat.shape[1], ))
    x_2 = Embedding(2, 10, input_length=X_cat.shape[1])(input_cat)
#     x_2 = SpatialDropout1D(Drop)(x_2)
    x_2 = Flatten()(x_2)

    input_num = Input((len(numFeatures), ))
    x_3 = Dropout(Drop)(input_num)
    
    x = concatenate([x_1, x_2, x_3])

    for HL in HLs:
        x = Dense(HL, activation="relu")(x)
        x = Dropout(Drop)(x)

    output = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=inputs_txt+[input_cat, input_num], outputs=output)
    model.compile(
            optimizer=OP,
            loss='binary_crossentropy',
            metrics=['binary_accuracy'])
    return model

def trainNN(X_train, X_val, Tar_train, Tar_val, HL=[50], Drop=0.5, OP=optimizers.Adam()):
    file_path='NN.h5'
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min')
    early = EarlyStopping(monitor="val_loss", mode="min", patience=6)
    lr_reduced = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=2,
                                   verbose=1,
                                   epsilon=3e-4,
                                   mode='min')

    model = getModel(HL, Drop, OP)
    model.fit(breakInput(X_train), Tar_train, validation_data=(breakInput(X_val), Tar_val),
                        verbose=2, epochs=1, batch_size=1000, callbacks=[early, lr_reduced, checkpoint])
    model.load_weights(file_path)
    return model

nCV = 1 # should be ideally larger
for i in range(21, 22):
    X_train, X_val, Tar_train, Tar_val = train_test_split(Xtr, Ttr_tar, test_size=0.15, random_state=i, stratify=Ttr_tar)
    model = trainNN(X_train, X_val, Tar_train, Tar_val, HL=[50], Drop=0.5, OP=optimizers.Adam())
    Yvl3 = model.predict(breakInput(X_val)).squeeze()
    Yts3 = model.predict(breakInput(Xts)).squeeze()

Using TensorFlow backend.


Train on 154768 samples, validate on 27312 samples
Epoch 1/1
 - 26s - loss: 0.4191 - binary_accuracy: 0.8395 - val_loss: 0.3708 - val_binary_accuracy: 0.8482

Epoch 00001: val_loss improved from inf to 0.37082, saving model to NN.h5


In [6]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 3000)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 8000)         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 2000)         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 3000)         0                                            
__________________________________________________________________________________________________
input_5 (I

In [7]:
model.get_config()

{'name': 'model_1',
 'layers': [{'name': 'input_1',
   'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, 3000),
    'dtype': 'float32',
    'sparse': False,
    'name': 'input_1'},
   'inbound_nodes': []},
  {'name': 'input_2',
   'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, 8000),
    'dtype': 'float32',
    'sparse': False,
    'name': 'input_2'},
   'inbound_nodes': []},
  {'name': 'input_3',
   'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, 2000),
    'dtype': 'float32',
    'sparse': False,
    'name': 'input_3'},
   'inbound_nodes': []},
  {'name': 'input_4',
   'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, 3000),
    'dtype': 'float32',
    'sparse': False,
    'name': 'input_4'},
   'inbound_nodes': []},
  {'name': 'input_5',
   'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, 1000),
    'dtype': 'float32',
    'sparse': False,
    'name': 'input_5'},
   'inbound_nod

In [8]:
def nn_batch_generator(X_data, y_data, batch_size,shuffle=False):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    if shuffle:
        index = np.arange(np.shape(y_data)[0])
        np.random.shuffle(index)
        X =  X_data[index, :]
        y =  y_data[index]
    else:
        index = np.arange(np.shape(y_data)[0])
    while counter < number_of_batches:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].todense()
        y_batch = y_data[index_batch]
        counter += 1
        yield np.array(X_batch),y_batch
            

In [23]:
import tensorflow as tf
import tensorlayer as tl
tf.reset_default_graph()


HLs=[50]
drop=0.6
learning_rate=0.001
placholder={}
input_net={}
# define placeholder
x2_placeholder= tf.placeholder(tf.int32, shape=[None, X_cat.shape[1]], name='x2')
input_num=len(numFeatures)
x3_placeholder = tf.placeholder(tf.float32, shape=[None, input_num], name='x3')
for n,label in zip([n_es1, n_es2, n_prs, n_rd, n_pt],['n_es1', 'n_es2', 'n_prs', 'n_rd', 'n_pt']):
    placholder[label] = tf.placeholder(tf.float32, shape=[None, n], name=label)    

y_ = tf.placeholder(tf.float32, shape=[None, ], name='target')

def model(x, is_train=True, reuse=False):
    placholder= x[0]
    x2_placeholder= x[1]
    x3_placeholder= x[2]
    with tf.variable_scope("binary_classification", reuse=reuse):
        for n,label in zip([n_es1, n_es2, n_prs, n_rd, n_pt],['n_es1', 'n_es2', 'n_prs', 'n_rd', 'n_pt']):
            input_net[label] = tl.layers.InputLayer(placholder[label], name=label +'_input_layer')
            input_net[label] = tl.layers.DropoutLayer(input_net[label], keep=drop,is_train=is_train, name=label+'_drop1')
            input_net[label] = tl.layers.DenseLayer(input_net[label], n_units=int(n/100), name=label+'_dense')
            input_net[label] = tl.layers.DropoutLayer(input_net[label],is_train=is_train, keep=drop, name=label+'drop2')


        x1=tl.layers.ConcatLayer([input_net[key] for key in input_net], 1, name ='input_concat_layer')
        x1=tl.layers.DenseLayer(x1, n_units=50., name=label)
        x1=tl.layers.DropoutLayer(x1, keep=drop,is_train=is_train, name='input_concat_layer_drop')

        x2=tl.layers.EmbeddingInputlayer(inputs=x2_placeholder, vocabulary_size=2, embedding_size=10, name='x2_embed')
        #x2=tl.layers.DropoutLayer(x2, keep=drop, name=label+'embed_drop')
        x2 = tl.layers.FlattenLayer(x2)

        x3 = tl.layers.InputLayer(x3_placeholder, name='x3_layer')
        x3 = tl.layers.DropoutLayer(x3, keep=drop,is_train=is_train, name='x3_embed_drop')

        x = tl.layers.ConcatLayer([x1,x2,x3])

        for HL in HLs:
            x = tl.layers.DenseLayer(x, n_units=50.,act=tf.nn.relu, name='x_dense_'+str(HL))
            x = tl.layers.DropoutLayer(x, keep=drop,is_train=is_train, name='x_drop_'+str(HL))

        x = tl.layers.DenseLayer(x, n_units=1,
                                        act = tf.sigmoid,
                                        name='output_layer')

    return x

net_train = model([placholder,x2_placeholder,x3_placeholder], is_train=True, reuse=False)
net_test = model([placholder,x2_placeholder,x3_placeholder], is_train=False, reuse=True)

y=net_train.outputs

# cost = tl.cost.binary_cross_entropy(y, y_,name='cross_entropy')
cost = tl.cost.mean_squared_error(y, y_,name='mse')
    

y2 = net_test.outputs
cost_test = tl.cost.binary_cross_entropy(y2, y_, name='cross_entropy31233')
correct_prediction = tf.equal(tf.cast(tf.argmax(y2,1),tf.float32), y_)
acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

train_params = net_train.all_params


train_op = tf.train.AdamOptimizer(1e-4).minimize(cost, var_list = train_params)
#train_op = tf.train.RMSPropOptimizer(learning_rate=0.001, momentum=0.1).minimize(cost)

[TL] InputLayer  binary_classification/n_es1_input_layer: (?, 3000)
[TL] DropoutLayer n_es1_drop1: keep:0.600000 is_fix:False
[TL] DenseLayer  n_es1_dense: 300 identity
[TL] DropoutLayer n_es1drop2: keep:0.600000 is_fix:False
[TL] InputLayer  binary_classification/n_es2_input_layer: (?, 8000)
[TL] DropoutLayer n_es2_drop1: keep:0.600000 is_fix:False
[TL] DenseLayer  n_es2_dense: 800 identity
[TL] DropoutLayer n_es2drop2: keep:0.600000 is_fix:False
[TL] InputLayer  binary_classification/n_prs_input_layer: (?, 2000)
[TL] DropoutLayer n_prs_drop1: keep:0.600000 is_fix:False
[TL] DenseLayer  n_prs_dense: 200 identity
[TL] DropoutLayer n_prsdrop2: keep:0.600000 is_fix:False
[TL] InputLayer  binary_classification/n_rd_input_layer: (?, 3000)
[TL] DropoutLayer n_rd_drop1: keep:0.600000 is_fix:False
[TL] DenseLayer  n_rd_dense: 300 identity
[TL] DropoutLayer n_rddrop2: keep:0.600000 is_fix:False
[TL] InputLayer  binary_classification/n_pt_input_layer: (?, 1000)
[TL] DropoutLayer n_pt_drop1: kee

In [24]:
sess = tf.InteractiveSession()
tl.layers.initialize_global_variables(sess)

n_epoch =50
batch_size = 50
print_freq = 100
X_train, X_val, Tar_train, Tar_val = train_test_split(Xtr, Ttr_tar, test_size=0.15, random_state=i, stratify=Ttr_tar)

for epoch in range(n_epoch):
    batch=nn_batch_generator(X_train, Tar_train,1000)
    for x_t,y_t in batch:
        x_t_es1, x_t_es2, x_t_prs, x_t_rd, x_t_pt,x_t_x2,x_t_x3 = breakInput(x_t)
        y_t_batch = y_t
        feed_dict={}
        
        feed_dict[placholder['n_es1']]=x_t_es1
        feed_dict[placholder['n_es2']]=x_t_es2
        feed_dict[placholder['n_prs']]=x_t_prs
        feed_dict[placholder['n_rd']]=x_t_rd
        feed_dict[placholder['n_pt']]=x_t_pt
        feed_dict[x2_placeholder]=x_t_x2
        feed_dict[x3_placeholder]=x_t_x3
        feed_dict[y_]=y_t_batch
        
        feed_dict.update(net_train.all_drop)  # enable noise layers
        sess.run(train_op, feed_dict=feed_dict)
        #print("cost:{} accuray:{}".format(c,a))
        if epoch + 1 == 1 or (epoch + 1) % print_freq == 0:
            
            train_loss, train_acc, n_batch = 0, 0, 0
            batcha=nn_batch_generator(X_train, Tar_train,1000)
            for x_t_a,y_t_a in batcha:
                x_t_es1_a, x_t_es2_a, x_t_prs_a, x_t_rd_a, x_t_pt_a,x_t_x2_a,x_t_x3_a = breakInput(x_t_a)
                y_t_batch_a = y_t_a
                feed_dict_a={}
                dp_dict = tl.utils.dict_to_one(net_train.all_drop)  # disable noise layers
                
                feed_dict_a[placholder['n_es1']]=x_t_es1
                feed_dict_a[placholder['n_es2']]=x_t_es2
                feed_dict_a[placholder['n_prs']]=x_t_prs
                feed_dict_a[placholder['n_rd']]=x_t_rd
                feed_dict_a[placholder['n_pt']]=x_t_pt
                feed_dict_a[x2_placeholder]=x_t_x2
                feed_dict_a[x3_placeholder]=x_t_x3
                feed_dict_a[y_]=y_t_batch
                feed_dict_a.update(dp_dict)

                err, ac = sess.run([cost_test, acc], feed_dict=feed_dict_a)
                train_loss += err
                train_acc += ac
                n_batch += 1
            print("   train loss: %f" % (train_loss / n_batch))
            print("   train acc: %f" % (train_acc / n_batch))




   train loss: 675.754761
   train acc: 0.169000
   train loss: 610.341248
   train acc: 0.151000


KeyboardInterrupt: 