In [12]:
import gc
import os
import pickle
import re

import numpy as np
import pandas as pd
import pylab as pl
from nltk.stem import PorterStemmer
from scipy import sparse
from scipy.stats import pearsonr
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Read the train, test and resources tables from the local data/ directory.
train, test, resources = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/resources.csv'),
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Show the column dtypes of each raw table, in load order.
for frame in (train, test, resources):
    print(frame.dtypes)

id                                              object
teacher_id                                      object
teacher_prefix                                  object
school_state                                    object
project_submitted_datetime                      object
project_grade_category                          object
project_subject_categories                      object
project_subject_subcategories                   object
project_title                                   object
project_essay_1                                 object
project_essay_2                                 object
project_essay_3                                 object
project_essay_4                                 object
project_resource_summary                        object
teacher_number_of_previously_posted_projects     int64
project_is_approved                              int64
dtype: object
id                                              object
teacher_id                                      obj

In [8]:
# Stack train and test so the feature engineering below runs on both at once.
# Later cells tell the two apart by whether project_is_approved is missing.
df = pd.concat([train, test])

# Rows without a third essay: swap essay 2 and essay 3 so that essay 2 becomes
# empty and its text moves into the essay-3 slot. The SAME mask must select the
# rows on both sides of the assignment; using two different masks either raises
# a shape mismatch or pairs values with the wrong rows.
no_third_essay = df['project_essay_3'].isna()
df.loc[no_third_essay, ['project_essay_2', 'project_essay_3']] = \
    df.loc[no_third_essay, ['project_essay_3', 'project_essay_2']].values

# Missing essays become empty strings so the concatenations below are plain
# string additions.
df[['project_essay_2', 'project_essay_4']] = df[['project_essay_2', 'project_essay_4']].fillna("")

# Collapse four essay columns into two: (1 + 2) and (3 + 4). Column-wise string
# addition replaces the much slower row-wise apply.
df['project_essay_1'] = df['project_essay_1'] + df['project_essay_2']
df['project_essay_2'] = df['project_essay_3'] + df['project_essay_4']
df = df.drop(['project_essay_3', 'project_essay_4'], axis=1)

In [10]:
# Cost of each resource line: unit price times quantity.
resources['total_price'] = resources['quantity'] * resources['price']

# All aggregates below are computed per project id.
per_project = resources.groupby('id')

# Item count plus summed quantity / price / total price.
R = per_project.agg({'description': 'count', 'quantity': 'sum', 'price': 'sum', 'total_price': 'sum'})
R = R.rename(columns={'description': 'items'})
R['avg_price'] = R['total_price'] / R['quantity']

# min / max / mean / std of the three numeric columns, suffixed with the
# name of the aggregation.
value_cols = ['quantity', 'price', 'total_price']
for func in ['min', 'max', 'mean', 'std']:
    stats = per_project[value_cols].agg(func)
    stats.columns = [name + '_' + func for name in value_cols]
    R = R.join(stats)

# All item descriptions of a project joined into one space-separated string.
joined_descriptions = per_project['description'].agg(lambda x: ' '.join(x.astype(str)))
R = R.join(joined_descriptions.rename('resource_description'))

# Attach the per-project aggregates to the project table.
df = df.join(R, on='id')

# Bucket the project's total price.
price_bins = [0, 50, 100, 250, 500, 1000, np.inf]
df['price_category'] = pd.cut(df['total_price'], price_bins)

# Range (max - min) of each numeric resource column.
for base in ['quantity', 'price', 'total_price']:
    upper = df['%s_max' % base]
    lower = df['%s_min' % base]
    df['max%s_min%s' % (base, base)] = upper - lower

In [13]:
# Replace the teacher id strings with integer codes.
le = LabelEncoder()
df['teacher_id'] = le.fit_transform(df['teacher_id'])

# 1 when the prefix is anything other than Ms./Mrs./Mr. (including missing), else 0.
known_prefixes = ['Ms.', 'Mrs.', 'Mr.']
df['teacher_gender_unknown'] = (~df['teacher_prefix'].isin(known_prefixes)).astype('int64')

statFeatures = []
frequency_columns = [
    'school_state',
    'teacher_id',
    'teacher_prefix',
    'teacher_gender_unknown',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'teacher_number_of_previously_posted_projects',
]
# For every listed column add "<col>_stat": the share of rows carrying that value.
for col in frequency_columns:
    share = df.groupby(col)['id'].count()
    share = (share / share.sum()).rename(col + '_stat')
    df = df.join(share, on=col)

In [16]:
%%time
numFeatures=[df.columns[i] for i,j in enumerate(df.dtypes) if j == 'float64' and not (df.columns[i]=='project_is_approved') ]
T2 = df[numFeatures+['project_is_approved']].copy()
Ttr = T2[-pd.isna(df.project_is_approved)]
Tar_tr = Ttr['project_is_approved'].values
n = 10
inx = [np.random.randint(0, Ttr.shape[0], int(Ttr.shape[0]/n)) for k in range(n)]
# inx is used for crossvalidation of calculating the correlation and p-value
Corr = {}
for c in numFeatures:
    # since some values might be 0s, I use x+1 to avoid missing some important relations
    C1,P1=np.nanmean([pearsonr(Tar_tr[inx[k]],   (1+Ttr[c].iloc[inx[k]])) for k in range(n)], 0)
    C2,P2=np.nanmean([pearsonr(Tar_tr[inx[k]], 1/(1+Ttr[c].iloc[inx[k]])) for k in range(n)], 0)
    if P2<P1:
        T2[c] = 1/(1+T2[c])
        Corr[c] = [C2,P2]
    else:
        T2[c] = 1+T2[c]
        Corr[c] = [C1,P1]
        
        
polyCol = []
thrP = 0.01
thrC = 0.02
print('columns \t\t\t Corr1 \t\t Corr2 \t\t Corr Combined')
for i, c1 in enumerate(numFeatures[:-1]):
    C1, P1 = Corr[c1]
    for c2 in numFeatures[i+1:]:
        C2, P2 = Corr[c2]
        V = T2[c1] * T2[c2]
        Vtr = V[-pd.isna(T2.project_is_approved)].values
        C, P = np.nanmean([pearsonr(Tar_tr[inx[k]], Vtr[inx[k]]) for k in range(n)], 0)
        if P<thrP and abs(C) - max(abs(C1),abs(C2)) > thrC:
            df[c1+'_'+c2+'_poly'] = V
            
            

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


columns 			 Corr1 		 Corr2 		 Corr Combined




CPU times: user 1.8 s, sys: 84 ms, total: 1.88 s
Wall time: 1.54 s


In [20]:
dateCol = 'project_submitted_datetime'
def getTimeFeatures(T):
    """Add calendar features derived from T[dateCol] to T and return T.

    The columns year, month, day, dow (day of week), hour and days (whole days
    elapsed since the earliest timestamp in the column) are written to the
    frame that was passed in, not to the global df.

    Parameters
    ----------
    T : pandas.DataFrame
        Frame whose dateCol column already has a datetime dtype.

    Returns
    -------
    pandas.DataFrame
        The same frame T, with the six new columns added in place.
    """
    stamps = T[dateCol]
    T['year'] = stamps.dt.year
    T['month'] = stamps.dt.month
    T['day'] = stamps.dt.day
    T['dow'] = stamps.dt.dayofweek
    T['hour'] = stamps.dt.hour
    T['days'] = (stamps - stamps.min()).dt.days
    return T

# Parse the submission timestamp, then derive the calendar columns from it.
df[dateCol] = pd.to_datetime(df[dateCol])
df = getTimeFeatures(df)

# For each calendar column add "<col>_stat": the share of rows with that value.
timeFeatures = ['year', 'month', 'day', 'dow', 'hour', 'days']
for col in timeFeatures:
    share = df.groupby(col)['id'].count()
    share = (share / share.sum()).rename(col + '_stat')
    df = df.join(share, on=col)
    

In [84]:
def getCatFeatures(T, Col):
    """Return a binary indicator matrix for the comma-separated values in T[Col].

    Missing entries are replaced by the empty string before vectorizing. Each
    cell is split on commas and every piece is stripped of surrounding
    whitespace; each distinct piece becomes one column.
    """
    def split_on_commas(text):
        return [piece.strip() for piece in text.split(',')]

    encoder = CountVectorizer(tokenizer=split_on_commas,
                              ngram_range=(1,1),
                              binary=True)
    filled = T[Col].fillna('')
    return encoder.fit_transform(filled)

# One indicator matrix per categorical column, built in the order in which
# they are stacked below.
X_tp, X_ss, X_pgc, X_psc, X_pssc = [
    getCatFeatures(df, name)
    for name in (
        'teacher_prefix',
        'school_state',
        'project_grade_category',
        'project_subject_categories',
        'project_subject_subcategories',
    )
]

# Column-wise concatenation of all categorical indicator matrices.
X_cat = sparse.hstack((X_tp, X_ss, X_pgc, X_psc, X_pssc))

In [76]:
p = PorterStemmer()
def wordPreProcess(sentence):
    """Lower-case, tokenize and stem a sentence; return the stems joined by spaces.

    The text is split on every non-word character, empty pieces are dropped,
    and each remaining piece is lower-cased and stemmed with the Porter stemmer.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence,
    # which newer Python versions warn about.
    pieces = re.split(r'\W', sentence)
    return ' '.join(p.stem(piece.lower()) for piece in pieces if piece)



def getTextFeatures(T, Col, max_features=10000, ngrams=(1,2), verbose=True):
    """Vectorize the text column T[Col] into binary n-gram indicators.

    Parameters
    ----------
    T : pandas.DataFrame
        Frame holding the text column.
    Col : str
        Name of the text column.
    max_features : int
        Vocabulary size cap passed to CountVectorizer.
    ngrams : tuple
        (min_n, max_n) n-gram range passed to CountVectorizer.
    verbose : bool
        When true, print the name of the column being processed.

    Returns
    -------
    (X, feature_names)
        X is the sparse document-term matrix; feature_names is the list of
        vocabulary terms in column order.
    """
    if verbose:
        print('processing: ', Col)
    vectorizer = CountVectorizer(stop_words=None,
                                 preprocessor=wordPreProcess,
                                 max_features=max_features,
                                 binary=True,
                                 ngram_range=ngrams)
    # Missing text is treated as an empty document (same convention as
    # getCatFeatures); the preprocessor cannot handle NaN.
    X = vectorizer.fit_transform(T[Col].fillna(''))
    # get_feature_names() is gone from newer scikit-learn releases; prefer its
    # replacement when present and keep returning a plain list either way.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()
    return X, feature_names

# Vocabulary size per text column.
n_es1 = 3000
n_es2 = 8000
n_prs = 2000
n_rd = 3000
n_pt = 1000

# (column, vocabulary cap, n-gram range) for each text field, in processing order.
text_specs = (
    ('project_essay_1', n_es1, (1,2)),
    ('project_essay_2', n_es2, (1,2)),
    ('project_resource_summary', n_prs, (1,2)),
    ('resource_description', n_rd, (1,3)),
    ('project_title', n_pt, (1,2)),
)

# Unpack straight from a generator so that no extra container keeps the
# matrices alive.
(X_es1, feat_es1), (X_es2, feat_es2), (X_prs, feat_prs), (X_rd, feat_rd), (X_pt, feat_pt) = (
    getTextFeatures(df, column, max_features=cap, ngrams=span)
    for column, cap, span in text_specs
)

processing:  project_essay_1
processing:  project_essay_2
processing:  project_resource_summary
processing:  resource_description
processing:  project_title


In [81]:
# Column-wise concatenation of the five text matrices; the individual
# matrices are then dropped from the namespace.
text_blocks = [X_es1, X_es2, X_prs, X_rd, X_pt]
X_txt = sparse.hstack(text_blocks)
del text_blocks
del X_es1, X_es2, X_prs, X_rd, X_pt

In [None]:
def preprocess_str(string):
    """Replace quoting, line-break and separator characters in string with spaces.

    Every match of each pattern below is replaced by a single space and the
    cleaned string is returned.
    """
    # Multi-character patterns come first: once the single characters '"',
    # '\r' and '\n' have been replaced, the '""""' and '\r\n' patterns could
    # never match.
    patterns = (
        r'\"\"\"\"',
        r'(\r\n)',
        r'(\")',
        r'(\r)',
        r'(\n)',
        r'(\\)',
        r'\t',
        r'\:',
        r'_',
        r'\+',
        r'\=',
    )
    for pattern in patterns:
        string = re.sub(pattern, ' ', string)
    return string
    

In [2]:
# Simple on-disk cache: when a cache file exists it replaces the in-memory
# object; otherwise the in-memory object is written out for next time.
# NOTE: unpickling executes arbitrary code, so only load cache files that this
# notebook wrote itself.
df_path = 'data/df.p'
if os.path.exists(df_path):
    df = pd.read_pickle(df_path)
else:
    df.to_pickle(df_path)

txt_path = 'data/txt.npy'
if os.path.exists(txt_path):
    # "with" closes the file handle even if unpickling fails.
    with open(txt_path, 'rb') as fh:
        X_txt = pickle.load(fh)
else:
    with open(txt_path, 'wb') as fh:
        pickle.dump(X_txt, fh)

cat_path = 'data/cat.npy'
if os.path.exists(cat_path):
    with open(cat_path, 'rb') as fh:
        X_cat = pickle.load(fh)
else:
    with open(cat_path, 'wb') as fh:
        pickle.dump(X_cat, fh)

In [28]:
from sklearn.preprocessing import StandardScaler

# Float-valued columns, excluding the target itself.
numFeatures = [col for col, dtype in zip(df.columns, df.dtypes)
               if dtype == 'float64' and col != 'project_is_approved']

# Full design matrix: text indicators | categorical indicators | standardized
# numeric features (missing values filled with 0 before scaling).
num_scaled = StandardScaler().fit_transform(df[numFeatures].fillna(0))
X = sparse.hstack((X_txt, X_cat, num_scaled)).tocsr()

# Rows without a target are the test rows; all others are training rows.
is_test = pd.isna(df.project_is_approved).values
Xtr = X[np.flatnonzero(~is_test)]
Xts = X[np.flatnonzero(is_test)]
Ttr_tar = df[~is_test]['project_is_approved'].values
# Tts pairs with Xts, so it must hold the TEST rows; it previously reused the
# training mask and therefore duplicated the training ids.
Tts = df[is_test][['id', 'project_is_approved']]

In [35]:
from keras.layers import Input, Dense, Flatten, concatenate, Dropout, Embedding, SpatialDropout1D
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.models import Model
from keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

def breakInput(X1):
    """Split X1 column-wise into the seven consecutive blocks the network expects.

    Block widths, in order: the five text vocabularies (n_es1, n_es2, n_prs,
    n_rd, n_pt), the number of columns of X_cat, and the number of numeric
    features. Returns the list of column slices.
    """
    widths = [n_es1, n_es2, n_prs, n_rd, n_pt, X_cat.shape[1], len(numFeatures)]
    # Running column offsets: edges[k] is where block k starts.
    edges = [0]
    for width in widths:
        edges.append(edges[-1] + width)
    return [X1[:, lo:hi] for lo, hi in zip(edges, edges[1:])]

def getModel(HLs, Drop=0.25, OP=None):
    """Build and compile the multi-input network.

    Inputs, in order: the five text blocks (widths n_es1, n_es2, n_prs, n_rd,
    n_pt), the categorical block (X_cat.shape[1] columns) and the numeric block
    (len(numFeatures) columns) -- the same order breakInput produces.

    Architecture:
      * each text block: Dropout -> linear Dense with n // 100 units -> Dropout;
        the five results are concatenated -> Dense(50, relu) -> Dropout
      * categorical block: Embedding(2, 10) -> SpatialDropout1D -> Flatten
      * numeric block: Dropout
      * the three branches are concatenated, followed by one Dense(relu) +
        Dropout pair per entry of HLs and a single sigmoid output unit.

    Parameters
    ----------
    HLs : iterable of int
        Sizes of the hidden layers after the branches are merged.
    Drop : float
        Dropout rate used everywhere in the network.
    OP : keras optimizer or None
        Optimizer to compile with. None creates a fresh Adam() per call; a
        default instance built once in the signature would be shared by every
        model created without an explicit optimizer.

    Returns
    -------
    The compiled model (binary cross-entropy loss, binary accuracy metric).
    """
    if OP is None:
        OP = optimizers.Adam()

    text_branches = []
    inputs_txt = []
    for n in [n_es1, n_es2, n_prs, n_rd, n_pt]:
        input_txt = Input((n, ))
        X_feat = Dropout(Drop)(input_txt)
        X_feat = Dense(n // 100, activation="linear")(X_feat)
        X_feat = Dropout(Drop)(X_feat)
        text_branches.append(X_feat)
        inputs_txt.append(input_txt)

    x_1 = concatenate(text_branches)
    x_1 = Dense(50, activation="relu")(x_1)
    x_1 = Dropout(Drop)(x_1)

    input_cat = Input((X_cat.shape[1], ))
    x_2 = Embedding(2, 10, input_length=X_cat.shape[1])(input_cat)
    x_2 = SpatialDropout1D(Drop)(x_2)
    x_2 = Flatten()(x_2)

    input_num = Input((len(numFeatures), ))
    x_3 = Dropout(Drop)(input_num)

    x = concatenate([x_1, x_2, x_3])

    for HL in HLs:
        x = Dense(HL, activation="relu")(x)
        x = Dropout(Drop)(x)

    output = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=inputs_txt+[input_cat, input_num], outputs=output)
    model.compile(
            optimizer=OP,
            loss='binary_crossentropy',
            metrics=['binary_accuracy'])
    return model

def trainNN(X_train, X_val, Tar_train, Tar_val, HL=[50], Drop=0.5, OP=None):
    """Train the network from getModel and return it with its best weights loaded.

    Training runs for at most 50 epochs with batch size 1000 and monitors
    val_loss with three callbacks: ModelCheckpoint (keeps the best weights in
    'NN.h5'), EarlyStopping (patience 6) and ReduceLROnPlateau (factor 0.5,
    patience 2). After fitting, the checkpointed weights are loaded back into
    the model.

    Parameters
    ----------
    X_train, X_val : matrices accepted by breakInput
        Training and validation design matrices.
    Tar_train, Tar_val : array-like
        Matching targets.
    HL : list of int
        Hidden layer sizes forwarded to getModel (the default list is never
        mutated).
    Drop : float
        Dropout rate forwarded to getModel.
    OP : keras optimizer or None
        Optimizer forwarded to getModel. None creates a fresh Adam() per call
        rather than sharing one instance created when the function was defined.
    """
    if OP is None:
        OP = optimizers.Adam()

    file_path='NN.h5'
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min')
    early = EarlyStopping(monitor="val_loss", mode="min", patience=6)
    # NOTE(review): newer Keras releases name the `epsilon` argument
    # `min_delta` -- confirm against the installed version.
    lr_reduced = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=2,
                                   verbose=1,
                                   epsilon=3e-4,
                                   mode='min')

    model = getModel(HL, Drop, OP)
    model.fit(breakInput(X_train), Tar_train, validation_data=(breakInput(X_val), Tar_val),
                        verbose=2, epochs=50, batch_size=1000, callbacks=[early, lr_reduced, checkpoint])
    # Roll back to the checkpoint with the lowest validation loss.
    model.load_weights(file_path)
    return model

# Booster parameters handed to xgb.train in the training loop of this cell.
params_xgb = {
        'eta': 0.05,
        'max_depth': 4,
        'subsample': 0.85,
        'colsample_bytree': 0.25,
        'min_child_weight': 3,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'seed': 0,
        'silent': 1,
    }
# Booster parameters handed to lgb.train in the training loop of this cell.
# NOTE(review): that call also requests early stopping; check whether the
# installed LightGBM honours early stopping together with 'dart' boosting.
params_lgb = {
        'boosting_type': 'dart',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 10,
        'learning_rate': 0.05,
        'feature_fraction': 0.25,
        'bagging_fraction': 0.85,
        'seed': 0,
        'verbose': 0,
    }
# NOTE(review): xgb and lgb are not imported anywhere in the visible notebook;
# presumably `import xgboost as xgb` and `import lightgbm as lgb` -- confirm
# and add them to the import cell.
nCV = 1 # should be ideally larger
# One stacked test-set prediction vector per split. It has to exist before the
# loop appends to it.
Yts = []
# nCV controls how many random splits are run; the split seeds start at 21.
for i in range(21, 21 + nCV):
    gc.collect()
    X_train, X_val, Tar_train, Tar_val = train_test_split(Xtr, Ttr_tar, test_size=0.15, random_state=i, stratify=Ttr_tar)
    # XGB
    dtrain = xgb.DMatrix(X_train, label=Tar_train)
    dval   = xgb.DMatrix(X_val, label=Tar_val)
    watchlist = [(dtrain, 'train'), (dval, 'valid')]
    model = xgb.train(params_xgb, dtrain, 5000,  watchlist, maximize=True, verbose_eval=200, early_stopping_rounds=200)
    Yvl1 = model.predict(dval)
    Yts1 = model.predict(xgb.DMatrix(Xts))
    # LGB
    dtrain = lgb.Dataset(X_train, Tar_train)
    dval   = lgb.Dataset(X_val, Tar_val)
    model = lgb.train(params_lgb, dtrain, num_boost_round=10000, valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=200)
    Yvl2 = model.predict(X_val)
    Yts2 = model.predict(Xts)
    # NN
    model = trainNN(X_train, X_val, Tar_train, Tar_val, HL=[50], Drop=0.5, OP=optimizers.Adam())
    Yvl3 = model.predict(breakInput(X_val)).squeeze()
    Yts3 = model.predict(breakInput(Xts)).squeeze()
    # stack: fit a linear blend of the three models on the validation
    # predictions, then apply it to their test predictions
    M = LinearRegression()
    M.fit(np.array([Yvl1, Yvl2, Yvl3]).T, Tar_val)
    Yts.append(M.predict(np.array([Yts1, Yts2, Yts3]).T))

ModuleNotFoundError: No module named 'keras'