In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import math
from tqdm import tqdm

  from numpy.core.umath_tests import inner1d


In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question
0,415,reflected sunlight,absorbed light from Earth's atmosphere,gases in the Moon's interior,volcanic eruptions on the Moon's surface,A,The Moon is visible to observers on Earth beca...
1,158,grasses -> trees -> bushes,trees -> bushes -> grasses,bushes -> grasses -> trees,grasses -> bushes -> trees,D,Which order of succession of natural communiti...
2,1959,an ion,a nucleus,a neutron,an electron,B,If the solar system were used as a model of an...
3,2542,Gravity converts solid matter into gases and l...,Gravity causes gases and dust particles to con...,Gravity cools gases and liquids until they bec...,Gravity pushes rocks and dust particles outwar...,B,Which of the following statements best describ...
4,1059,centimeters,grams,liters,degrees Celsius,A,Which unit of measurement can be used to descr...


In [4]:
df['correctAnswer'] = pd.factorize(df['correctAnswer'])[0]

In [5]:
stop_words = set(stopwords.words("english"))

In [6]:
df['tokenizedQuestion'] = df.apply(lambda row: row['question'].translate(str.maketrans('', '', string.punctuation)), axis = 1)
df['tokenizedQuestion'] = df['tokenizedQuestion'].str.lower()

In [7]:
df['tokenizedQuestion'] = df.apply(lambda row: word_tokenize(row['tokenizedQuestion']), axis=1)

In [8]:
df['tokenizedQuestion'] = df.apply(lambda row: [w for w in row['tokenizedQuestion'] if not w in stop_words], axis = 1)

In [9]:
df['cleanQuestion'] = df.apply(lambda row: ' '.join(row['tokenizedQuestion']), axis = 1)

In [12]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [13]:
len_padding = 100
def _paddingWE(words, w2v_model, len_padding):

    len_vec = w2v_model.wv.vector_size

    padding = np.full((len_padding, len_vec), np.nan)
    cut_words_at = np.min((len(words), len_padding))
    words = words[:cut_words_at]
    non_zero_word_index = []
    for i,w in enumerate(words):
        if w in w2v_model.wv:
            non_zero_word_index.append(i)
            vec = w2v_model.wv[w].ravel()
            padding[i,:] = vec
    return (padding, np.array(non_zero_word_index))
len_vec = w2v_model.wv.vector_size
clean_words = df['tokenizedQuestion'].values
WE_padding = np.zeros((len(clean_words), len_padding, len_vec))
non_zero_word_index_ = []
for i in range(len(clean_words)):
    WE_padding[i,:,:], non_zero_array = _paddingWE(clean_words[i], w2v_model, len_padding)
    non_zero_word_index_.append(non_zero_array)

  app.launch_new_instance()
  after removing the cwd from sys.path.
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


In [14]:
def _WE2WEAgg(input_X=None):

    values_to_take = 500
    init_ = 0 
    loop_ = math.ceil(input_X.shape[0]/values_to_take)

    for _ in tqdm(range(loop_)):
        if(init_ == 0):
            WEmean = np.nanmean(input_X[init_:(init_+values_to_take)],axis=1)
            WEmin = np.nanmin(input_X[init_:(init_+values_to_take)],axis=1)
            WEmax = np.nanmax(input_X[init_:(init_+values_to_take)],axis=1)
        else:
            WEmean = np.concatenate((WEmean , np.nanmean(input_X[init_:(init_+values_to_take)],axis=1)), axis=0)
            WEmin = np.concatenate((WEmin , np.nanmin(input_X[init_:(init_+values_to_take)],axis=1)), axis=0)
            WEmax = np.concatenate((WEmax , np.nanmax(input_X[init_:(init_+values_to_take)],axis=1)), axis=0)
        init_ = init_ + values_to_take
    WEAgg = np.hstack((WEmean, WEmin, WEmax))
    WEAgg = np.nan_to_num(WEAgg)
   
    return WEAgg

def _fit_WEAggStacking(input_X=None, input_y=None, stacking_model=None, n_top_WEfeatures=None):

    WEAgg = _WE2WEAgg(input_X)

    stacking_model.fit(WEAgg, input_y)

    WEStacking_model_ = stacking_model

    def _get_top_features(model, n_top_WEfeatures=None):
        feature_names = ['WEAgg%d'%i for i in range(len(model.feature_importances_))]
        topFeatures = pd.DataFrame({'feature':feature_names,'importance':model.feature_importances_})
        topFeatures = topFeatures.sort_values('importance', ascending=False)
        selected_features = topFeatures.feature.tolist()[:n_top_WEfeatures]
        return selected_features            
    topWEfeatures_ = _get_top_features(WEStacking_model_, n_top_WEfeatures)
    
    return topWEfeatures_

def _transform_WEAggStacking(input_X=None, stacking_model=None):
    WEAgg = _WE2WEAgg(input_X)
    predictionWEAgg = stacking_model.predict_proba(WEAgg)[:,1]
    return predictionWEAgg

def _initialize_stackingModel():
    rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, verbose=2, min_samples_leaf=50)
    return rf
    
# colList = ['answerA', 'answerB', 'answerC', 'answerD']
rf = _initialize_stackingModel()
topWEfeatures_ = _fit_WEAggStacking(input_X=WE_padding, input_y=df['correctAnswer'].values, stacking_model=rf, n_top_WEfeatures=200)

predictionWEAgg = _transform_WEAggStacking(WE_padding, stacking_model=rf)
WEAgg_df = pd.DataFrame(_WE2WEAgg(WE_padding), columns=['WEAgg%d'%i for i in range(len(rf.feature_importances_))])
assert predictionWEAgg.shape[0]==df.shape[0]
df['predictionWEAgg'] = predictionWEAgg
for name in topWEfeatures_:
    df[name] = WEAgg_df[name].values

  del sys.path[0]
  
  from ipykernel import kernelapp as app
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.63it/s]


building tree 1 of 200building tree 2 of 200building tree 3 of 200building tree 4 of 200building tree 5 of 200building tree 6 of 200building tree 7 of 200building tree 8 of 200







building tree 9 of 200building tree 10 of 200
building tree 11 of 200building tree 12 of 200


building tree 13 of 200
building tree 14 of 200
building tree 15 of 200
building tree 16 of 200
building tree 17 of 200
building tree 18 of 200building tree 19 of 200

building tree 20 of 200
building tree 21 of 200
building tree 22 of 200
building tree 23 of 200building tree 24 of 200

building tree 25 of 200
building tree 26 of 200
building tree 27 of 200
building tree 28 of 200
building tree 29 of 200
building tree 30 of 200
building tree 31 of 200
building tree 32 of 200
building tree 33 of 200


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.3s


building tree 34 of 200
building tree 35 of 200building tree 36 of 200

building tree 37 of 200
building tree 38 of 200
building tree 39 of 200
building tree 40 of 200
building tree 41 of 200
building tree 42 of 200
building tree 43 of 200
building tree 44 of 200
building tree 45 of 200
building tree 46 of 200
building tree 47 of 200building tree 48 of 200

building tree 49 of 200building tree 50 of 200

building tree 51 of 200
building tree 52 of 200
building tree 53 of 200
building tree 54 of 200
building tree 55 of 200
building tree 56 of 200
building tree 57 of 200
building tree 58 of 200
building tree 59 of 200
building tree 60 of 200
building tree 61 of 200
building tree 62 of 200
building tree 63 of 200
building tree 64 of 200
building tree 65 of 200building tree 66 of 200

building tree 67 of 200
building tree 68 of 200
building tree 69 of 200
building tree 70 of 200
building tree 71 of 200
building tree 72 of 200
building tree 73 of 200
building tree 74 of 200
building tree 75

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    1.8s



building tree 155 of 200
building tree 156 of 200
building tree 157 of 200
building tree 158 of 200
building tree 159 of 200building tree 160 of 200
building tree 161 of 200

building tree 162 of 200
building tree 163 of 200
building tree 164 of 200
building tree 165 of 200
building tree 166 of 200
building tree 167 of 200
building tree 168 of 200
building tree 169 of 200
building tree 170 of 200building tree 171 of 200

building tree 172 of 200
building tree 173 of 200
building tree 174 of 200
building tree 175 of 200
building tree 176 of 200
building tree 177 of 200
building tree 178 of 200
building tree 179 of 200
building tree 180 of 200
building tree 181 of 200building tree 182 of 200

building tree 183 of 200
building tree 184 of 200
building tree 185 of 200
building tree 186 of 200
building tree 187 of 200
building tree 188 of 200
building tree 189 of 200
building tree 190 of 200
building tree 191 of 200
building tree 192 of 200
building tree 193 of 200
building tree 194 of 200

[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    2.4s finished
  del sys.path[0]
  
  from ipykernel import kernelapp as app
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.75it/s]
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.0s finished
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.62it/s]


In [16]:
df.head()

Unnamed: 0,id,answerA,answerB,answerC,answerD,correctAnswer,question,tokenizedQuestion,cleanQuestion,predictionWEAgg,...,WEAgg805,WEAgg201,WEAgg448,WEAgg848,WEAgg876,WEAgg471,WEAgg668,WEAgg169,WEAgg54,WEAgg531
0,415,reflected sunlight,absorbed light from Earth's atmosphere,gases in the Moon's interior,volcanic eruptions on the Moon's surface,0,The Moon is visible to observers on Earth beca...,"[moon, visible, observers, earth]",moon visible observers earth,0.214461,...,0.116699,0.038818,-0.141602,0.242188,0.092773,-0.243164,0.131836,0.058044,-0.048264,-0.192383
1,158,grasses -> trees -> bushes,trees -> bushes -> grasses,bushes -> grasses -> trees,grasses -> bushes -> trees,1,Which order of succession of natural communiti...,"[order, succession, natural, communities, woul...",order succession natural communities would lik...,0.314656,...,0.186523,0.065546,-0.086914,0.294922,0.12793,-0.355469,0.114746,0.031741,-0.038086,-0.115234
2,1959,an ion,a nucleus,a neutron,an electron,2,If the solar system were used as a model of an...,"[solar, system, used, model, atom, would, sun,...",solar system used model atom would sun likely ...,0.228687,...,0.186523,0.010437,-0.335938,0.239258,0.121094,-0.439453,0.083008,-0.014871,-0.006239,-0.140625
3,2542,Gravity converts solid matter into gases and l...,Gravity causes gases and dust particles to con...,Gravity cools gases and liquids until they bec...,Gravity pushes rocks and dust particles outwar...,2,Which of the following statements best describ...,"[following, statements, best, describes, role,...",following statements best describes role gravi...,0.240039,...,0.223633,0.049309,-0.244141,0.25,0.07666,-0.263672,0.140625,0.050194,0.032819,-0.194336
4,1059,centimeters,grams,liters,degrees Celsius,0,Which unit of measurement can be used to descr...,"[unit, measurement, used, describe, length, desk]",unit measurement used describe length desk,0.230302,...,0.204102,0.09638,-0.240234,0.349609,0.120117,-0.330078,0.310547,0.022502,0.062093,-0.296875
