## IMPORT PACKAGES & LIBRARIES

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from scipy.sparse import hstack
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm_notebook
from scipy import stats
import gc

## IMPORT & CLEAN DATA

In [3]:
train = pd.read_csv('./data/train.csv').fillna(' ')
test = pd.read_csv('./data/test.csv').fillna(' ')
sample_submission = pd.read_csv('./data/sample_submission.csv').fillna(' ')

In [4]:
train_question_body = train['question_body']
test_question_body = test['question_body']
all_question_body = pd.concat([train_question_body, test_question_body])
train_answer = train['answer']
test_answer = test['answer']
all_answer = pd.concat([train_answer, test_answer])
train_title = train['question_title']
test_title = test['question_title']
all_title = pd.concat([train_title, test_title])
all_title.head()

0    What am I losing when using extension tubes in...
1    What is the distinction between a city and a s...
2    Maximum protusion length for through-hole comp...
3                Can an affidavit be used in Beit Din?
4         How do you make a binary image in Photoshop?
Name: question_title, dtype: object

In [5]:
categories = list(sample_submission.columns[1:])
question_category = categories[:21]
answer_categories = categories[21:]
updated_categories = [class_name + '_2' for class_name in categories]
for category in categories:
    train[category + '_2'] = (train[category].values >= 0.5) * 1
categories

['question_asker_intent_understanding',
 'question_body_critical',
 'question_conversational',
 'question_expect_short_answer',
 'question_fact_seeking',
 'question_has_commonly_accepted_answer',
 'question_interestingness_others',
 'question_interestingness_self',
 'question_multi_intent',
 'question_not_really_a_question',
 'question_opinion_seeking',
 'question_type_choice',
 'question_type_compare',
 'question_type_consequence',
 'question_type_definition',
 'question_type_entity',
 'question_type_instructions',
 'question_type_procedure',
 'question_type_reason_explanation',
 'question_type_spelling',
 'question_well_written',
 'answer_helpful',
 'answer_level_of_information',
 'answer_plausible',
 'answer_relevance',
 'answer_satisfaction',
 'answer_type_instructions',
 'answer_type_procedure',
 'answer_type_reason_explanation',
 'answer_well_written']

## MODELING

In [11]:
word_vector = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=80000)

word_vector.fit(all_question_body)
train_feature_1 = word_vector.transform(train_question_body)
test_feature_1 = word_vector.transform(test_question_body)
print(train_feature_1)

  (0, 76150)	0.08315981586818058
  (0, 76148)	0.03452764863131048
  (0, 74185)	0.11372399622820195
  (0, 73747)	0.0329701950261642
  (0, 73044)	0.11372399622820195
  (0, 72971)	0.04376460566084368
  (0, 69963)	0.11372399622820195
  (0, 69962)	0.11372399622820195
  (0, 69961)	0.11372399622820195
  (0, 69960)	0.22463246754517688
  (0, 65675)	0.10465244403365306
  (0, 65662)	0.08403263812247032
  (0, 62181)	0.11372399622820195
  (0, 62179)	0.09796702218058759
  (0, 61227)	0.07771332359264357
  (0, 60353)	0.08706633789024978
  (0, 58678)	0.09453333423399796
  (0, 58443)	0.06623245684618298
  (0, 56781)	0.0828917171929866
  (0, 55252)	0.10703857437513647
  (0, 55217)	0.09796702218058759
  (0, 53653)	0.10088740969624999
  (0, 53632)	0.048621196969562394
  (0, 53485)	0.08463484928397604
  (0, 51748)	0.06151823387856895
  :	:
  (6078, 57069)	0.17205315761674697
  (6078, 57067)	0.2408080389006527
  (6078, 56153)	0.17205315761674697
  (6078, 56115)	0.04665639316201022
  (6078, 55932)	0.115527854

In [12]:
#for cross validation, 2 train feature sets.
word_vector = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=80000)

word_vector.fit(all_answer)
train_feature_2 = word_vector.transform(train_answer)
test_feature_2 = word_vector.transform(test_answer)

In [13]:
# for testing
word_vector = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=80000)

word_vector.fit(all_title)
train_feature_3 = word_vector.transform(train_title)
test_feature_3 = word_vector.transform(test_title)

In [14]:
#by character
character_vector = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(1, 4),
    max_features=47000)

character_vector.fit(all_question_body)
train_character_feature_1 = character_vector.transform(train_question_body)
test_character_feature_1 = character_vector.transform(test_question_body)

In [15]:
character_vector = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(1, 4),
    max_features=47000)

character_vector.fit(all_answer)
train_character_feature_2 = character_vector.transform(train_answer)
test_character_feature_2 = character_vector.transform(test_answer)


In [16]:
character_vector = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(1, 4),
    max_features=47000)

character_vector.fit(all_title)
train_character_feature_3 = character_vector.transform(train_title)
test_character_feature_3 = character_vector.transform(test_title)

In [17]:
%%time
train_features_1 = hstack([train_character_feature_1, train_feature_1, train_character_feature_3, train_feature_3])
test_features_1 = hstack([test_character_feature_1, test_feature_1, test_character_feature_3, test_feature_3])
train_features_2 = hstack([train_character_feature_2, train_feature_2])
test_features_2 = hstack([test_character_feature_2, test_feature_2])

Wall time: 305 ms


In [18]:
train_features_1= train_features_1.tocsr()
train_features_2= train_features_2.tocsr()

In [19]:
alpha_value = {'question_asker_intent_understanding': 40,
         'question_body_critical':7,
          'question_conversational':35,
          'question_expect_short_answer':65,
          'question_fact_seeking':10,
          'question_has_commonly_accepted_answer':25,
          'question_interestingness_others':50,
          'question_interestingness_self':30,
          'question_multi_intent':7,
          'question_not_really_a_question':55,
          'question_opinion_seeking':15,
          'question_type_choice':4,
               'question_type_compare':30,
               'question_type_consequence':45,
               'question_type_definition':60,
               'question_type_entity':11,
               'question_type_instructions':6,
               'question_type_procedure':40,
               'question_type_reason_explanation':13,
               'question_type_spelling':1,
               'question_well_written':8,
               'answer_helpful':30,
               'answer_level_of_information':8,
               'answer_plausible':20,
               'answer_relevance':60,
               'answer_satisfaction':11,
               'answer_type_instructions':3,
               'answer_type_procedure':25,
               'answer_type_reason_explanation':3,
               'answer_well_written':25
               }

In [20]:
def spearman_correlation(y, y_prediction):
    if np.ndim(y_prediction) != 2:
        correlation = stats.spearmanr(y, y_prediction)[0]
    else:
        correlation = np.mean([stats.spearmanr(y[:, i],
                                               y_prediction[:, i])[0] for i in range(y.shape[1])])
    return correlation

In [21]:
train_prediction = []
test_prediction = []
final_score = []
spearman_scores = []

In [22]:
def train_predict(split_num, train_feature, test_feature, train_prediction, test_prediction_final, final_score, spearman_scores):
    for category in tqdm_notebook(question_category):
        print(category)
        y_train = train[category]
        k_fold = KFold(n_splits=split_num, random_state=47)
        train_oof = np.zeros((train_feature.shape[0],))
        test_prediction = 0

        for index, (train_index, validation_index) in enumerate(k_fold.split(train_feature)):
            train_fea = train_feature[train_index]
            train_goal = y_train[train_index]

            validation_fea = train_feature[validation_index]
            validation_goal = y_train[validation_index]

            model = Ridge(alpha=alpha_value[category])
            model.fit(train_fea, train_goal)
            validation_prediction = model.predict(validation_fea)
            train_oof[validation_index] = validation_prediction

            test_prediction = test_prediction + model.predict(test_feature) / split_num
            del train_fea, train_goal, validation_fea, validation_goal
            gc.collect()

        model = Ridge(alpha=alpha_value[category])
        model.fit(train_feature, y_train)

        minmax_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
        test_prediction = minmax_scaler.fit_transform(test_prediction.reshape(-1, 1)).flatten()
        prediction = model.predict(test_feature)
        minmax_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
        prediction = minmax_scaler.fit_transform(prediction.reshape(-1, 1)).flatten()
        sample_submission[category] = (0.75 * test_prediction + 0.25 * prediction + 0.000005) / 1.00001
        temp_spear_score = spearman_correlation(train[category], train_oof)
        spearman_scores.append(temp_spear_score)
        score = roc_auc_score(train[category + '_2'], train_oof)
        train_prediction.append(train_oof)
        test_prediction_final.append(test_prediction)
        final_score.append(score)
        print("spearman correlation is: ", temp_spear_score)
        print("auc score is: ", score, "\n")
    return;

In [23]:
train_predict(3, train_features_1, test_features_1, train_prediction, test_prediction, final_score, spearman_scores)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

question_asker_intent_understanding
spearman correlation is:  0.3533069088948108
auc score is:  0.6595442990498417 

question_body_critical
spearman correlation is:  0.6218502362952351
auc score is:  0.8055333291045181 

question_conversational
spearman correlation is:  0.38217962093438507
auc score is:  0.8775131307497958 

question_expect_short_answer
spearman correlation is:  0.23001186290285297
auc score is:  0.6386370307582678 

question_fact_seeking
spearman correlation is:  0.31074853413846165
auc score is:  0.6965219325892692 

question_has_commonly_accepted_answer
spearman correlation is:  0.40353730344683264
auc score is:  0.7975552764190871 

question_interestingness_others
spearman correlation is:  0.3470694174305782
auc score is:  0.6464758638080675 

question_interestingness_self
spearman correlation is:  0.48190344627512066
auc score is:  0.7692183513253747 

question_multi_intent
spearman correlation is:  0.4797550736298078
auc score is:  0.7762181891082337 

question_n

In [24]:
train_predict(3, train_features_2, test_features_2, train_prediction, test_prediction, final_score, spearman_scores)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

question_asker_intent_understanding
spearman correlation is:  0.2568589363313856
auc score is:  0.6218411401900317 

question_body_critical
spearman correlation is:  0.3250516894027428
auc score is:  0.6503204460191747 

question_conversational
spearman correlation is:  0.33597167670217526
auc score is:  0.823580182979458 

question_expect_short_answer
spearman correlation is:  0.14590287429343532
auc score is:  0.5931840918715908 

question_fact_seeking
spearman correlation is:  0.21983307738658528
auc score is:  0.6451153450532366 

question_has_commonly_accepted_answer
spearman correlation is:  0.32830421846848606
auc score is:  0.7403095055565353 

question_interestingness_others
spearman correlation is:  0.2812893625684301
auc score is:  0.6281764750035632 

question_interestingness_self
spearman correlation is:  0.40653653737020173
auc score is:  0.7281912067865246 

question_multi_intent
spearman correlation is:  0.2537890474180475
auc score is:  0.6450536941660739 

question_no

In [25]:
print("Mean auc:", np.mean(final_score))
print("Mean spearman_scores", np.mean(spearman_scores))

Mean auc: 0.7495157147032577
Mean spearman_scores 0.3286768122043463


In [26]:
sample_submission

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.639418,0.742046,0.403874,0.741783,0.483068,0.574803,0.613298,0.562991,0.378518,...,0.679445,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308
1,46,0.329173,0.345592,0.151592,0.707064,0.548496,0.732382,0.148041,0.258359,0.257798,...,0.236652,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448
2,70,0.752756,0.617633,0.334269,0.828107,0.652149,0.646870,0.583635,0.546743,0.326725,...,0.630881,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673
3,132,0.522018,0.270841,0.220361,0.684175,0.699514,0.696677,0.462743,0.365237,0.440689,...,0.400006,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401
4,200,0.894465,0.712474,0.702230,0.561335,0.292940,0.247564,0.777040,0.774746,0.777696,...,0.829274,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,9569,0.453129,0.494271,0.291700,0.759272,0.609016,0.697837,0.385219,0.398608,0.425862,...,0.440545,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159
472,9590,0.299736,0.391096,0.140751,0.792002,0.585570,0.797068,0.160703,0.220256,0.219415,...,0.292728,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355
473,9597,0.566950,0.590504,0.334155,0.769069,0.544037,0.563753,0.533719,0.474477,0.361222,...,0.430879,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467
474,9623,0.749350,0.650156,0.561324,0.575811,0.452632,0.380144,0.743549,0.641104,0.605940,...,0.715696,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720
