## IMPORT PACKAGES & LIBRARIES

In [1]:
import gc

import numpy as np
import pandas as pd
from scipy import stats
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

## IMPORT & CLEAN DATA

In [2]:
# Load the three competition CSVs; every missing cell is replaced by a single
# space so the text columns can be handed to the vectorizers as plain strings.
train, test, sample_submission = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv')
)

In [3]:
# Pull out the three text columns from train and test, and build a combined
# train+test series for each so the vectorizers can be fitted on both.

# Question body
train_question_body, test_question_body = train['question_body'], test['question_body']
all_question_body = pd.concat([train_question_body, test_question_body])

# Answer
train_answer, test_answer = train['answer'], test['answer']
all_answer = pd.concat([train_answer, test_answer])

# Question title
train_title, test_title = train['question_title'], test['question_title']
all_title = pd.concat([train_title, test_title])

In [4]:
# Target names are every sample_submission column after the first one.
categories = sample_submission.columns[1:].tolist()
# The first 21 names go to question_category, the remainder to answer_categories.
question_category, answer_categories = categories[:21], categories[21:]
updated_categories = [class_name + '_2' for class_name in categories]

# For each target add a 0/1 column '<target>_2' that is 1 where the value is >= 0.5;
# these binarised columns are what the AUC in train_predict is computed against.
for category in categories:
    is_high = train[category].values >= 0.5
    train[category + '_2'] = is_high * 1

## MODELING

In [5]:
# Word-level TF-IDF (unigrams and bigrams, at most 80000 terms) for the question
# body; the vocabulary is fitted on the train and test bodies together.
word_vector = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=80000,
).fit(all_question_body)

train_feature_1, test_feature_1 = (
    word_vector.transform(bodies)
    for bodies in (train_question_body, test_question_body)
)

In [6]:
# Same word-level TF-IDF configuration, this time fitted on all answers
# (train + test) and applied to the answer text.
word_vector = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=80000,
)
word_vector = word_vector.fit(all_answer)

train_feature_2, test_feature_2 = (
    word_vector.transform(answers) for answers in (train_answer, test_answer)
)

In [7]:
# Word-level TF-IDF for the question title, fitted on all titles (train + test).
word_vector = TfidfVectorizer(
    max_features=80000,
    ngram_range=(1, 2),
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
)
word_vector.fit(all_title)

train_feature_3 = word_vector.transform(train_title)
test_feature_3 = word_vector.transform(test_title)

In [8]:
# Character-level TF-IDF (1- to 4-grams, at most 47000 terms) for the question
# body, fitted on the train and test bodies together.
# `stop_words` is intentionally not passed: scikit-learn applies it only when
# analyzer='word', so with analyzer='char' it had no effect (and newer releases
# warn about it).
character_vector = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=47000)

character_vector.fit(all_question_body)
train_character_feature_1 = character_vector.transform(train_question_body)
test_character_feature_1 = character_vector.transform(test_question_body)

In [9]:
# Character-level TF-IDF (1- to 4-grams, at most 47000 terms) for the answer
# text, fitted on the train and test answers together.
# `stop_words` is intentionally not passed: scikit-learn applies it only when
# analyzer='word', so with analyzer='char' it had no effect (and newer releases
# warn about it).
character_vector = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=47000)

character_vector.fit(all_answer)
train_character_feature_2 = character_vector.transform(train_answer)
test_character_feature_2 = character_vector.transform(test_answer)


In [10]:
# Character-level TF-IDF (1- to 4-grams, at most 47000 terms) for the question
# title, fitted on the train and test titles together.
# `stop_words` is intentionally not passed: scikit-learn applies it only when
# analyzer='word', so with analyzer='char' it had no effect (and newer releases
# warn about it).
character_vector = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=47000)

character_vector.fit(all_title)
train_character_feature_3 = character_vector.transform(train_title)
test_character_feature_3 = character_vector.transform(test_title)

In [11]:
%%time
# Feature set 1 (question side): char + word TF-IDF of the question body,
# followed by char + word TF-IDF of the title.
train_features_1 = hstack([
    train_character_feature_1,
    train_feature_1,
    train_character_feature_3,
    train_feature_3,
])
test_features_1 = hstack([
    test_character_feature_1,
    test_feature_1,
    test_character_feature_3,
    test_feature_3,
])

# Feature set 2 (answer side): char + word TF-IDF of the answer text.
train_features_2 = hstack([
    train_character_feature_2,
    train_feature_2,
])
test_features_2 = hstack([
    test_character_feature_2,
    test_feature_2,
])

Wall time: 314 ms


In [12]:
# Convert both training matrices to CSR; train_predict selects rows from them
# with the fold index arrays.
train_features_1, train_features_2 = (
    stacked.tocsr() for stacked in (train_features_1, train_features_2)
)

In [13]:
# Ridge regularisation strength (`alpha`) used for each target column.
alpha_value = {
    # question targets
    'question_asker_intent_understanding': 40,
    'question_body_critical': 7,
    'question_conversational': 35,
    'question_expect_short_answer': 65,
    'question_fact_seeking': 10,
    'question_has_commonly_accepted_answer': 25,
    'question_interestingness_others': 50,
    'question_interestingness_self': 30,
    'question_multi_intent': 7,
    'question_not_really_a_question': 55,
    'question_opinion_seeking': 15,
    'question_type_choice': 4,
    'question_type_compare': 30,
    'question_type_consequence': 45,
    'question_type_definition': 60,
    'question_type_entity': 11,
    'question_type_instructions': 6,
    'question_type_procedure': 40,
    'question_type_reason_explanation': 13,
    'question_type_spelling': 1,
    'question_well_written': 8,
    # answer targets
    'answer_helpful': 30,
    'answer_level_of_information': 8,
    'answer_plausible': 20,
    'answer_relevance': 60,
    'answer_satisfaction': 11,
    'answer_type_instructions': 3,
    'answer_type_procedure': 25,
    'answer_type_reason_explanation': 3,
    'answer_well_written': 25,
}

In [14]:
def spearman_correlation(y, y_prediction):
    """Spearman rank correlation between targets and predictions.

    Parameters
    ----------
    y : array-like
        True values, 1-D or 2-D (samples x targets).
    y_prediction : array-like
        Predicted values with the same layout as ``y``.

    Returns
    -------
    float
        The correlation coefficient when ``y_prediction`` is not 2-D; otherwise
        the mean of the column-wise coefficients.
    """
    # Coerce to ndarrays so the column slicing below also works for nested
    # lists and pandas objects, not only for numpy arrays.
    y = np.asarray(y)
    y_prediction = np.asarray(y_prediction)

    if y_prediction.ndim != 2:
        return stats.spearmanr(y, y_prediction)[0]

    per_column = [
        stats.spearmanr(y[:, i], y_prediction[:, i])[0]
        for i in range(y.shape[1])
    ]
    return np.mean(per_column)

In [15]:
# Accumulators that train_predict appends to (one entry per fitted target):
# out-of-fold predictions, scaled test predictions, AUC scores, Spearman scores.
train_prediction, test_prediction, final_score, spearman_scores = [], [], [], []

In [18]:
def _scale_to_unit_interval(values):
    """Min-max scale a 1-D array to [0, 1] and return it as a 1-D array."""
    scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
    return scaler.fit_transform(values.reshape(-1, 1)).flatten()


def train_predict(split_num, train_feature, test_feature, train_prediction, test_prediction_final, final_score,
                  spearman_scores, target_categories=None):
    """Fit one Ridge model per target and write blended test predictions to ``sample_submission``.

    For every target the function runs a K-fold loop to obtain out-of-fold
    predictions and a fold-averaged test prediction, refits on all training
    rows, min-max scales both test predictions, and stores a 0.75 / 0.25 blend
    of them in ``sample_submission[target]``.

    Reads the notebook-level names ``train``, ``alpha_value``,
    ``question_category`` and ``sample_submission``.

    Parameters
    ----------
    split_num : int
        Number of K-fold splits.
    train_feature : sparse matrix
        Training features; must support row selection by index array.
    test_feature : sparse matrix
        Test features with the same columns as ``train_feature``.
    train_prediction : list
        Receives the out-of-fold prediction array of each target.
    test_prediction_final : list
        Receives the scaled fold-averaged test prediction of each target.
    final_score : list
        Receives the ROC AUC of the out-of-fold predictions against the
        binarised ``<target>_2`` column.
    spearman_scores : list
        Receives the Spearman correlation of the out-of-fold predictions.
    target_categories : iterable of str, optional
        Target columns to fit. Defaults to ``question_category``, which is what
        the function always used before this parameter existed; pass
        ``answer_categories`` to fit the answer targets.

    Returns
    -------
    None
        Results are delivered through the list arguments and ``sample_submission``.
    """
    if target_categories is None:
        target_categories = question_category

    for category in tqdm(target_categories):
        print(category)
        # Plain ndarray so the positional fold indices below cannot be
        # mistaken for index labels.
        y_train = train[category].values
        # Folds are contiguous and unshuffled, so no random_state is passed:
        # scikit-learn rejects a random_state when shuffle is False.
        k_fold = KFold(n_splits=split_num)
        train_oof = np.zeros((train_feature.shape[0],))
        test_prediction = np.zeros((test_feature.shape[0],))

        for train_index, validation_index in k_fold.split(train_feature):
            model = Ridge(alpha=alpha_value[category])
            model.fit(train_feature[train_index], y_train[train_index])
            train_oof[validation_index] = model.predict(train_feature[validation_index])

            # Average the test predictions of the fold models.
            test_prediction = test_prediction + model.predict(test_feature) / split_num
            gc.collect()

        # A second model trained on every training row.
        model = Ridge(alpha=alpha_value[category])
        model.fit(train_feature, y_train)

        test_prediction = _scale_to_unit_interval(test_prediction)
        prediction = _scale_to_unit_interval(model.predict(test_feature))
        # Blend both predictions; the small offset and divisor map [0, 1]
        # onto [0.000005/1.00001, 1.000005/1.00001], i.e. strictly inside (0, 1).
        sample_submission[category] = (0.75 * test_prediction + 0.25 * prediction + 0.000005) / 1.00001

        temp_spear_score = spearman_correlation(train[category], train_oof)
        spearman_scores.append(temp_spear_score)
        score = roc_auc_score(train[category + '_2'], train_oof)
        train_prediction.append(train_oof)
        test_prediction_final.append(test_prediction)
        final_score.append(score)
        print("spearman correlation is: ", temp_spear_score)
        print("auc score is: ", score, "\n")

In [19]:
# Fit the targets with feature set 1 (question body + title TF-IDF), using 3 folds.
train_predict(
    split_num=3,
    train_feature=train_features_1,
    test_feature=test_features_1,
    train_prediction=train_prediction,
    test_prediction_final=test_prediction,
    final_score=final_score,
    spearman_scores=spearman_scores,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

question_asker_intent_understanding
spearman correlation is:  0.35331292424692584
auc score is:  0.659554717452909 

question_body_critical
spearman correlation is:  0.6218455419603762
auc score is:  0.805533215832684 

question_conversational
spearman correlation is:  0.3821850506076396
auc score is:  0.8775197700624942 

question_expect_short_answer
spearman correlation is:  0.2300177096685173
auc score is:  0.6386376647402069 

question_fact_seeking
spearman correlation is:  0.310753113738873
auc score is:  0.6965229625784336 

question_has_commonly_accepted_answer
spearman correlation is:  0.4035259601169744
auc score is:  0.7975492777843762 

question_interestingness_others
spearman correlation is:  0.3470683577885911
auc score is:  0.6464745987421872 

question_interestingness_self
spearman correlation is:  0.48195071070194473
auc score is:  0.7692451918204427 

question_multi_intent
spearman correlation is:  0.47975368135113466
auc score is:  0.7762200511183852 

question_not_re

In [20]:
# Run the same procedure with feature set 2 (answer TF-IDF), using 3 folds.
# NOTE(review): this call does not select the answer targets. The loop inside
# train_predict runs over `question_category`, so the question_* columns of
# sample_submission written by the previous call are overwritten with
# predictions from the answer features, and the answer_* columns keep the
# values they had in sample_submission.csv (see the table displayed below).
train_predict(
    split_num=3,
    train_feature=train_features_2,
    test_feature=test_features_2,
    train_prediction=train_prediction,
    test_prediction_final=test_prediction,
    final_score=final_score,
    spearman_scores=spearman_scores,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

question_asker_intent_understanding
spearman correlation is:  0.25684271746965787
auc score is:  0.6218203033838974 

question_body_critical
spearman correlation is:  0.32505508961438123
auc score is:  0.6503171611359806 

question_conversational
spearman correlation is:  0.33598616615259763
auc score is:  0.8236061366563697 

question_expect_short_answer
spearman correlation is:  0.14590137855336466
auc score is:  0.5931831408986821 

question_fact_seeking
spearman correlation is:  0.2198318871996968
auc score is:  0.6451132850749076 

question_has_commonly_accepted_answer
spearman correlation is:  0.3283066398037159
auc score is:  0.7403111051924582 

question_interestingness_others
spearman correlation is:  0.2812832744446725
auc score is:  0.6281743665604292 

question_interestingness_self
spearman correlation is:  0.40647857673501303
auc score is:  0.7281624729785264 

question_multi_intent
spearman correlation is:  0.2537903942473625
auc score is:  0.6450557113437382 

question_n

In [21]:
# Averages over every score appended so far; both train_predict calls above
# append to the same two lists, so each mean covers both runs.
print("Mean auc:", np.asarray(final_score).mean())
print("Mean spearman_scores", np.asarray(spearman_scores).mean())

Mean auc: 0.7495199825046382
Mean spearman_scores 0.3286759853835606


In [23]:
# Show the submission frame as the cell output (rows and columns are elided in
# the rendered table). Columns that train_predict did not write still hold the
# values loaded from sample_submission.csv.
sample_submission

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.639511,0.742063,0.403959,0.741763,0.483035,0.574780,0.613302,0.562729,0.378547,...,0.679429,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308
1,46,0.329168,0.345632,0.151589,0.707135,0.548457,0.732407,0.148123,0.258102,0.257806,...,0.236659,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448
2,70,0.752663,0.617670,0.334349,0.828120,0.652153,0.646873,0.583643,0.546601,0.326753,...,0.630895,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673
3,132,0.522094,0.270853,0.220435,0.684185,0.699513,0.696654,0.462767,0.365461,0.440701,...,0.400031,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401
4,200,0.894064,0.712503,0.702298,0.561307,0.292910,0.247569,0.777124,0.774520,0.777727,...,0.829298,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,9569,0.453020,0.494239,0.291751,0.759304,0.609025,0.697798,0.385239,0.398367,0.425871,...,0.440542,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159
472,9590,0.300024,0.391158,0.140705,0.792048,0.585552,0.797036,0.160711,0.220027,0.219446,...,0.292716,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355
473,9597,0.567227,0.590521,0.334181,0.769052,0.544018,0.563717,0.533742,0.474279,0.361267,...,0.430876,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467
474,9623,0.749677,0.650162,0.561326,0.575795,0.452625,0.380087,0.743543,0.640820,0.605965,...,0.715712,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720
