In [1]:
from pathlib import Path
from models import randomize
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from utils import root_path, submit
%matplotlib inline


## Data description
The data for this competition includes questions and answers from various StackExchange properties. Your task is to predict target values of 30 labels for each question-answer pair.

The list of 30 target labels is the same as the column names in the `sample_submission.csv` file. Target labels with the prefix `question_` relate to the `question_title` and/or `question_body` features in the data. Target labels with the prefix `answer_` relate to the `answer` feature.

Each row contains a single question and a single answer to that question, along with additional features. The training data contains rows with some duplicated questions (but with different answers). The test data does not contain any duplicated questions.

This is not a binary prediction challenge. Target labels are aggregated from multiple raters, and can have continuous values in the range [0,1]. Therefore, predictions must also be in that range.

Since this is a synchronous re-run competition, you only have access to the Public test set. For planning purposes, the re-run test set is no larger than 10,000 rows, and less than 8 MB uncompressed.

Additional information about the labels and collection method will be provided by the competition sponsor in the forum.


In [2]:
# Derive the competition name from the directory this notebook runs in.
# Path.cwd() is the pure-Python equivalent of the `%pwd` magic, so this cell
# keeps working when the notebook is exported to a plain .py script, and the
# value is a Path from the start instead of a string that is re-wrapped.
current_path = Path.cwd()
# The notebook's directory name doubles as the competition identifier used
# to locate the data files and to submit.
competition = current_path.name
competition

'google-quest-challenge'

In [3]:
# Read the competition's sample submission; its columns define the layout
# the final submission file has to follow.
sample_submission_csv = root_path / 'data' / competition / 'sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_csv)
sample_submission

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,...,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308
1,46,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,...,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448
2,70,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,...,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673
3,132,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,...,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401
4,200,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,...,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,9569,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,...,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159,0.99159
472,9590,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,...,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355,0.99355
473,9597,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,...,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467,0.99467
474,9623,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,...,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720,0.99720


In [4]:
# Column labels of the sample submission; used later as the reference when
# validating the model output before submitting.
required_output_cols = sample_submission.columns

In [5]:
# Load the test set (the question/answer pairs to predict on) from this
# competition's data directory.
competition_data_dir = root_path / 'data' / competition
test = pd.read_csv(competition_data_dir / 'test.csv')
test

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,Will leaving corpses lying around upset my pri...,I see questions/information online about how t...,Dylan,https://gaming.stackexchange.com/users/64471,There is no consequence for leaving corpses an...,Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,Url link to feature image in the portfolio,I am new to Wordpress. i have issue with Featu...,Anu,https://wordpress.stackexchange.com/users/72927,I think it is possible with custom fields.\n\n...,Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"Is accuracy, recoil or bullet spread affected ...","To experiment I started a bot game, toggled in...",Konsta,https://gaming.stackexchange.com/users/37545,You do not have armour in the screenshots. Thi...,Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,Suddenly got an I/O error from my external HDD,I have used my Raspberry Pi as a torrent-serve...,robbannn,https://raspberrypi.stackexchange.com/users/17341,Your Western Digital hard drive is disappearin...,HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,Passenger Name - Flight Booking Passenger only...,I have bought Delhi-London return flights for ...,Amit,https://travel.stackexchange.com/users/29089,I called two persons who work for Saudia (tick...,Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com
...,...,...,...,...,...,...,...,...,...,...,...
471,9569,change gb4e enumeration in only one chapter,I am using gb4e for glossing examples in my th...,ToddAO,https://tex.stackexchange.com/users/28332,You can simply reset the exx counter just afte...,karlkoeller,https://tex.stackexchange.com/users/27635,http://tex.stackexchange.com/questions/165120/...,TECHNOLOGY,tex.stackexchange.com
472,9590,All anagrams in a File,Source : Microsoft Interview Question\n\nWe ar...,Spandan,https://stackoverflow.com/users/2426101,Slightly different approach from the one above...,Algorithmatic,https://stackoverflow.com/users/1122229,http://stackoverflow.com/questions/16872513/al...,STACKOVERFLOW,stackoverflow.com
473,9597,SQL Server: Time Series Table Design,I am creating a time series database that foll...,Jay Michael,https://dba.stackexchange.com/users/47001,A foreign key can reference a unique constrain...,mustaccio,https://dba.stackexchange.com/users/23721,http://dba.stackexchange.com/questions/105287/...,TECHNOLOGY,dba.stackexchange.com
474,9623,Is a countered flashback spell exiled?,If I flashback Past in Flames and it is counte...,BolasStone,https://boardgames.stackexchange.com/users/9981,If you counter a spell cast for its Flashback ...,murgatroid99,https://boardgames.stackexchange.com/users/2537,http://boardgames.stackexchange.com/questions/...,CULTURE,boardgames.stackexchange.com


Apply the model, get the result, and save it to a CSV file (e.g. `submit.csv`).

In [6]:
# Choose the model to run; `randomize` (imported from models) is the one
# wired in here. Swap this assignment to try a different model.
# NOTE(review): the name and the displayed values suggest the predictions are
# random — if so, consider seeding for reproducibility; confirm against
# models.randomize.
fancy_model = randomize
# The model is called with the test frame; its result is displayed below and
# reused by the validation and save cells.
res = fancy_model(test)
res


Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.420991,0.191689,0.233118,0.689187,0.315511,0.629552,0.111566,0.413336,0.775988,...,0.862452,0.911884,0.837652,0.953939,0.490797,0.733951,0.264721,0.723302,0.394963,0.977947
1,46,0.141090,0.229128,0.546087,0.725685,0.102380,0.040920,0.937110,0.697550,0.952831,...,0.377727,0.264327,0.396338,0.959797,0.566860,0.454481,0.585681,0.923369,0.275434,0.323898
2,70,0.452933,0.772471,0.683988,0.210053,0.069751,0.029719,0.785332,0.994157,0.497367,...,0.597127,0.032976,0.359027,0.850592,0.806276,0.677763,0.191349,0.226455,0.156682,0.199483
3,132,0.191591,0.598225,0.553295,0.050814,0.661722,0.997062,0.001917,0.967233,0.588380,...,0.554013,0.061253,0.645580,0.093666,0.475542,0.735587,0.610610,0.604145,0.419217,0.948448
4,200,0.132151,0.201762,0.501821,0.185154,0.480535,0.809798,0.870019,0.626679,0.018449,...,0.551334,0.064291,0.388030,0.802857,0.827039,0.157977,0.769698,0.148364,0.666869,0.232333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,9569,0.326766,0.372678,0.412933,0.366119,0.658064,0.843966,0.913831,0.261182,0.812788,...,0.654445,0.273635,0.647967,0.893476,0.884335,0.923475,0.804289,0.966138,0.944204,0.831074
472,9590,0.631300,0.066578,0.828961,0.753578,0.499143,0.469609,0.794885,0.080574,0.185392,...,0.541153,0.540704,0.925795,0.658081,0.344839,0.649386,0.483351,0.555215,0.683917,0.549085
473,9597,0.375078,0.263373,0.521021,0.500838,0.781321,0.212255,0.525739,0.749885,0.060289,...,0.593649,0.082381,0.504240,0.307926,0.428977,0.395573,0.688909,0.520987,0.598065,0.091502
474,9623,0.206661,0.746394,0.308640,0.122427,0.944730,0.525714,0.124363,0.621815,0.303351,...,0.907066,0.344855,0.944256,0.662608,0.007933,0.432228,0.352409,0.874081,0.426097,0.810868


In [7]:
# Check if outputs are as expected before submit.
# The result must carry exactly the sample-submission columns. A subset check
# alone (every res column is a required column) would let a result that is
# MISSING label columns through, so compare in both directions.
missing_cols = set(required_output_cols) - set(res.columns)
unexpected_cols = set(res.columns) - set(required_output_cols)
assert not missing_cols, f'res is missing required columns: {sorted(missing_cols)}'
assert not unexpected_cols, f'res has unexpected columns: {sorted(unexpected_cols)}'

# Predictions must lie in [0, 1] (see the data description). Comparisons with
# NaN evaluate to False, so missing predictions fail this check as well.
label_cols = [col for col in required_output_cols if col != 'qa_id']
in_unit_range = (res[label_cols] >= 0) & (res[label_cols] <= 1)
assert in_unit_range.to_numpy().all(), 'predictions must be within [0, 1]'

Save to a CSV file

In [8]:
# Write the predictions next to this notebook, leaving out the DataFrame
# index so only the submission columns end up in the file.
submission_file = current_path / 'submit.csv'
res.to_csv(submission_file, index=False)

Submit to Kaggle

In [9]:
# Hand the competition name to the project's `submit` helper (from utils).
# NOTE(review): presumably this uploads the submit.csv written above to
# Kaggle — confirm against utils.submit.
submit(competition)

