In [None]:
from pathlib import Path

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline



## Data description
The data for this competition includes questions and answers from various StackExchange properties. Your task is to predict target values of 30 labels for each question-answer pair.

The 30 target labels are the same as the column names in the `sample_submission.csv` file. Target labels with the prefix `question_` relate to the `question_title` and/or `question_body` features in the data. Target labels with the prefix `answer_` relate to the `answer` feature.

Each row contains a single question and a single answer to that question, along with additional features. The training data contains rows with some duplicated questions (but with different answers). The test data does not contain any duplicated questions.

This is not a binary prediction challenge. Target labels are aggregated from multiple raters, and can have continuous values in the range [0,1]. Therefore, predictions must also be in that range.

Since this is a synchronous re-run competition, you only have access to the Public test set. For planning purposes, the re-run test set is no larger than 10,000 rows and is less than 8 MB uncompressed.

Additional information about the labels and collection method will be provided by the competition sponsor in the forum.

In [None]:
# Default figure size (width, height) for every plot that does not set its own.
plt.rcParams.update({'figure.figsize': (20, 10)})

In [None]:
# Resolve project paths relative to the kernel's working directory.
# Path.cwd() replaces the IPython-only `%pwd` magic, so the cell is plain Python.
current_path = Path.cwd()
# The data directory is looked up under the parent of the working directory.
root_path = current_path.parent

In [None]:
# Load the training set from <root>/data/google-quest-challenge/train.csv.
train_csv_path = root_path / 'data' / 'google-quest-challenge' / 'train.csv'
train = pd.read_csv(train_csv_path)

# Show the dimensions and a short preview rather than rendering the whole frame.
print(train.shape)
train.head()

In [None]:
# Take the last 30 columns of `train` (this notebook treats them as the target
# labels) and split them by name prefix into question- and answer-level lists.
last_30_cols = train.columns[-30:]
question_labels = list(filter(lambda name: name.startswith('question_'), last_30_cols))
answer_labels = list(filter(lambda name: name.startswith('answer_'), last_30_cols))
print(question_labels)
print(answer_labels)

In [None]:
# Grid of count histograms, one panel per column in `last_30_cols`.
# All panels share the same 0..1 bin edges and the same axis limits so the
# distributions can be compared at a glance.
sns.set()
fig, axes = plt.subplots(6, 5, figsize=(18, 15))
axes = axes.ravel()
bins = np.linspace(0, 1, 20)

# Upper y-limit = number of rows, i.e. a bar reaching the top means every row
# fell into one bin. This replaces a hard-coded 6079, which is assumed to be
# the row count of train.csv -- confirm against `train.shape`.
n_rows = len(train)

for ax, col in zip(axes, last_30_cols):
    # histplot is the supported replacement for the deprecated distplot;
    # with the default stat it draws raw counts, as distplot(kde=False) did.
    sns.histplot(train[col], bins=bins, ax=ax)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, n_rows])

fig.tight_layout()
plt.show()
plt.close(fig)

In [None]:
# Summary statistics for the columns listed in `question_labels`.
train.loc[:, question_labels].describe()


In [None]:
# One density-normalised histogram per column in `answer_labels`, each on its
# own figure. histplot(stat='density') replaces the deprecated
# distplot(kde=False, norm_hist=True); bin selection is left to histplot's
# default rule, which may differ slightly from distplot's.
for this_label in answer_labels:
    fig, ax = plt.subplots()
    sns.histplot(train[this_label], stat='density', ax=ax)
    plt.show()
    # Close each figure once shown so figures do not accumulate in memory.
    plt.close(fig)

In [None]:
# Summary statistics for the columns listed in `answer_labels`.
train.loc[:, answer_labels].describe()



In [None]:
# Horizontal bar chart of how many rows each `host` value has, sorted by
# ascending count. The tall default figure size is set globally first.
plt.rcParams.update({'figure.figsize': (15, 30)})
host_counts = train['host'].value_counts(ascending=True)
host_counts.plot.barh()

In [None]:
# Horizontal bar chart of how many rows each `category` value has, sorted by
# ascending count. The tall default figure size is set globally first.
plt.rcParams.update({'figure.figsize': (15, 30)})
category_counts = train['category'].value_counts(ascending=True)
category_counts.plot.barh()
