In [1]:
import numpy as np
import pandas as pd
import re

train = pd.read_csv("data/train.csv")

In [2]:
def decontract(text):
    text = re.sub(r"(W|w)on(\'|\’)t ", "will not ", text)
    text = re.sub(r"(C|c)an(\'|\’)t ", "can not ", text)
    text = re.sub(r"(Y|y)(\'|\’)all ", "you all ", text)
    text = re.sub(r"(Y|y)a(\'|\’)ll ", "you all ", text)
    text = re.sub(r"(I|i)(\'|\’)m ", "i am ", text)
    text = re.sub(r"(I|i)sn(\'|\’)t ", "is not ", text)
    text = re.sub(r"n(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)re ", " are ", text)
    text = re.sub(r"(\'|\’)d ", " would ", text)
    text = re.sub(r"(\'|\’)ll ", " will ", text)
    text = re.sub(r"(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)ve ", " have ", text)
    return text

def clean_apostrophes(x):
    apostrophes = ["’", "‘", "´", "`"]
    for s in apostrophes:
        x = re.sub(s, "'", x)
    return x

def clean_text(x):
    x = str(x)
    
    x = x.lower()
    x = clean_apostrophes(x)
    x = decontract(x)
    
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

train["question_title"] = train["question_title"].apply(lambda x: clean_text(x))
train["question_body"] = train["question_body"].apply(lambda x: clean_text(x))
train["answer"] = train["answer"].apply(lambda x: clean_text(x))


In [3]:
def convert_continuous_to_category(train_labels, threshold=0.5):
    converted = []
    for idx, row in train_labels.iterrows():
        labels = ",".join([col for col in train_labels.columns if row[col] > threshold])
        converted.append(labels)
    return converted

In [4]:
def merge_title_body_answer(dataframe):
    formatter = "{qt}\n{qb}\n{qa}"
    return [formatter.format(qt=row["question_title"], qb=row["question_body"], qa=row["answer"])
           for i, row in dataframe.iterrows()]

In [5]:
train_text = pd.DataFrame({"qa_id":train["qa_id"]})

In [6]:
train_text.insert(len(train_text.columns), "text", merge_title_body_answer(train))
# train_text.insert(len(train_text.columns), "labels", convert_continuous_to_category(train[train.columns[11:]]))

In [7]:
train_text[train.columns[11:]] = train[train.columns[11:]]

In [8]:
train_text

Unnamed: 0,qa_id,text,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,what am i losing when using extension tubes in...,1.000000,0.333333,0.000000,0.000000,0.000000,0.0,1.000000,1.000000,...,1.000000,1.000000,0.666667,1.000000,1.000000,0.800000,1.000000,0.000000,0.000000,1.000000
1,1,what is the distinction between a city and a s...,1.000000,1.000000,0.000000,0.500000,1.000000,1.0,0.444444,0.444444,...,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.000000,0.000000,0.666667,0.888889
2,2,maximum protusion length for through hole comp...,0.888889,0.666667,0.000000,1.000000,1.000000,1.0,0.666667,0.444444,...,0.777778,0.777778,0.555556,1.000000,1.000000,0.666667,0.000000,0.333333,1.000000,0.888889
3,3,can an affidavit be used in beit din\nan affid...,0.888889,0.666667,0.666667,1.000000,1.000000,1.0,0.444444,0.444444,...,0.888889,0.833333,0.333333,0.833333,1.000000,0.800000,0.000000,0.000000,1.000000,1.000000
4,5,how do you make a binary image in photoshop\ni...,1.000000,0.666667,0.000000,1.000000,1.000000,1.0,0.666667,0.666667,...,1.000000,1.000000,0.666667,1.000000,1.000000,0.800000,1.000000,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6074,9642,using a ski helmet for winter biking\ni am cur...,1.000000,0.777778,0.000000,1.000000,1.000000,1.0,0.888889,0.888889,...,0.777778,1.000000,0.555556,1.000000,1.000000,0.866667,0.000000,0.000000,0.000000,0.888889
6075,9643,adjustment to road bike brakes for high grade ...,1.000000,0.777778,0.000000,1.000000,0.666667,0.5,0.777778,0.777778,...,0.777778,0.888889,0.555556,1.000000,1.000000,0.733333,0.666667,0.333333,0.000000,0.888889
6076,9645,suppress file truncated messages when using ...,0.888889,0.555556,0.000000,0.666667,0.333333,1.0,0.444444,0.333333,...,0.444444,0.888889,0.555556,0.888889,0.888889,0.800000,1.000000,0.000000,0.333333,0.555556
6077,9646,when should a supervisor be a co author\nwhat ...,1.000000,0.444444,0.333333,0.000000,0.000000,0.0,0.777778,0.555556,...,1.000000,0.888889,0.555556,1.000000,1.000000,0.533333,0.000000,0.333333,0.666667,1.000000


In [9]:
train_text.to_csv("data/train_processed.csv", index=False)

In [10]:
test = pd.read_csv("data/test.csv")
test_processed = pd.DataFrame({"qa_id": test["qa_id"]})
test_processed.insert(len(test_processed.columns), "text", merge_title_body_answer(test))
test_processed.to_csv("data/test_processed.csv", index=False)

In [20]:
lens  = np.unique([len(x["text"]) for i, x in train_text.iterrows()], return_counts=True)

In [22]:
lens

(array([   90,   154,   171, ..., 17122, 17307, 19338]),
 array([1, 1, 1, ..., 1, 1, 1]))

In [26]:
np.mean(lens[0])

2184.183523354174