## Doc2Vec

In [1]:
import os
import glob
import pickle
import string
from datetime import datetime
from tqdm import tqdm, notebook

from cleansing import clean_text

import pandas as pd
import numpy as np
import scipy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/andreas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/andreas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Cap DataFrame/Series reprs at 10 rows so displayed frames stay short.
pd.set_option("display.max_rows", 10)

In [3]:
# Register tqdm with pandas; this is what provides the `progress_apply`
# method used on the question columns further down.
tqdm.pandas()

  from pandas import Panel


In [4]:
from nltk.corpus import stopwords

In [5]:
# One entry per cross-validation split folder. glob.glob returns matches in
# arbitrary (filesystem-dependent) order, so sort to make the iteration order,
# and therefore which split is processed first, reproducible across machines.
split_folders = sorted(glob.glob('../data/cross_validation_data/*'))

In [6]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Load a previously saved Doc2Vec model from the relative path "docvec"
# (resolved against the current working directory). It is used below through
# `model.infer_vector` to embed the tokenized questions.
model = Doc2Vec.load("docvec")

In [7]:
def read_csv(path):
    """Read a tab-separated file into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-delimited file.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using pandas' default header handling.
    """
    return pd.read_csv(path, sep='\t')

def remove_row_nan(df):
    """Return a copy of ``df`` without the rows that contain any missing value.

    The input frame is left untouched; the surviving rows keep their
    original index labels.
    """
    return df.dropna(axis="index", how="any")

# Translation table mapping every character of string.punctuation to None,
# i.e. those characters are deleted by str.translate.
table = str.maketrans(dict.fromkeys(string.punctuation))


def remove_punctuation(text):
    """Return ``text`` with all ``string.punctuation`` characters removed."""
    stripped = text.translate(table)
    return stripped

def simple_cleansing(text):
    """Lower-case ``text``, strip punctuation and drop English stop words.

    Parameters
    ----------
    text : str
        Raw input string.

    Returns
    -------
    str
        The remaining whitespace-separated words joined by single spaces.
    """
    # A frozenset gives constant-time membership tests; the original kept the
    # list returned by NLTK and scanned it once per token.
    stop_set = frozenset(stopwords.words('english'))

    # NOTE(review): punctuation is removed before the stop-word filter, so a
    # stop-word entry that contains an apostrophe can no longer match its
    # stripped form -- confirm this is intended.
    normalised = remove_punctuation(text.lower())
    kept_words = [word for word in normalised.split() if word not in stop_set]

    return " ".join(kept_words)

In [8]:
def metrics(y_true, y_pred):
    """Score predictions against ground truth with four sklearn metrics.

    Each scorer is called as ``scorer(y_true, y_pred)`` with its default
    arguments.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1_score"``
        mapped to the corresponding score.
    """
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

In [11]:
# Smoke-test switch: when True, the transform loop below truncates both the
# train and test frames to index labels 0..99 before cleansing.
test = False

In [13]:
# Clean, tokenize and embed the question pairs of one cross-validation split.
# After this cell, `d_train` / `d_test` hold the transformed frames.
score_list = []
for split_index, path in enumerate(split_folders, 1):
    # glob.glob returns matches in arbitrary, filesystem-dependent order, so
    # sort before unpacking; otherwise train and test can be silently swapped.
    # NOTE(review): assumes each split folder holds exactly two files whose
    # names sort as (test, train) -- confirm against the data directory.
    test_path, train_path = sorted(glob.glob(os.path.join(path, '*')))
    print("read data")
    d_train, d_test = read_csv(train_path), read_csv(test_path)
    print("remove nan")
    d_train, d_test = remove_row_nan(d_train), remove_row_nan(d_test)

    ## shuffle row data
    # No random_state is passed, so the row order differs between runs.
    d_train = d_train.sample(frac=1)
    d_test = d_test.sample(frac=1)

    ## reset index
    # `drop` is not set, so the pre-shuffle index is kept as an "index" column.
    d_train.reset_index(inplace=True)
    d_test.reset_index(inplace=True)

    if test:
        # Keep index labels 0..99 only. Copy so the column assignments below
        # write to an independent frame rather than to a slice of the full one.
        d_train = d_train.loc[:99, :].copy()
        d_test = d_test.loc[:99, :].copy()

    # Every stage is applied in the same order as before:
    # train q1, train q2, test q1, test q2.
    frames = (d_train, d_test)

    ## cleansing step
    print("cleansing...")
    for frame in frames:
        frame["q1_clean"] = frame.question1.progress_apply(clean_text)
        frame["q2_clean"] = frame.question2.progress_apply(clean_text)

    print("tokenizing...")
    for frame in frames:
        frame["q1_token"] = frame.q1_clean.progress_apply(word_tokenize)
        frame["q2_token"] = frame.q2_clean.progress_apply(word_tokenize)

    print("transforming...")
    for frame in frames:
        frame["q1_vector"] = frame.q1_token.progress_apply(model.infer_vector)
        frame["q2_vector"] = frame.q2_token.progress_apply(model.infer_vector)

    # Only the first split is processed; the loop stops after one iteration.
    break

read data


  0%|          | 0/323428 [00:00<?, ?it/s]

remove nan
cleansing...


100%|██████████| 323428/323428 [02:24<00:00, 2231.45it/s]
100%|██████████| 323428/323428 [02:27<00:00, 2197.21it/s]
100%|██████████| 80859/80859 [00:36<00:00, 2187.18it/s]
100%|██████████| 80859/80859 [00:37<00:00, 2152.32it/s]
  0%|          | 879/323428 [00:00<00:36, 8779.46it/s]

tokenizing...


100%|██████████| 323428/323428 [00:34<00:00, 9286.98it/s]
100%|██████████| 323428/323428 [00:34<00:00, 9326.24it/s]
100%|██████████| 80859/80859 [00:08<00:00, 9535.36it/s]
100%|██████████| 80859/80859 [00:08<00:00, 9120.02it/s]
  0%|          | 128/323428 [00:00<04:12, 1279.12it/s]

transforming...


100%|██████████| 323428/323428 [03:53<00:00, 1382.20it/s]
100%|██████████| 323428/323428 [03:53<00:00, 1386.81it/s]
100%|██████████| 80859/80859 [00:59<00:00, 1362.98it/s]
100%|██████████| 80859/80859 [00:59<00:00, 1359.20it/s]


In [15]:
# Persist both transformed frames as a single (train, test) tuple. The context
# manager closes (and flushes) the file handle, which the bare open() call
# passed directly to pickle.dump never did.
with open('data_transformed.pkl', 'wb') as pkl_file:
    pickle.dump((d_train, d_test), pkl_file)

In [18]:
# Sanity check: (rows, columns) of the transformed training frame.
d_train.shape

(323428, 13)

In [19]:
# Sanity check: (rows, columns) of the transformed test frame.
d_test.shape

(80859, 13)

In [29]:
# Stack the train rows on top of the test rows to rebuild one combined frame.
d_data = pd.concat([d_train, d_test], axis=0)

In [30]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels
# (drop=True). This mutates d_data in place.
d_data.reset_index(drop=True, inplace=True)

In [31]:
# Persist the combined frame. The context manager closes (and flushes) the
# file handle, which the bare open() call passed to pickle.dump never did.
with open('../data/transformed/data.pkl', 'wb') as pkl_file:
    pickle.dump(d_data, pkl_file)

In [32]:
# Sanity check: (rows, columns) of the combined frame.
d_data.shape

(404287, 13)