In [1]:
import re

import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 0)

In [None]:
train_df = pd.read_csv('train_dataset.csv', encoding='utf-8', header=0)
print(train_df.shape)
train_df.head()

In [2]:
test_df = pd.read_csv('test_dataset.csv', encoding='utf-8', header=0)
print(test_df.shape)
test_df.head()

(2479, 2)


Unnamed: 0,ID,query
0,1448,definition of a compound
1,1199,What does safe mode on Kindle Fire mean for amazon?
2,530,How do you cancel a gift card reload on Amazon?
3,2453,who is the santa clara county supervisor for sunnyvale ca
4,1374,caffeine effects on heart


### 1. preprocessing

- missing values
- [] and url

In [None]:
# check missing values
print(train_df.isna().sum())

# train_df.dropna(inplace=True)
# print("------------nan rows removed!-------------")

# print(train_df.isna().sum())

In [3]:
# replace URL

def clean_text(sample):
    
    """Remove URLs from a sample string"""
    sample = re.sub(r"\(http\S+\)", " url", sample)
    sample = re.sub(r"\(HTTP\S+\)", " url", sample)
    sample = re.sub(r"\[http\S+\]", " url", sample)
    sample = re.sub(r"\[HTTP\S+\]", " url", sample)
    sample = re.sub(r"\(www\S+\)", " url", sample)
    sample = re.sub(r"\(WWW\S+\)", " url", sample)
    sample = re.sub(r"\[www\S+\]", " url", sample)
    sample = re.sub(r"\[WWW\S+\]", " url", sample)
    
    """Remove \n from a sample string"""
    sample = re.sub(r'\n', ' ', sample)
    sample = re.sub(r'•', ' ', sample)
    sample = re.sub(r'\[', ' ', sample)
    sample = re.sub(r'\]', ' ', sample)
    sample = re.sub(r'\<', ' ', sample)
    sample = re.sub(r'\>', ' ', sample)
    
    return sample


In [None]:
train_df['cleaned_document'] = train_df.document.map(clean_text)
train_df['cleaned_query'] = None
train_df['cleaned_query'] = train_df['query']
train_df['cleaned_query'] = train_df.cleaned_query.map(clean_text)
train_df = train_df[['cleaned_query', 'cleaned_document', 'label']].copy()
train_df.head()

In [None]:
train_df.sample(20)

In [None]:
# plot label distribution
train_df.query(f'label == 1').shape
train_df.query(f'label == 0').shape

In [None]:
# train_df random 75/25 split
from sklearn.model_selection import train_test_split

seed = 42
test_fraction = 0.15
valid_fraction = 0.15

split_fraction = test_fraction + valid_fraction

# get train, test, and validation splits
df_train, df_test_valid, _, _ = train_test_split(
    train_df, train_df, test_size=split_fraction, random_state=seed
)

df_train.shape

In [None]:
joint = np.int64(test_fraction / split_fraction * len(df_test_valid))
test_df = pd.DataFrame.copy(df_test_valid[:joint])
valid_df = pd.DataFrame.copy(df_test_valid[joint:])

### n. bert base implementation (nlp2 + snli)
https://d2l.ai/chapter_natural-language-processing-applications/natural-language-inference-bert.html#fine-tuning-bert

pretrained: 

fine-tuned: 

In [4]:
!pip install -q --upgrade pip
!pip install -q git+https://github.com/d2l-ai/d2l-en # installing d2
!pip install -q -U mxnet-cu101mkl==1.6.0.post0  # updating mxnet to at least v1.6

In [5]:
from collections import defaultdict
import numpy
numpy.random.seed(123)
import json
import multiprocessing

from d2l import mxnet as d2l

import mxnet as mx
from mxnet import autograd, gluon, init, np, npx
from mxnet.gluon import nn
npx.set_np()

import os
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
d2l.try_all_gpus()
d2l.try_gpu()

gpu(0)

In [7]:
d2l.DATA_HUB['bert.base'] = (d2l.DATA_URL + 'bert.base.zip',
                             '7b3820b35da691042e5d34c0971ac3edbd80d3f4')
d2l.DATA_HUB['bert.small'] = (d2l.DATA_URL + 'bert.small.zip',
                              'a4e718a47137ccd1809c9107ab4f5edd317bae2c')

def load_pretrained_model(pretrained_model, num_hiddens, ffn_num_hiddens,
                          num_heads, num_layers, dropout, max_len, ctx):
    data_dir = d2l.download_extract(pretrained_model)
    # Define an empty vocabulary to load the predefined vocabulary
    vocab = d2l.Vocab([])
    vocab.idx_to_token = json.load(open(os.path.join(data_dir, 'vocab.json')))
    vocab.token_to_idx = {token: idx for idx, token in enumerate(
        vocab.idx_to_token)}
    bert = d2l.BERTModel(len(vocab), num_hiddens, ffn_num_hiddens, num_heads, 
                         num_layers, dropout, max_len)
    # Load pretrained BERT parameters
    bert.load_parameters(os.path.join(data_dir, 'pretrained.params'), ctx=ctx)
    return bert, vocab

ctx = d2l.try_all_gpus()

# Small BERT version 2020-1-13-bertsmall:
bert, vocab = load_pretrained_model(
    'bert.small', num_hiddens=256, ffn_num_hiddens=512, num_heads=4,
    num_layers=2, dropout=0.1, max_len=512, ctx=ctx)


# Small BERT version 2020-1-13-bertsmall12:
# bert, vocab = load_pretrained_model(
#     'bert.small', num_hiddens=256, ffn_num_hiddens=512, num_heads=4,
#     num_layers=12, dropout=0.1, max_len=512, ctx=ctx)

# Base (larger) BERT version. Be aware that this uses a large portion of GPU memory and may produce out of memory errors in most machines. p3 instances should be able to handle it.
# bert, vocab = load_pretrained_model(
#     'bert.base', num_hiddens=768, ffn_num_hiddens=3072, num_heads=12,
#     num_layers=12, dropout=0.1, max_len=512, ctx=ctx)

In [8]:
class AMZNQABERTDataset(gluon.data.Dataset):
    def __init__(self, dataset, max_len, vocab=None):
        
        all_premise_hypothesis_tokens = list()
        question_tokens = [word_tokenize(question.lower()) for question in dataset['cleaned_query'].tolist()]
        context_tokens = [word_tokenize(context.lower()) for context in dataset['cleaned_document'].tolist()]
        all_premise_hypothesis_tokens = list(zip(question_tokens, context_tokens))        
        
#         self.labels = np.array(dataset[2])
        self.labels = np.array(dataset['label'])
        self.vocab = vocab
        self.max_len = max_len
        (self.all_token_ids, self.all_segments,self.valid_lens) = self._preprocess(all_premise_hypothesis_tokens)
        print('read ' + str(len(self.all_token_ids)) + ' examples')

    def _preprocess(self, all_premise_hypothesis_tokens):
        pool = multiprocessing.Pool(4)  # Use 4 worker processes
        out = pool.map(self._mp_worker, all_premise_hypothesis_tokens)
        all_token_ids = [
            token_ids for token_ids, segments, valid_len in out]
        all_segments = [segments for token_ids, segments, valid_len in out]
        valid_lens = [valid_len for token_ids, segments, valid_len in out]
        return (np.array(all_token_ids, dtype='int32'),
                np.array(all_segments, dtype='int32'),
                np.array(valid_lens))

    def _mp_worker(self, premise_hypothesis_tokens):
        p_tokens, h_tokens = premise_hypothesis_tokens
        self._truncate_pair_of_tokens(p_tokens, h_tokens)
        tokens, segments = d2l.get_tokens_and_segments(p_tokens, h_tokens)
        token_ids = self.vocab[tokens] + [self.vocab['<pad>']] \
                             * (self.max_len - len(tokens))
        segments = segments + [0] * (self.max_len - len(segments))
        valid_len = len(tokens)
        return token_ids, segments, valid_len

    def _truncate_pair_of_tokens(self, p_tokens, h_tokens):
        # Reserve slots for '<CLS>', '<SEP>', and '<SEP>' tokens for the BERT
        # input
        while len(p_tokens) + len(h_tokens) > self.max_len - 3:
            if len(p_tokens) > len(h_tokens):
                p_tokens.pop()
            else:
                h_tokens.pop()

    def __getitem__(self, idx):
        return (self.all_token_ids[idx], self.all_segments[idx],
                self.valid_lens[idx]), self.labels[idx]

    def __len__(self):
        return len(self.all_token_ids)

In [None]:
# Reduce `batch_size` if there is an out of memory error. In the original BERT
# model, `max_len` = 512
batch_size, max_len, num_workers = 32, 512, d2l.get_dataloader_workers()
train_set = AMZNQABERTDataset(train_df, max_len, vocab)
test_set = AMZNQABERTDataset(test_df, max_len, vocab)
train_iter = gluon.data.DataLoader(train_set, batch_size, shuffle=True,
                                   num_workers=num_workers)
test_iter = gluon.data.DataLoader(test_set, batch_size,
                                  num_workers=num_workers)

In [9]:
class BERTClassifier(nn.Block):
    def __init__(self, bert):
        super(BERTClassifier, self).__init__()
        self.encoder = bert.encoder
        self.hidden = bert.hidden
        self.output = nn.Dense(2)

    def forward(self, inputs):
        tokens_X, segments_X, valid_lens_x = inputs
        encoded_X = self.encoder(tokens_X, segments_X, valid_lens_x)
        return self.output(self.hidden(encoded_X[:, 0, :]))

In [None]:
net = BERTClassifier(bert)
# net.output.initialize(ctx=devices)
net.output.initialize(ctx=ctx)

In [None]:
lr, num_epochs = 1e-4, 30
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss = gluon.loss.SoftmaxCrossEntropyLoss()
d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, ctx,
               d2l.split_batch_multi_inputs)

# https://d2l.ai/chapter_computer-vision/image-augmentation.html

In [None]:
# write out model artifacts
import pickle
with open('2020-1-13-bertsmall-30epochs.pkl', 'wb') as fh:
    pickle.dump(net, fh)

## Model Inference Section

In [18]:
# available models

model1 = '2020-1-13-bertsmall.pkl'
model2 = '2020-1-13-bertsmall-30epochs.pkl'
model3 = '2020-1-14-bertsmall-30epochs-trainall.pkl'
model4 = '2020-1-14-bertsmall-30epochs-train80.pkl'
model5 = '2020-1-15-bertsmall-30epochs-train80-clean.pkl'

In [19]:
import pickle

with open(model5, 'rb') as fh:
    net_test = pickle.load(fh)

In [None]:
# test_set = AMZNQABERTDataset(test_df, max_len, vocab)

# inference on dev_set

dev_set = AMZNQABERTDataset(valid_df, max_len, vocab)
dev_iter = gluon.data.DataLoader(dev_set, batch_size, num_workers=num_workers)

all_labels = list()
for X, y in dev_iter:
    inputs = [x.as_in_context(ctx[0]) for x in X]
    outputs = [i.as_in_context(ctx[0]) for i in y]
    labels = net_test(inputs)
    all_labels.append(labels.asnumpy())

label_scores = numpy.concatenate(all_labels, axis=0)
label_scores

In [None]:
predictions = [numpy.argmax(label_scores[i]) for i in range(len(label_scores))]
valid_df['scores'] = label_scores.tolist()
valid_df['predicted_label'] = predictions

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

print(" Accuracy: %f, F1: %f" % \
      (accuracy_score(valid_df['label'], valid_df['predicted_label']), f1_score(valid_df['label'], valid_df['predicted_label'])))

cm = confusion_matrix(valid_df['label'], valid_df['predicted_label'])
print(cm)

df_cm = pd.DataFrame(cm, columns=[0,1], index = [0,1])
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (2,2))
# sn.set(font_scale=1)#for label size
sn.heatmap(df_cm, cmap="Blues", annot=True)# font size

In [None]:
predictions2 = list()
for i in range(len(label_scores)):
    label = numpy.argmax(label_scores[i])
#     if abs(label_scores[i][1] - label_scores[i][0]) < 1.5:
#     if abs(label_scores[i][1]) < 0.5:
    if (label == 1) & ((label_scores[i][1]) < 0.5):
        label = 0
    predictions2.append(label)
    
valid_df['predictions2'] = predictions2

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

print(" Accuracy: %f, F1: %f" % \
      (accuracy_score(valid_df['label'], valid_df['predictions2']), f1_score(valid_df['label'], valid_df['predictions2'])))

cm = confusion_matrix(valid_df['label'], valid_df['predictions2'])
print(cm)

In [None]:
valid_df[valid_df['label']!=valid_df['predicted_label']]

### inference on final test set

In [10]:
# available retrived docs

pool1 = 'bm25_test_top_20.csv'
pool2 = 'bm25_test_top_50.csv'
pool3 = 'answerini_top_10_docs.csv'
pool4 = 'answerini_top_50_docs.csv'
pool5 = 'answerini_top_60_docs.csv'

In [86]:
inference_df = pd.read_csv(pool1, encoding='utf-8', header=0)
inference_df.head()
inference_df

Unnamed: 0,ID,query,document
0,1448,definition of a compound,"Compound Definition: A compound is a chemical species that is formed when two or more atoms join together chemically, with covalent or ionic bonds."
1,1448,definition of a compound,A chemical substance may well be defined as any material with a definite chemical composition in an introductory general chemistry textbook. According to this definition a chemical substance can either be a pure chemical element or a pure chemical compound.
2,1448,definition of a compound,"The representative particle of an element is an atom. The representative particle of a molecular compound is a molecule. The representative particle of an ionic compound is its formula unit.One formula unit of an ionic compound is composed of a cation and an anion. Therefore, cations and anions are the representative particles of ionic compounds. Thus, atoms, cations, anions-all are representative particles.he representative particle of an element is an atom. The representative particle of a molecular compound is a molecule. The representative particle of an ionic compound is its formula unit."
3,1448,definition of a compound,"Calcium hydroxide, informally referred to as slaked lime, is a compound created through the mixture of calcium oxide with water. The resultant substance is a white powder or crystal compound with strong alkaline properties."
4,1448,definition of a compound,A compound is a substance formed when two or more chemical elements are chemically bonded together. Two types of chemical bonds common in compounds are covalent bonds and ionic bonds. The elements in any compound are always present in fixed ratios.
...,...,...,...
49575,1809,need to check on return,"Return Your Rental: You can return your rental through Your Account within the Initial Rental Refund Period to receive a full refund. If you return your rental after the Initial Rental Refund Period, you won’t be eligible for a refund of the rental fee. Need to return a Rental after 30 days? Go to [Manage Your Rentals](https://www.amazon.com/gp/rental/your-account?ie=UTF8&ref_=ya_rentals&). To return a rental after the initial rental period: Go to [Manage Your Rentals](https://www.amazon.com/gp/rental/your-account). Select the rental item you wish to return and then select Return rental to print the pre-paid return shipping label. Print out the packaging slip and return shipping label. Package up the textbook you wish to return, including the packaging slip. Note: To avoid incorrect fees, only rentals listed on the same packing slip should be packaged and returned together. Apply the return shipping label and take the shipment to the carrier listed on your return label. Return shipping is free when you use the shipping label provided. Note: To return a rental within the Initial Rental Refund Period, go to [Manage Your Rentals](https://www.amazon.com/gp/rental/your-account?ie=UTF8&ref_=ya_rentals&). If the item is in the same condition as when you received it, you’ll receive a full refund. If the item is damaged during the rental period, additional damage fees may apply. To avoid automatic extension fees, please drop your items off with the carrier on or before the due date. For more information on Rental Return periods, visit [Rentals Terms and Conditions](www.amazon.com/gp/help/customer/display.html?nodeId=201983840)."
49576,1809,need to check on return,"If your mattress is not what was advertised, or you and your mattress are not right for each other, Amazon makes it very easy to organize a return.\n\nIf you wish to return your mattress to Amazon, please follow these steps:\n\n 1.<Log onto your Amazon account>\n 2.<Select “Your Account” and then “Your Orders”>\n 3.<Find your mattress order in the list>\n 4.<Click “Return or replace items” and follow the prompts>\n 5.<Amazon will contact you to schedule a removal and arrange return shipping. Do not worry about repacking the mattress. The Large Items team will be in touch, and they will look after the removal of the mattress.>\n\nIf you need help, the Amazon support number is 1-866-216-1072. Ask for the Large Items team."
49577,1809,need to check on return,"When you return an item, your refund and how your refund is issued may differ based on the condition of the item, how long you've had the item and how the item was purchased.\nWhere's My Refund?\n\nYou can check the payment method that was refunded and the status of your refund in [Your Orders](https://amazon.com/gp/css/order-history).\n\nNote: After the carrier has received your item, it can take up to two weeks for us to receive and process your return. Once we issue your refund, it may take additional time for your funds to be made available in your account by your financial institution."
49578,1809,need to check on return,"Before You Sign Off on Your Delivery: Before you sign off for your delivery, check for damage or for missing items. If the wrong item is delivered or it is damaged upon arrival, notify the carrier immediately and do not accept delivery. Delivery personnel will remove the item from your home and return it to Amazon, and you will be refunded the full amount."


In [87]:
inference_df['label']=0

inference_df['cleaned_document'] = inference_df.document.map(clean_text)
inference_df['cleaned_query'] = inference_df['query']
inference_df['cleaned_query'] = inference_df.cleaned_query.map(clean_text)
inference_df = inference_df[['ID', 'cleaned_query', 'cleaned_document', 'label']].copy()

In [88]:
inference_df.sample(25)

Unnamed: 0,ID,cleaned_query,cleaned_document,label
43257,1348,average lifespan of cat,"Enalapril (EnacardÂ®, VasotecÂ®) is commonly used in the both the dog and cat to treat a range of diseases, including heart and kidney disease. Please note: this article has been provided for informational purposes only.",0
35437,2349,when is the football hall of fame inductee ceremony?,"FCS stands for Football Championship Subdivision and FBS stands for Football Bowl Subdivision. Teams in the FBS, formerly known as Division 1-A, determine their champion via ratings, both human and computer.",0
13442,399,How do I fix my Fire tablet for amazon?,"Resolve Startup Issues on Your Fire Tablet Restart your device to fix a frozen or looping screen during startup. Before restarting, charge your device for at least 30 minutes with an Amazon-branded power adapter. 1.Hold the power button down for 40 seconds or until your device shuts off. If a pop-up appears with the option to restart, keep holding down the power button until it shuts off. 2.After your device is powered off, press the power button for two seconds to turn it back on. Was this information helpful?",0
30233,1215,What is an Amazon coin worth?,Clive Cussler net worth: Clive Cussler is an American adventure novelist and marine archaeologist who has a net worth of Clive Cussler net worth: Clive Cussler is an American adventure novelist and marine archaeologist who has a net worth of,0
46095,477,How do I share Kindle books with family for amazon?,"Kindle purchases are done only through one-click. You need to set this up from your account settings. You cannot add these you tour cart/basket, thats the way Amzon has build this. Its for ease of the buying. You may think twice while buying some items, but not while buying these Kindle books or anything from the Kindle store, you have already made up your mind to buy these. You can gift Kindle content to your freinds and family. They can so, add this to the wishlist. Because for Amazon, there’s no reason to delay a purchase. There is no shipping cost that needs to be calculated for ordering more than one book, so you can either buy it immediately or save it for later in a Wishlist. The shopping cart feature is really only designed for physical items. It helps Amazon keep costs down by shipping them together in one order, especially with Prime members, who get free shipping. As there’s no such savings for them with electronic items, there’s really no reason to group them.",0
14887,2258,what is zone of tolerance in biology,"Oceanic Zones There are four major oceanic zones where plants and animals live in the ocean. The four major zones are intertidal zone, neritic zone, open ocean zone and benthic zone.",0
49337,1296,Why was one of the items I ordered returned without me receiving it or asking for a return?,"About Undeliverable Packages: Occasionally packages are returned to us as undeliverable. When the carrier returns an undeliverable package to us, you will be issued a full refund (including shipping charges). Orders that are returned to us as undeliverable are not able to be re-shipped. If you would still like to purchase items that were undeliverable, you are welcome to place a new order on our website. If you suspect your order cannot be delivered as addressed and you have not received confirmation of its return or refund after 4 weeks from the estimated delivery date, please contact us url.",0
43982,698,I have not received my refund for the item that was returned.,"To_Your_Health SomethingElse Jan '18 Please read this very slowly. This will simulate me talking very slowly so that you understand. 1. They received an item in perfect condition, NOT DAMAGED Awesome. You did a great job packing up the item. 2. Reason for Return Given: No Longer Needed Hence, withhold shipping both ways. 3. I doubt the item they returned was ours originally Since you “doubt” it, you are not 100% certain. 4. I received back an item very different from what they received. Since you are not certain that it was not the same, I am assuming you mean it came back in a different condition than you sent it in. 5. The Holiday and regular return policy requires the Buyer to return the item in the same condition it was received. Since it was not returned in the same condition it was sent in, Amazon will allow you to asses up to a 50% restocking fee. So, once again… All you can do in this instance is withhold shipping both ways and assess a 50% restocking fee. Welcome to selling on Amazon.",0
4682,39,Accidentally returned item and have not received a refund,"When you return an item, your refund and how your refund is issued may differ based on the condition of the item, how long you've had the item and how the item was purchased. Where's My Refund? You can check the payment method that was refunded and the status of your refund in Your Orders url. Note: After the carrier has received your item, it can take up to two weeks for us to receive and process your return. Once we issue your refund, it may take additional time for your funds to be made available in your account by your financial institution.",0
36524,2078,what is a anemone,"The National Football League preseason is the period each year during which NFL teams play several not-for-the-record exhibition games before the actual championship or regular season starts. Beginning with the featured Pro Football Hall of Fame game in early August, five weekends of exhibition games are currently played in the NFL.",0


In [89]:
inference_df.shape

(49580, 4)

In [90]:
batch_size, max_len, num_workers = 32, 512, d2l.get_dataloader_workers()
inference_set = AMZNQABERTDataset(inference_df, max_len, vocab)
inference_iter = gluon.data.DataLoader(inference_set, batch_size, num_workers=num_workers)

read 49580 examples


In [91]:
all_labels = list()
for X, y in inference_iter:
    inputs = [x.as_in_context(ctx[0]) for x in X]
    outputs = [i.as_in_context(ctx[0]) for i in y]
    labels = net_test(inputs)
    all_labels.append(labels.asnumpy())

label_scores = numpy.concatenate(all_labels, axis=0)
label_scores

array([[ 6.1392016, -6.078507 ],
       [ 4.9978805, -5.016429 ],
       [ 6.4244905, -6.1848164],
       ...,
       [ 5.72319  , -5.035    ],
       [ 2.7339687, -1.9119759],
       [ 7.2365627, -6.9398265]], dtype=float32)

In [92]:
processing_df = inference_df.copy()
processing_df['scores'] = label_scores.tolist()

In [106]:
processing_df = inference_df.copy()
processing_df['scores'] = label_scores.tolist()


# basically compare 2 socre, and return the label associated with largest value 
predictions1 = [numpy.argmax(label_scores[i]) for i in range(len(label_scores))]

processing_df['predicted_label_base'] = predictions1
processing_df.head()

Unnamed: 0,ID,cleaned_query,cleaned_document,label,scores,predicted_label_base
0,1448,definition of a compound,"Compound Definition: A compound is a chemical species that is formed when two or more atoms join together chemically, with covalent or ionic bonds.",0,"[6.139201641082764, -6.078506946563721]",0
1,1448,definition of a compound,A chemical substance may well be defined as any material with a definite chemical composition in an introductory general chemistry textbook. According to this definition a chemical substance can either be a pure chemical element or a pure chemical compound.,0,"[4.997880458831787, -5.0164289474487305]",0
2,1448,definition of a compound,"The representative particle of an element is an atom. The representative particle of a molecular compound is a molecule. The representative particle of an ionic compound is its formula unit.One formula unit of an ionic compound is composed of a cation and an anion. Therefore, cations and anions are the representative particles of ionic compounds. Thus, atoms, cations, anions-all are representative particles.he representative particle of an element is an atom. The representative particle of a molecular compound is a molecule. The representative particle of an ionic compound is its formula unit.",0,"[6.424490451812744, -6.184816360473633]",0
3,1448,definition of a compound,"Calcium hydroxide, informally referred to as slaked lime, is a compound created through the mixture of calcium oxide with water. The resultant substance is a white powder or crystal compound with strong alkaline properties.",0,"[5.80796480178833, -5.599922180175781]",0
4,1448,definition of a compound,A compound is a substance formed when two or more chemical elements are chemically bonded together. Two types of chemical bonds common in compounds are covalent bonds and ionic bonds. The elements in any compound are always present in fixed ratios.,0,"[6.603011131286621, -6.480631351470947]",0


In [None]:
######################################### Amazon split #############################################################

In [None]:
test_df = pd.read_csv('test_dataset.csv', encoding='utf-8', header=0)
print(test_df.shape)
test_df.head()

In [64]:
# amazon queries

question_searchfor = ['Amazon','amazon','order','kindle', 'fire stick', 'return', 
                      'refund', 'gift', 'cancel', 'subscription', 'Orders', 'password', 
                      'Kindle', 'shipping','channels','email','Alexa','Fire TV','item', 
                      'membership','book','delivery','payment', 'reviews', 'promo', 'seller', 'REFUND'
                     'package', 'replace', 'credit card', 'ORDER', 'review', 'prime', 'package', 'showtime',
                     'Shipping', 'coupon', 'registry', 'Whole Foods', 'Replacement', 'PAYMENT', 'Prime', 'Order',
                     'charge', 'discount', 'ebt', 'EBT', 'REFUND', 'AMAZON', 'renewal', 'Package', 'claim', 'product',
                     'Refund', 'delivered', 'account', 'credit', 'Email']   


# question_searchfor = ['Amazon','amazon', 'kindle', 'fire stick',
# 'refund', 'gift', 'subscription', 'password','Orders', '\border\b'
# 'Kindle', 'shipping','channels','email','Alexa','Fire TV','item',
# 'membership','book','delivery','payment', 'reviews', 'promo', 'seller', 'REFUND'
# 'package', 'replace', 'credit card', 'review', 'prime', 'package', 'showtime',
# 'Shipping', 'coupon', 'registry', 'Whole Foods', 'Replacement', 'PAYMENT', 'Prime',
# 'charge', 'REFUND', 'AMAZON', 'renewal', 'Package',
# 'Refund', 'delivered', 'account', 'credit', 'Email']

#Amazon Related Questions
amazon_q = test_df[test_df['query'].str.contains('|'.join(question_searchfor))]
print(amazon_q.shape)
amazon_q.sample(20)

amazon_q_id = amazon_q['ID'].tolist()
len(amazon_q['ID'].tolist())


(1472, 2)


1472

In [102]:
# for amazon questions #########################################

amazon_processing_df = processing_df[processing_df['ID'].isin(amazon_q_id)]
print(amazon_processing_df.shape)

(29440, 6)


In [None]:
predictions5 = list()
for index, row in amazon_processing_df.iterrows():
    s = row['scores']
    label = numpy.argmax(s)
    if (label == 1) & ((s[1]) < 2):
        label = 0
    predictions5.append(label)
    
amazon_processing_df['predicted_label4_correction'] = predictions5

column = 'predicted_label4_correction'


amazon_processing_df[amazon_processing_df['predicted_label_base']!=amazon_processing_df[column]]
print(amazon_processing_df[amazon_processing_df['predicted_label_base']!=amazon_processing_df[column]].shape)
amazon_processing_df[amazon_processing_df['predicted_label_base']!=amazon_processing_df[column]].sample(20)

In [103]:
column = 'predicted_label_base'
amazon_result_df = amazon_processing_df.groupby(by="ID")[column].sum().reset_index()
amazon_result_df

Unnamed: 0,ID,predicted_label_base
0,1,9
1,2,2
2,4,2
3,5,11
4,6,10
...,...,...
1467,2480,7
1468,2483,11
1469,2487,15
1470,2489,4


In [104]:
amazon_result_df[amazon_result_df[column]==0]

Unnamed: 0,ID,predicted_label_base
565,584,0
611,630,0
665,685,0
679,699,0
698,719,0
791,819,0
796,824,0
806,834,0
809,837,0
909,943,0


In [105]:
amazon_result_df[column].value_counts()

20    214
19    148
5     104
6     100
7     96 
4     90 
8     86 
9     86 
3     66 
10    62 
2     60 
18    55 
12    46 
11    40 
17    40 
16    39 
1     37 
13    33 
15    29 
14    24 
0     17 
Name: predicted_label_base, dtype: int64

In [95]:
# for non amazon questions #########################################

non_amazon_processing_df = processing_df[~processing_df['ID'].isin(amazon_q_id)]
non_amazon_processing_df

Unnamed: 0,ID,cleaned_query,cleaned_document,label,scores,predicted_label_base
0,1448,definition of a compound,"Compound Definition: A compound is a chemical species that is formed when two or more atoms join together chemically, with covalent or ionic bonds.",0,"[6.139201641082764, -6.078506946563721]",0
1,1448,definition of a compound,A chemical substance may well be defined as any material with a definite chemical composition in an introductory general chemistry textbook. According to this definition a chemical substance can either be a pure chemical element or a pure chemical compound.,0,"[4.997880458831787, -5.0164289474487305]",0
2,1448,definition of a compound,"The representative particle of an element is an atom. The representative particle of a molecular compound is a molecule. The representative particle of an ionic compound is its formula unit.One formula unit of an ionic compound is composed of a cation and an anion. Therefore, cations and anions are the representative particles of ionic compounds. Thus, atoms, cations, anions-all are representative particles.he representative particle of an element is an atom. The representative particle of a molecular compound is a molecule. The representative particle of an ionic compound is its formula unit.",0,"[6.424490451812744, -6.184816360473633]",0
3,1448,definition of a compound,"Calcium hydroxide, informally referred to as slaked lime, is a compound created through the mixture of calcium oxide with water. The resultant substance is a white powder or crystal compound with strong alkaline properties.",0,"[5.80796480178833, -5.599922180175781]",0
4,1448,definition of a compound,A compound is a substance formed when two or more chemical elements are chemically bonded together. Two types of chemical bonds common in compounds are covalent bonds and ionic bonds. The elements in any compound are always present in fixed ratios.,0,"[6.603011131286621, -6.480631351470947]",0
...,...,...,...,...,...,...
49555,2311,what was the nickname of the day the stock market crashed,"El Cajon, CA Real Estate Insights El Cajon is a city in San Diego County, California. Nestled in a valley surrounded by mountains, the city has acquired the nickname of The Big Box.",0,"[-4.512159824371338, 4.87155818939209]",1
49556,2311,what was the nickname of the day the stock market crashed,"Live Cattle Futures Quotes Globex. Market data is delayed by at least 10 minutes. All market data contained within the CME Group website should be considered as a reference only and should not be used as validation against, nor as a complement to, real-time market data feeds.",0,"[6.227260112762451, -6.067958831787109]",0
49557,2311,what was the nickname of the day the stock market crashed,The AMEX or American Stock Exchange was founded in 1921.,0,"[6.116662502288818, -5.810803413391113]",0
49558,2311,what was the nickname of the day the stock market crashed,"For loss and deduction items, which exceed a shareholder's stock basis, the shareholder is allowed to deduct the excess up to the shareholder's basis in loans personally made to the S corporation. Debt basis is computed similarly to stock basis but there are some differences.",0,"[6.2916998863220215, -6.279568672180176]",0


In [96]:
# predictions2 = list()
# for index, row in non_amazon_processing_df.iterrows():
#     s = row['scores']
#     label = numpy.argmax(s)
#     if (label == 1) & ((s[1]) < 0.5):
#         label = 0
#     predictions2.append(label)
    
    
# non_amazon_processing_df['predicted_label1_half_correction'] = predictions2




predictions3 = list()
for index, row in non_amazon_processing_df.iterrows():
    s = row['scores']
    label = numpy.argmax(s)
    if (label == 1) & ((s[1]) < 1):
        label = 0
    predictions3.append(label)
    
non_amazon_processing_df['predicted_label1_correction'] = predictions3




# predictions4 = list()
# for index, row in non_amazon_processing_df.iterrows():
#     s = row['scores']
#     label = numpy.argmax(s)
#     if (label == 1) & ((s[1]) < 2):
#         label = 0
#     predictions4.append(label)
    
# non_amazon_processing_df['predicted_label2_correction'] = predictions4



predictions5 = list()
for index, row in non_amazon_processing_df.iterrows():
    s = row['scores']
    label = numpy.argmax(s)
    if (label == 1) & ((s[1]) < 5):
        label = 0
    predictions5.append(label)
    
non_amazon_processing_df['predicted_label4_correction'] = predictions5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [97]:
# column = 'predicted_label_base'
column = 'predicted_label1_correction'
# column = 'predicted_label2_correction'
# column = 'predicted_label4_correction'


non_amazon_processing_df[non_amazon_processing_df['predicted_label_base']!=non_amazon_processing_df[column]]
print(non_amazon_processing_df[non_amazon_processing_df['predicted_label_base']!=non_amazon_processing_df[column]].shape)
non_amazon_processing_df[non_amazon_processing_df['predicted_label_base']!=non_amazon_processing_df[column]].sample(20)
non_amazon_processing_df[(non_amazon_processing_df['predicted_label_base']==non_amazon_processing_df[column])&(non_amazon_processing_df['predicted_label_base']==1)].sample(20)

(568, 8)


Unnamed: 0,ID,cleaned_query,cleaned_document,label,scores,predicted_label_base,predicted_label1_correction,predicted_label4_correction
6541,2369,where are cranberries grown,"Cranberries offer a range of possible health benefits. A diet with a high proportion of fruits and vegetables has been shown to offer health benefits. In addition, cranberries are a good source of various vitamins and antioxidants. Historically, they have been used by Native Americans as a treatment for bladder and kidney diseases.",0,"[-4.40751838684082, 5.157879829406738]",1,1,1
24113,2284,what state is mystic seaport in?,"Cost of Attending San Francisco State University San Francisco, California In-State Tuition: $6,476 USD Out-of-State Tuition: $17,636 USD Smart Rating: 86 This page focuses on tuition and financial aid details for San Francisco State University. Head to our colleges page to see more detailed information about SF State University such as admissions and demographics, or see our Colleges topic to find the right 4-year university for you.",0,"[-3.46869158744812, 2.689633846282959]",1,1,0
28303,1600,how many military personnel are married,"EEOC Regulations require that employers keep all personnel or employment records for one year. If an employee is involuntarily terminated, his/her personnel records must be retained for one year from the date of termination.",0,"[-2.971374273300171, 3.2106380462646484]",1,1,0
47732,1630,how much is college tuition at cal state la a year,"Annual Total Cost. The total list price for in-state California residents to go to Long Beach City College is $14,182 for the 2014/2015 academic year. Out-of-state students who don't possess California residence can expect a one year cost of $19,702.he net out-of-pocket total cost you end up paying or financing though student loans is known as the net price. The reported Long Beach City College net price for in-state students $5,434* for the 2013/2014 academic year. This net price includes housing and meal expenses.",0,"[-1.5661722421646118, 1.5739619731903076]",1,1,0
49524,1687,how to repaint fridge,I've just installed my Samsung fridge-freezer. How long will it take the ice maker to make ice? Last Update date : 2014.08.05 After you have installed your refrigerator it will take 6 to 12 hours for the fridge-freezer's ice maker to produce ice.,0,"[-3.1271517276763916, 3.7769272327423096]",1,1,0
42802,1620,how much does it cost to apply to take the cpa exam,"Insurance of course will reduce the cost of the eye exam for you, so if you have this insurance it is a bonus. If you are instead looking to have a contact lenses exam at Walmart, the price is actually a bit higher than that of a regular eye exam there.ow that you have the approximate Walmart Vision Center Eye Exam Cost, head to an optometrist today and schedule an eye exam for yourself and/ or any family members.",0,"[-1.9132927656173706, 1.758052110671997]",1,1,0
47979,2382,where is 310 s peoria chicago il,"Broadview, Illinois. Broadview is a village in Cook County, Illinois, 12 miles (19 km) west of downtown Chicago. As of the 2010 census it had a population of 7,932.",0,"[-3.5653445720672607, 4.4895782470703125]",1,1,0
26394,2088,what is a jointing plane,"1. The shipper just used basic ground and they have 6-12 days to deliver it. 2. You had it shipped days before a major holiday so you competing with a million FruitCakes and Turdukens for your package priority. 3. Bad delivery drivers, mostly fedex, tend to mark your package as delivered or out for delivery and just throw it in the back of the truck till they get to your run next week. I've had FedEx mark a package as delivered on Tue. then show up Fri night at 9:30pm to actually deliver it. As for the way the logistics work FedEx, UPS, and USPS all use the same planes so if it needs to take a plane ride sometimes it has to go as close as it can get, then take a truck the rest of the way. Now if it goes by truck all the way you just gotta wait it out as it goes from hub to hub. Whats ironic is that I can mail a priority package from FL to CA and it will arrive by plane in 2-3 days. Now if I ship a package from Miami FL to Gainsville FL it takes a truck the whole way and will take 3-4 days for delivery.",0,"[-3.1617848873138428, 2.882276773452759]",1,1,0
26760,669,I have a dispute on my Credit Card Store of $1,"About Amazon Credit Builder: The Amazon.com Store Card Credit Builder and Amazon Prime Store Card Credit Builder are secured versions of the Amazon.com Store Card and Amazon Prime Store Card respectively. A secured credit card is designed for people who are looking to build or rebuild their credit and is backed by a refundable security deposit made when the account is opened. The deposit is equal to the credit limit. The Amazon.com Store Card Credit Builder and Amazon Prime Store Card Credit Builder are secured versions of the Amazon.com Store Card and Amazon Prime Store Card respectively. The Amazon Credit Builder functions and looks like the Amazon Store Card, with the exception that the credit line on the Amazon Credit Builder card will equal the amount of the refundable security deposit that you must maintain with Synchrony Bank. Continued and responsible use of the secured Amazon Credit Builder can help you build your credit history. As an Amazon Credit Builder cardholder, you will enjoy the same benefits as you would on the Amazon Store Card (promotional financing for cardholders without an eligible Prime membership, and promotional financing or 5% back every day for cardholders with an eligible Prime membership). Learn more url about upgrading to the unsecured Amazon Store Card. Building Credit with Amazon Credit Builder: Synchrony Bank will report to the major credit bureaus whether you make at least the minimum payment due on time every month. Building a history of on-time payments can help build your credit. As a cardholder, you have several tools at hand to help you learn to build your credit through responsible usage. To learn more about using credit responsibly, please visit the Credit Education url page. You can also monitor your credit score and simulate how your credit score could change based on your actions by registering for TransUnion CreditView™ url once your card is open. Difference Between Amazon Credit Builder and Debit or Prepaid Cards: Unlike a debit card or a prepaid card, a secured credit card is an actual credit card. You need to make monthly payments on your balance (ideally the full balance, but at least the minimum payment due each month) by the payment due date. Synchrony Bank reports your credit activity to the major credit bureaus-providing the opportunity to build your credit, with responsible use.",0,"[-3.568798780441284, 4.936910629272461]",1,1,0
47245,1651,how tall is prince,"Average Pomsky Size. On average, you can expect a full grown 50/50 pomsky to be about: A full grown adult pomsky is approximately 12 to 17 inches long, 10 to 15 inches tall (from floor to shoulder blade) and weighs about 20 to 30 pounds. 10 to 15 inches tall (from floor to shoulder blade) 12 to 17 inches long (from chest to rear)",0,"[-0.992820680141449, 1.3611555099487305]",1,1,0


In [98]:
non_amazon_result_df = non_amazon_processing_df.groupby(by="ID")[column].sum().reset_index()
non_amazon_result_df

Unnamed: 0,ID,predicted_label1_correction
0,3,3
1,69,5
2,167,7
3,173,4
4,182,6
...,...,...
1002,2482,6
1003,2484,1
1004,2485,3
1005,2486,1


In [99]:
non_amazon_result_df[non_amazon_result_df[column]==0]

Unnamed: 0,ID,predicted_label1_correction
35,1117,0
42,1308,0
47,1316,0
49,1323,0
94,1373,0
103,1382,0
122,1416,0
144,1439,0
147,1442,0
150,1445,0


In [100]:
non_amazon_result_df[column].value_counts()

4     180
5     161
3     154
6     118
2     114
7     71 
1     64 
8     49 
0     41 
9     28 
10    13 
11    4  
12    4  
15    2  
17    1  
13    1  
16    1  
19    1  
Name: predicted_label1_correction, dtype: int64

In [101]:
non_amazon_output_df = non_amazon_result_df.copy()
non_amazon_output_df.head()

non_amazon_output_df['results'] = non_amazon_output_df[column].apply(lambda x: 1 if x > 0 else x)

non_amazon_output_df.head()

Unnamed: 0,ID,predicted_label1_correction,results
0,3,3,1
1,69,5,1
2,167,7,1
3,173,4,1
4,182,6,1


In [None]:
####################################################################################################################

In [None]:
######################################### Original non split #############################################################

In [32]:
# basically compare 2 socre, and return the label associated with largest value 

# predictions1 = [numpy.argmax(label_scores[i]) for i in range(len(label_scores))]

# processing_df['predicted_label_base'] = predictions1





predictions2 = list()
for i in range(len(label_scores)):
    label = numpy.argmax(label_scores[i])
    if (label == 1) & ((label_scores[i][1]) < 0.5):
        label = 0
    predictions2.append(label)
    
processing_df['predicted_label1_half_correction'] = predictions2




predictions3 = list()
for i in range(len(label_scores)):
    label = numpy.argmax(label_scores[i])
    if (label == 1) & ((label_scores[i][1]) < 1):
        label = 0
    predictions3.append(label)
    
processing_df['predicted_label1_correction'] = predictions3




predictions4 = list()
for i in range(len(label_scores)):
    label = numpy.argmax(label_scores[i])
    if (label == 1) & ((label_scores[i][1]) < 2):
        label = 0
    predictions4.append(label)
    
processing_df['predicted_label2_correction'] = predictions4




predictions5 = list()
for i in range(len(label_scores)):
    label = numpy.argmax(label_scores[i])
    if (label == 1) & ((label_scores[i][1]) < 4):
        label = 0
    predictions5.append(label)
    
processing_df['predicted_label3_correction'] = predictions5




# predictions4 = list()
# for i in range(len(label_scores)):
#     label = numpy.argmax(label_scores[i])
#     if abs(label_scores[i][1] - label_scores[i][0]) < 1:
#         label = 0
#     predictions4.append(label)
    
# processing_df['predicted_label_diff1'] = predictions4


# predictions

In [33]:
processing_df.head()

Unnamed: 0,ID,cleaned_query,cleaned_document,label,scores,predicted_label_base,predicted_label1_half_correction,predicted_label1_correction,predicted_label2_correction,predicted_label3_correction
0,1448,definition of a compound,"Compound Definition: A compound is a chemical species that is formed when two or more atoms join together chemically, with covalent or ionic bonds.",0,"[6.139201641082764, -6.078506946563721]",0,0,0,0,0
1,1448,definition of a compound,A chemical substance may well be defined as any material with a definite chemical composition in an introductory general chemistry textbook. According to this definition a chemical substance can either be a pure chemical element or a pure chemical compound.,0,"[4.997880458831787, -5.0164289474487305]",0,0,0,0,0
2,1448,definition of a compound,"The representative particle of an element is an atom. The representative particle of a molecular compound is a molecule. The representative particle of an ionic compound is its formula unit.One formula unit of an ionic compound is composed of a cation and an anion. Therefore, cations and anions are the representative particles of ionic compounds. Thus, atoms, cations, anions-all are representative particles.he representative particle of an element is an atom. The representative particle of a molecular compound is a molecule. The representative particle of an ionic compound is its formula unit.",0,"[6.424490451812744, -6.184816360473633]",0,0,0,0,0
3,1448,definition of a compound,"Calcium hydroxide, informally referred to as slaked lime, is a compound created through the mixture of calcium oxide with water. The resultant substance is a white powder or crystal compound with strong alkaline properties.",0,"[5.80796480178833, -5.599922180175781]",0,0,0,0,0
4,1448,definition of a compound,A compound is a substance formed when two or more chemical elements are chemically bonded together. Two types of chemical bonds common in compounds are covalent bonds and ionic bonds. The elements in any compound are always present in fixed ratios.,0,"[6.603011131286621, -6.480631351470947]",0,0,0,0,0


In [35]:
processing_df[processing_df['predicted_label_base']!=processing_df['predicted_label3_correction']]
processing_df[processing_df['predicted_label_base']!=processing_df['predicted_label3_correction']].sample(20)

Unnamed: 0,ID,cleaned_query,cleaned_document,label,scores,predicted_label_base,predicted_label1_half_correction,predicted_label1_correction,predicted_label2_correction,predicted_label3_correction
84277,2015,what credit score is a good one,"Some schools offer the ability to take single classes by paying on a per credit hour basis. The reported price per credit hour and estimated cost for one class at University of Michigan Ann Arbor are as follows. Per credit costs are for reference only as many schools will not allow students to pay by credit hour.nnual Total Cost. The total list price for in-state Michigan residents to go to U of M is $24,780 for the 2014/2015 academic year. Out-of-state students who don't possess Michigan residence can expect a one year cost of $53,200.",0,"[-1.7049560546875, 3.0945796966552734]",1,1,1,1,0
71624,1007,My daughter received a charge on her credit card from Amazon that we would like to dispute. She does not have an Amazon account nor is her card on an account.,"Yes, Amazon gift cards also expire. Each Amazon gift card is valid for 365 days from the date of the purchase. You can transfer money of your amazon card to amazon account or you can sell amazon gift card in Nigeria. These are some way to protect the money of your amazon gift cards.",0,"[-0.7601814866065979, 1.6076551675796509]",1,1,1,0,0
69222,835,I returned item\n my refund has not been processed to my gift card,"Of course. You can purchase Amazon gift cards at most of the major grocery, convenience and drug store chains and can use them, exclusively if desired, to pay for your purchases at Amazon. Its also feasible for someone to purchase and send to you a virtual Amazon gift card which can be used by you to make purchases. TurboTax, as yet another example, is partnering with Amazon to allow taxpayers to receive their 2014 tax refund in the form of a virtual Amazon gift card. Anyone taking advantage of that program will have the opportunity to buy things at Amazon without the need to link any other cards or banks accounts.",0,"[-1.8645751476287842, 3.1835851669311523]",1,1,1,1,0
52356,806,I received two packages that I didn't buy. The packages have my name and address,"Online deliveries to an apartment building in northern Manhattan are left with a retired woman in 2H who watches over her neighbors’ packages to make sure nothing gets stolen. Corporate mailrooms in New York and other cities are overwhelmed by employees shipping personal packages to work for safekeeping, leading companies to ban packages and issue warnings that boxes will be intercepted and returned to the senders. A new start-up company is gambling that online shoppers who are worried about not getting their packages will be willing to pay extra to ship them to a home-based network of package receivers in Brooklyn. With online shopping surging and another holiday season unfolding, customers’ mounting frustration and anger over stolen packages are driving many to take creative and even extreme measures to keep items out of the hands of thieves. In New York City, where more orders are delivered than anywhere else in the country, over 90,000 packages a day are stolen or disappear without explanation, up roughly 20 percent from four years ago, according to an analysis conducted for The New York Times. About 15 percent of all deliveries in urban areas fail to reach customers on the first attempt because of package theft and other issues, like deliveries to the wrong house, according to transportation experts.",0,"[0.07803308218717575, 0.9164420366287231]",1,1,0,0,0
60202,2383,where is a saddle in minecraft,"WorldCraft (84%). Worldcraft is another part of the popular Minecraft serie of online games. This world gives you a possibility build various items and shapes from cubes. In this version of the game there are not any dangerous animals so you will have sufficient calm for building anything, you can try to create calmly even a whole new world. Have fun.",0,"[-0.8183138966560364, 1.1249096393585205]",1,1,1,0,0
66963,112,Can I send someone a prepaid shipping label for amazon?,"Restrictions on Prepaid Cards: Amazon.com lets you use prepaid cards for the majority of purchases, but certain restrictions apply. The following restrictions apply to prepaid Visa, MasterCard, and American Express cards: They can't be combined with credit cards on a single order. Amazon.com doesn't support entering the three-digit CVV code normally found on the back of some cards. If the code is required by the issuing bank, payments may not process successfully. Note: Some banks may require registration of prepaid cards by name and billing address. There's a $1 authorization at the time of order to make sure that the payment method is valid. This authorization isn't a charge, but banks may hold the authorized funds as unavailable until the authorization expires.",0,"[-1.4877021312713623, 2.840219020843506]",1,1,1,1,0
25492,349,How do I change my shipping speed on Amazon?,"Enzymes are biological catalysts. Catalysts lower the activation energy for reactions. The lower the activation energy for a reaction, the faster the rate. Thus enzymes speed up reactions by lowering activation energy. Many enzymes change shape when substrates bind. This is termed induced fit, meaning that the precise orientation of the enzyme required for catalytic activity can be induced by the binding of the substrate. Enzymes have active sites.",0,"[-2.3226470947265625, 3.1702425479888916]",1,1,1,1,0
45522,1431,cost sandy hook beach,Siesta-Key.Net also has information on local attractions for Sarasota visitors. See our Community listings and send us your local group for inclusion. Siesta Key beach information is listed here. Siesta Key Florida is the home of the beach with the whitest and finest sand in the world.With information and maps to guide you to finding your place in the sun on the beach with the whitest and finest sand in the world.Check out the Siesta Key accommodation listing for the right place to stay on your next vacation.iesta-Key.Net also has information on local attractions for Sarasota visitors. See our Community listings and send us your local group for inclusion. Siesta Key beach information is listed here. Siesta Key Florida is the home of the beach with the whitest and finest sand in the world.,0,"[-2.8157522678375244, 3.040684700012207]",1,1,1,1,0
83793,752,I need to find out why there is a random $1 gift card balance in my account when I went to check out some items,"First, you will need to link your Discover credit card to your Amazon.com account. Go to the Amazon “Cashback Bonus from Discover” page url and click “Link your rewards account now.” You will see the Amazon Sign In page. Enter your Amazon account information. After you’ve signed in, you’ll be taken to the Enroll a New Card page. Click the “Enroll a New Card” button and fill out your Discover card information. Your Discover credit card is now linked to your Amazon.com account and you’ll be able to use your rewards to pay at checkout. Making a purchase at Amazon.com with the Cashback Bonus of your Discover it® card url is very easy. After you sign into your Amazon.com account, hover over “Account and Lists,” then go to “Shop with Points.” The next page will open all your accounts enrolled with Amazon.com. Click anywhere on the Discover Card line and check “Apply by Default.” You will see the words “Apply by Default” appear under your Cashback Bonus. From now on, Amazon.com will always apply your available Discover it® Cashback Bonus to pay for all or part of your order until you cancel it as the default payment method.",0,"[-1.445206880569458, 1.9637975692749023]",1,1,1,0,0
106649,1639,how much xanax will cause withdrawal,"In adults, umbilical hernia is rare. Excess of pressure put on the abdominal muscles like obesity and multiple pregnancies can cause umbilical hernia. Fluid accumulation in excess on the abdominal cavity and undergoing abdominal surgery can also cause this problem.",0,"[-3.615286111831665, 3.6878349781036377]",1,1,1,1,0


In [36]:
# column = 'predicted_label_base'
# column = 'predicted_label1_correction'
column = 'predicted_label2_correction'
column = 'predicted_label3_correction'

result_df = processing_df.groupby(by="ID")[column].sum().reset_index()
result_df

Unnamed: 0,ID,predicted_label3_correction
0,1,7
1,2,3
2,3,0
3,4,1
4,5,5
...,...,...
2474,2486,2
2475,2487,15
2476,2488,5
2477,2489,0


In [37]:
result_df[result_df[column]==0]

Unnamed: 0,ID,predicted_label3_correction
2,3,0
31,32,0
53,54,0
58,59,0
62,64,0
64,66,0
70,72,0
141,144,0
167,170,0
174,177,0


In [38]:
result_df[column].value_counts()

2     250
0     228
1     214
5     198
3     193
4     188
6     159
7     135
8     97 
9     69 
47    61 
10    56 
46    55 
48    53 
45    50 
11    40 
44    34 
39    34 
43    34 
49    33 
13    25 
12    25 
41    20 
42    19 
14    19 
16    18 
38    16 
40    16 
37    14 
35    13 
17    12 
32    11 
34    10 
15    10 
36    9  
18    8  
33    7  
31    7  
19    6  
26    5  
50    4  
29    4  
22    3  
24    3  
25    3  
28    2  
27    2  
23    2  
21    2  
30    2  
20    1  
Name: predicted_label3_correction, dtype: int64

In [None]:
######################################################################################################

In [39]:
output_df = result_df.copy()
output_df.head()

output_df['results'] = output_df[column].apply(lambda x: 1 if x > 0 else x)

output_df.head()

Unnamed: 0,ID,predicted_label3_correction,results
0,1,7,1
1,2,3,1
2,3,0,0
3,4,1,1
4,5,5,1


In [43]:
output_df[output_df['ID']==2485]

Unnamed: 0,ID,predicted_label3_correction,results
2473,2485,1,1


In [47]:
print(output_df.loc[output_df.ID.isin(amazon_q['ID'].tolist()), 'results'].shape)

output_df.loc[output_df.ID.isin(amazon_q['ID'].tolist()), 'results'].sum()

(1316,)


1207

### viewing some results

In [None]:
# processing_df
processing_df.head()

In [None]:
di_view = pd.merge(processing_df, result_df, on='ID')
di_view.head()

di_view[(di_view[column+'_y']==1) & (di_view[column+'_x']==1)]

## * final overwrite on hank amazon similar and answerable questions

In [None]:
output_df

In [78]:
hank_df = pd.read_csv('hank-submission2.txt', encoding='utf-8', header=0)
hank_df.head()


hank_df[(hank_df.ID.isin(amazon_q['ID'].tolist())) & (hank_df['label'] ==0)]

hank_df[(hank_df.ID.isin(amazon_q['ID'].tolist()))]

Unnamed: 0,ID,label
1,1199,1
2,530,1
5,1716,1
6,944,1
7,249,1
...,...,...
2471,796,1
2472,1166,1
2474,43,1
2475,583,1


In [None]:
test_df = pd.read_csv('test_dataset.csv', encoding='utf-8', header=0)
print(test_df.shape)
test_df.head()

In [None]:
# amazon queries

question_searchfor = ['Amazon','amazon','order','kindle', 'fire stick', 'return', 
                      'refund', 'gift', 'cancel', 'subscription', 'Orders', 'password', 
                      'Kindle', 'shipping','channels','email','Alexa','Fire TV','item', 
                      'membership','book','delivery','payment', 'reviews', 'promo', 'seller', 'REFUND'
                     'package', 'replace', 'credit card', 'ORDER', 'review', 'prime', 'package', 'showtime',
                     'Shipping', 'coupon', 'registry', 'Whole Foods', 'Replacement', 'PAYMENT', 'Prime', 'Order',
                     'charge', 'discount', 'ebt', 'EBT', 'REFUND', 'AMAZON', 'renewal', 'Package', 'claim', 'product',
                     'Refund', 'delivered', 'account', 'credit', 'Email']   



In [40]:
#Amazon Related Questions
amazon_q = test_df[test_df['query'].str.contains('|'.join(question_searchfor))]
print(amazon_q.shape)
amazon_q.sample(20)

NameError: name 'question_searchfor' is not defined

In [None]:
amazon_q_joined = pd.merge(amazon_q, hank_df,on='ID')
amazon_q_joined
amazon_q_joined['label'].sum() # 1472

In [None]:
output_df[output_df['ID'].isin(amazon_q['ID'].tolist())]['results'].sum() # 1467


# overwrite amazon question with label 1

# output_df[output_df['ID'].isin(amazon_q['ID'].tolist())].loc[:,'results'] = 1

output_df.loc[output_df.ID.isin(amazon_q['ID'].tolist()), 'results'] = 1

In [None]:
output_df[output_df['ID'].isin(amazon_q['ID'].tolist())]['results'].sum()

In [None]:
output_df.shape, output_df['results'].sum()

### write out final result

In [79]:
# with amazon question all 1s

na = non_amazon_output_df[['ID', 'results']]

# a = pd.DataFrame()
# a['ID'] = amazon_q['ID']
# a['results'] = 1
# a


a = pd.DataFrame()
a['ID'] = hank_df[(hank_df.ID.isin(amazon_q['ID'].tolist()))]['ID']
a['results'] = hank_df[(hank_df.ID.isin(amazon_q['ID'].tolist()))]['label']
a

Unnamed: 0,ID,results
1,1199,1
2,530,1
5,1716,1
6,944,1
7,249,1
...,...,...
2471,796,1
2472,1166,1
2474,43,1
2475,583,1


In [80]:
_submission = pd.concat([na, a])
print(_submission.shape)
print(_submission['results'].sum())
_submission.head()

(2479, 2)
2462


Unnamed: 0,ID,results
0,3,1
1,69,1
2,167,1
3,173,1
4,182,1


In [81]:
submission = pd.DataFrame()
submission["ID"] = _submission["ID"]
submission["label"] = _submission["results"]

submission.to_csv("hackathon_result_cleaned_split_correction1.csv", encoding='utf-8', index=False)

In [48]:
# original output
submission = pd.DataFrame()
submission["ID"] = output_df["ID"]
submission["label"] = output_df["results"]

submission.to_csv("hackathon_result_cleaned.csv", encoding='utf-8', index=False)