In [1]:
from gensim import corpora
from gensim.summarization import bm25
from gensim.summarization.bm25 import get_bm25_weights
import pandas as pd
import re
import matplotlib.pyplot as plt
from rank_bm25 import BM25Okapi
import numpy as np
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### training data

In [2]:
df = pd.read_csv("train_dataset.csv")

In [3]:
df.columns = ["query", "document", "label"]

In [4]:
len(np.unique(df["query"].values))

11000

In [5]:
pd.DataFrame(df.groupby(["query"]).size()).sort_values(0, ascending=False)

Unnamed: 0_level_0,0
query,Unnamed: 1_level_1
I need help with an order please,10
"Hi, • i change the time of an amazon fresh •?",10
My order • telling me it’s undeliverable. I want to cancel it,10
I made a gift contribution on the wrong one and need to get a refund,10
Why is my refund a gift certificate?,10
I can’t cancel,10
Regarding a replacement return.,10
Are you saying I have to cancel and •?,10
return and reund,10
Was i refunded for this item,10


In [6]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
REPLACE_BY_SPACE = re.compile('[(){}\[\]\|@,;]')
BAD_SYMBOLS = re.compile('[^a-zA-Z]')
STOPWORDS  =  set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amznyc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def clean_text(sample):
    
    """Remove URLs from a sample string"""
    sample = re.sub(r"\(http\S+\)", " <url>", sample)
    sample = re.sub(r"\(www\S+\)", " <url>", sample)
    
    """Remove \n from a sample string"""
    sample = re.sub(r'\n', ' ', sample)
    
    if sample == None:
        return None
    if type(sample) == float:
        return None
    sample = sample.lower() 
    sample = REPLACE_BY_SPACE.sub(' ', sample) 
    sample = BAD_SYMBOLS.sub(' ', sample)
    sample = ' '.join(word for word in sample.split() if word not in STOPWORDS)
    
    return sample

#### Use only unique documents

In [8]:
unique_document_df = df[["document"]].drop_duplicates().reset_index(drop=True).reset_index()
unique_document_df.columns = ["doc_num", "document"]

In [9]:
unique_document_df

Unnamed: 0,doc_num,document
0,0,Cancel an Accidental Purchase: You are able to...
1,1,Verify Continuing Student Status: If you're a ...
2,2,Returning Gifts from Your Baby Registry: The e...
3,3,Connect Your Fire Tablet to Wi-Fi\n\nAccess Wi...
4,4,Protect Your System: Some suspicious emails co...
5,5,About Prime Gift Membership Cancellations: If ...
6,6,Track Your Return: You can stay on top of your...
7,7,Customer Return Policy for Kenmore Major Appli...
8,8,Upgrade to 5% Back on Amazon Prime Rewards Vis...
9,9,Return Items You Ordered: You can return many ...


In [10]:
unique_document_map = unique_document_df.set_index("document").to_dict()["doc_num"]
unique_document_map

{"Cancel an Accidental Purchase: You are able to return a Prime Video order within 48 hours if you haven't attempted to watch or download it. Need to cancel an accidental Prime Video order? [Go to Your Orders](https://www.amazon.com/your-orders). Video: Cancel an Accidental Order. To return an accidental or unwanted Prime Video purchase: Go to [Your Orders](https://www.amazon.com/your-orders). Find the accidental order you’d like to cancel. Select Cancel Your Order. Choose a reason for the cancellation from the drop-down menu, and then click Cancel This Purchase. After the cancellation is complete, your refund is issued to the payment method used for the order. The video is also removed from Your Video Library as a part of the refund process. Note: If you have paid through Apple, you must browse to the relevant product page on the website or Prime Video app to see your cancellation options. Related Help Topics [Cancel Your Prime Video Channel Subscription](www.amazon.com/gp/help/custom

In [11]:
unique_document_df["cleaned_document"] = unique_document_df["document"].apply(clean_text)
unique_document_df["cleaned_document_list"] = unique_document_df["cleaned_document"].apply(lambda x: x.split(" "))
"""
texts = np.unique(unique_document_df["cleaned_document_list"].values)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
bm25_obj = bm25.BM25(corpus)
"""

tokenized_corpus = list(unique_document_df["cleaned_document_list"].values)

bm25_obj = BM25Okapi(tokenized_corpus)

In [12]:
unique_document_df

Unnamed: 0,doc_num,document,cleaned_document,cleaned_document_list
0,0,Cancel an Accidental Purchase: You are able to...,cancel accidental purchase able return prime v...,"[cancel, accidental, purchase, able, return, p..."
1,1,Verify Continuing Student Status: If you're a ...,verify continuing student status student recei...,"[verify, continuing, student, status, student,..."
2,2,Returning Gifts from Your Baby Registry: The e...,returning gifts baby registry easiest way retu...,"[returning, gifts, baby, registry, easiest, wa..."
3,3,Connect Your Fire Tablet to Wi-Fi\n\nAccess Wi...,connect fire tablet wi fi access wi fi followi...,"[connect, fire, tablet, wi, fi, access, wi, fi..."
4,4,Protect Your System: Some suspicious emails co...,protect system suspicious emails contain attac...,"[protect, system, suspicious, emails, contain,..."
5,5,About Prime Gift Membership Cancellations: If ...,prime gift membership cancellations need cance...,"[prime, gift, membership, cancellations, need,..."
6,6,Track Your Return: You can stay on top of your...,track return stay top returns tracking orders ...,"[track, return, stay, top, returns, tracking, ..."
7,7,Customer Return Policy for Kenmore Major Appli...,customer return policy kenmore major appliance...,"[customer, return, policy, kenmore, major, app..."
8,8,Upgrade to 5% Back on Amazon Prime Rewards Vis...,upgrade back amazon prime rewards visa signatu...,"[upgrade, back, amazon, prime, rewards, visa, ..."
9,9,Return Items You Ordered: You can return many ...,return items ordered return many items sold am...,"[return, items, ordered, return, many, items, ..."


### Map the document number to the training data

In [13]:
df["doc_num"] = df["document"].apply(lambda x: unique_document_map[x])
df

Unnamed: 0,query,document,label,doc_num
0,Cancel digital order I didn’t order.,Cancel an Accidental Purchase: You are able to...,0,0
1,NEED TO KNOW THE STATUS OF RETURNED ITEM,Verify Continuing Student Status: If you're a ...,0,1
2,I received a gift off my registry and it was s...,Returning Gifts from Your Baby Registry: The e...,0,2
3,How do I get internet on Amazon Fire?,Connect Your Fire Tablet to Wi-Fi\n\nAccess Wi...,1,3
4,I received an email saying my account info has...,Protect Your System: Some suspicious emails co...,0,4
5,Why was my most recent order canceled?,About Prime Gift Membership Cancellations: If ...,0,5
6,Im trying to locate my package,Track Your Return: You can stay on top of your...,0,6
7,"I never received the first order, thats why i ...",Customer Return Policy for Kenmore Major Appli...,0,7
8,ASK ABOUT THE VISA CARD,Upgrade to 5% Back on Amazon Prime Rewards Vis...,1,8
9,How am I supposed to return the package if I h...,Return Items You Ordered: You can return many ...,1,9


### Define a function to get query score and top n best doc

In [14]:
def get_query_score_and_top_n_best_doc(query):
    query = clean_text(query)
    #query_doc = dictionary.doc2bow(query.split())
    #scores = bm25_obj.get_scores(query_doc)
    tokenized_query = query.split(" ")
    scores = bm25_obj.get_scores(tokenized_query)
    best_docs = sorted(unique_document_df["doc_num"].values, key=lambda i: scores[i])[-TOP_N:][::-1]
    return best_docs, scores

In [17]:
def doc_num_is_in_top_n_docs(doc_num, query_best_docs):
    if doc_num in query_best_docs:
        return 1
    else:
        return 0

In [18]:
def common_member(a, b): 
    a_set = set(a) 
    b_set = set(b) 
    if (a_set & b_set): 
        return 1 
    else: 
        return 0


In [19]:
is_among_best_docs_list = []
is_among_best_docs_new_func_list = []

In [22]:
for i in range(30, 55, 5):
    TOP_N = i
    positive_train_df = df[df.label==1]
    positive_train_df["query_best_docs"] = positive_train_df["query"].apply(lambda x: get_query_score_and_top_n_best_doc(x)[0])
    positive_train_df["query_doc_score"] = positive_train_df["query"].apply(lambda x: get_query_score_and_top_n_best_doc(x)[1])
    positive_train_df["is_among_best_docs"] = positive_train_df[["doc_num", "query_best_docs"]].apply(lambda x: doc_num_is_in_top_n_docs(*x), axis=1)
    is_among_best_docs_list.append(pd.DataFrame(positive_train_df.groupby(["label", "is_among_best_docs"]).size()))
    query_matched_doc_df = pd.DataFrame(df[df.label==1].groupby("query")["doc_num"].apply(list)).reset_index()
    query_matched_doc_df.columns = ["query", "query_matched_doc_original"]
    positive_train_df = pd.merge(positive_train_df, query_matched_doc_df, on="query")
    positive_train_df["is_among_best_docs_new_func"] = positive_train_df[["query_best_docs", "query_matched_doc_original"]].apply(lambda x: common_member(*x), axis=1)
    is_among_best_docs_new_func_list.append(pd.DataFrame(positive_train_df.groupby(["label", "is_among_best_docs_new_func"]).size()))
    print(is_among_best_docs_new_func_list)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


[                                      0
label is_among_best_docs_new_func      
1     0                            3018
      1                            8874,                                       0
label is_among_best_docs_new_func      
1     0                            2706
      1                            9186]
[                                      0
label is_among_best_docs_new_func      
1     0                            3018
      1                            8874,                                       0
label is_among_best_docs_new_func      
1     0                            2706
      1                            9186,                                       0
label is_among_best_docs_new_func      
1     0                            2406
      1                            9486]
[                                      0
label is_among_best_docs_new_func      
1     0                            3018
      1                            8874,                                

In [25]:
for i in is_among_best_docs_new_func_list:
    tmp = pd.DataFrame(i)
    tmp[tmp.is_among_best_docs_new_func==1]

In [26]:
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,0
label,is_among_best_docs_new_func,Unnamed: 2_level_1
1,0,1930
1,1,9962


In [27]:
9962/(9962+1930)

0.8377060208543559

In [31]:

TOP_N = 100
positive_train_df = df[df.label==1]
positive_train_df["query_best_docs"] = positive_train_df["query"].apply(lambda x: get_query_score_and_top_n_best_doc(x)[0])
positive_train_df["query_doc_score"] = positive_train_df["query"].apply(lambda x: get_query_score_and_top_n_best_doc(x)[1])
positive_train_df["is_among_best_docs"] = positive_train_df[["doc_num", "query_best_docs"]].apply(lambda x: doc_num_is_in_top_n_docs(*x), axis=1)
query_matched_doc_df = pd.DataFrame(df[df.label==1].groupby("query")["doc_num"].apply(list)).reset_index()
query_matched_doc_df.columns = ["query", "query_matched_doc_original"]
positive_train_df = pd.merge(positive_train_df, query_matched_doc_df, on="query")
positive_train_df["is_among_best_docs_new_func"] = positive_train_df[["query_best_docs", "query_matched_doc_original"]].apply(lambda x: common_member(*x), axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [33]:
print(pd.DataFrame(positive_train_df.groupby(["label", "is_among_best_docs_new_func"]).size()))

                                       0
label is_among_best_docs_new_func       
1     0                             1169
      1                            10723


In [34]:
10723/(10723+1169)

0.901698620921628

In [29]:
is_among_best_docs_list[-1]

Unnamed: 0_level_0,Unnamed: 1_level_0,0
label,is_among_best_docs,Unnamed: 2_level_1
1,0,2129
1,1,9763


In [30]:
9763/(9962+1930)

0.8209720820719811