In [1]:
import os
import string
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(color_codes=True)
sns.set_style("white")

from plotly.offline import plot
import plotly.graph_objs as go

import sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

import re
import nltk
from nltk.corpus import stopwords
import string
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.porter import *
from string import punctuation

In [2]:
stops = set(stopwords.words("english"))
stemmer = PorterStemmer()
TARGET = 'is_duplicate'
ID = 'id'
NUM_PLACEHOLDER = 'NUMBER'

metrics = list()

In [3]:
def find_unigrams(question):
    word_tokens = question.split()
    word_tokens = [w for w in word_tokens if not w in stops]
    word_tokens = list(map(str.lower, word_tokens))
    return word_tokens

def get_word_frequencies(row):
    return dict(Counter(row))

def get_word_freq_proportions(row):
    row_sum = sum(row.values())
    row_dict_proportion = {a : (b / row_sum) for a, b in row.items()}
    return row_dict_proportion

def get_intersecting_words(row):
    q1_dict = row['q1_freq_dict']
    q2_dict = row['q2_freq_dict']
    
    intersecting_words = {a : q1_dict[a] + q2_dict[a] for a in q1_dict.keys() if a in q2_dict.keys()}
    return intersecting_words

In [4]:
train_set = pd.read_csv(os.getcwd() + '/input/train.csv')
test_set = pd.read_csv(os.getcwd() + '/input/test.csv')

train_set['question1'] = train_set['question1'].fillna('')
train_set['question2'] = train_set['question2'].fillna('')
test_set['question1']  = test_set['question1'].fillna('')
test_set['question2']  = test_set['question2'].fillna('')

In [5]:
train_copy = train_set.copy()

In [6]:
def clean_text(text):
    text = re.sub(r"(What|what|Where|there)\'s", r'\1 is', text)
    text = re.sub(r'(I|i)\'m', r'\1 am', text)
    text = re.sub(r"(do|are|have|did|does|was|is) not", r"\1n't", text)
    text = re.sub(r"can not", r"can't'", text)
    text = re.sub(r"will not", r"won't", text)
    text = re.sub(r"I\'ll", r"I will", text)
    
    text = re.sub(r"(demon)(i)(tization)", r"\1e\3",text)
    text = re.sub("de-monetize", "demonetize",text)
    text = re.sub("19-year-old", "19 year old", text)
    text = re.sub("I\'ve", "I have", text)
    text = re.sub("you\'ve", "you have", text)
    text = re.sub("'re", " are", text)
    text = re.sub("&", "and", text)
    text = re.sub("\s\.\s", "\.", text)
    text = re.sub("“|”", "\"", text)
    
    #Replace multiple occurrences of spaces with a single space
    text = re.sub(r"(\w+)(\s{2})(\w+)", r"\1 \3", text)
    text = re.sub(r"(\w+)(\s{3})(\w+)", r"\1 \3", text)
    text = re.sub(r"(\w+)(\s{2})(\d+)", r"\1 \3", text)
    text = re.sub(r"(\d+)(\s{2})(\w+)", r"\1 \3", text)
    text = re.sub(r"(\.)(\s{2})(\w+)", r"\1 \3", text)
    text = re.sub(r"(\w+)(\s{2})(\.)", r"\1 \3", text)
    text = re.sub(r"(:\)|:\()", r"", text)

    text = re.sub("rupees\snotes|rupees\snote|rupees\scurrency\snotes|rupee\snotes|currency\snotes|Rs notes", "notes", text)
    text = re.sub("currency\snote|rs\snotes|rupee note", "notes", text)
    text = re.sub(r"(Rs\s+)(\d0{2}\sand\s)(Rs\s+)(\d0{3})(\snotes)", r"\2\4\5", text)
    text = re.sub(r"(Rs.\s+)(\d0{2}\sand\s)(Rs.\s+)(\d0{3})(\snotes)", r"\2\4\5", text)
    
    #Replace number abbreviations
    text = re.sub(r"(\d+)(k\s)", r"\g<1>000\s", text)
    return text

In [7]:
train_copy['q1_clean']   = train_copy['question1'].apply(lambda x: clean_text(x))
train_copy['q2_clean']   = train_copy['question2'].apply(lambda x: clean_text(x))

In [8]:
res = train_copy[train_copy['question1'].str.contains(r"\d0{2}\sand\s\d0{3}")]
res.shape

(2003, 8)

In [9]:
res2 = train_copy[train_copy['question1'].str.contains("\.com")]
res2.shape

(1714, 8)

In [10]:
idx = np.random.choice(res2.index.values)
print(idx)
print(train_copy.loc[idx]['question1'])
print(train_copy.loc[idx]['q1_clean'])

print("----------------------------------------------")
print(train_copy.loc[idx]['question2'])
print(train_copy.loc[idx]['q2_clean'])

382311
How profitable is wearhop.com?
How profitable is wearhop.com?
----------------------------------------------
Is NakedNews.com profitable?
Is NakedNews.com profitable?


In [11]:
train_copy['q1_demonetize']    = train_copy['q1_clean'].str.contains("\d0{2}\sand\s\d0{3}\snotes")
train_copy['q2_demonetize']    = train_copy['q2_clean'].str.contains("\d0{2}\sand\s\d0{3}\snotes")
train_copy['q1_demonetize']    = train_copy['q1_demonetize'].map({True: 1, False: 0})
train_copy['q2_demonetize']    = train_copy['q2_demonetize'].map({True: 1, False: 0})

In [12]:
train_copy['q1_tokens']      = train_copy['q1_clean'].apply(lambda x: find_unigrams(x))
train_copy['q2_tokens']      = train_copy['q2_clean'].apply(lambda x: find_unigrams(x))
train_copy['q1_len']         = train_copy['q1_tokens'].apply(len)
train_copy['q2_len']         = train_copy['q2_tokens'].apply(len)
train_copy['q1_q2_len_diff'] = abs(train_copy['q1_len'] - train_copy['q2_len'])

In [13]:
train_copy['q1_freq_dict'] = train_copy['q1_tokens'].apply(lambda row: get_word_frequencies(row))
train_copy['q2_freq_dict'] = train_copy['q2_tokens'].apply(lambda row: get_word_frequencies(row))

In [14]:
train_copy['intersecting_words']        = train_copy.apply(lambda row: get_intersecting_words(row), axis=1)
train_copy['intersecting_word_counts']  = train_copy.apply(lambda x: len(x['intersecting_words']), axis=1)
train_copy['intersecting_word_prop']    = train_copy['intersecting_word_counts'] / (train_copy['q1_len'] + train_copy['q2_len'])

In [15]:
idx = np.random.choice(train_copy.index.values)
print(idx)
   
print(train_copy.iloc[idx]['q1_freq_dict'])
print("=============================================")
print(train_copy.iloc[idx]['q2_freq_dict'])
print("=============================================")
print(train_copy.iloc[idx]['intersecting_words'])
print(train_copy.iloc[idx]['intersecting_word_prop'])

160803
{'red': 1, 'blue': 2, '20': 1, 'must': 1, 'yellow': 1, 'how': 1, 'identical': 1, 'bowls?': 1, 'red,': 1, 'bowls': 1, 'ways': 1, 'pots': 1, 'many': 1, 'painting': 1}
{'red': 1, 'blue': 2, 'painting': 1, 'must': 1, 'yellow': 1, 'how': 1, 'identical': 1, 'bowls?': 1, 'red,': 1, 'bowls': 2, 'ways': 1, 'many': 1, '20': 1}
{'red': 2, 'blue': 4, '20': 2, 'must': 2, 'yellow': 2, 'how': 2, 'identical': 2, 'bowls?': 2, 'red,': 2, 'bowls': 3, 'ways': 2, 'many': 2, 'painting': 2}
0.433333333333


In [16]:
train_corpus  = pd.DataFrame(pd.Series(train_set['question1'].tolist() + train_set['question2'].tolist()))
train_corpus.drop_duplicates(inplace=True)
train_corpus_series = pd.Series(train_corpus.iloc[:, 0].index, index=train_corpus.iloc[:, 0].values)

In [17]:
train_copy['q1_index'] = train_copy['question1'].map(train_corpus_series.to_dict())
train_copy['q2_index'] = train_copy['question2'].map(train_corpus_series.to_dict())

q1_vc = train_copy['q1_index'].value_counts()
q2_vc = train_copy['q2_index'].value_counts()

train_copy['q1_freq'] = train_copy['q1_index'].map(q1_vc.to_dict())
train_copy['q2_freq'] = train_copy['q2_index'].map(q2_vc.to_dict())

In [18]:
X = train_copy[['intersecting_word_prop', 'q1_demonetize', 'q1_freq']]
y = train_copy[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = RandomForestClassifier(random_state=0)
clf = clf.fit(X_train, y_train)

y_proba = clf.predict_proba(X_test)
log_loss_score = log_loss(y_test, y_proba)
metrics.append(log_loss_score)

In [19]:
metrics

[0.45607551127908924]

In [20]:
# bow_transformer = CountVectorizer(analyzer='word', ngram_range=(1,2), max_features=256, stop_words='english')
# bow_matrix =  bow_transformer.fit_transform(train_corpus)

# vocab = bow_transformer.get_feature_names()
# bow_matrix = bow_matrix.toarray()

# document_freq = np.sum(bow_matrix, axis=1)
# word_count = np.sum(bow_matrix, axis=0)

# words_count = pd.DataFrame(word_count, columns=['freq'], index=vocab)
# words_count = words_count.sort_values('freq', ascending=False)

In [21]:
# tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df = 0, max_features=256, stop_words='english')
# tfidf_matrix =  tf.fit_transform(train_corpus)
# dense = tfidf_matrix.todense()
# feature_names = tf.get_feature_names()

In [22]:
# episode = dense[0].tolist()[0]
# phrase_scores = [pair for pair in zip(range(0, len(episode)), episode) if pair[1] > 0]
# phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)


# for phrase, score in [(feature_names[word_id], score) for (word_id, score) in phrase_scores]:
#    print("{0:<20} {1}".format(phrase, score))

In [23]:
# gb_qid  = train_set.groupby('qid1').filter(lambda x: len(x) > 1).groupby('qid1')
# duplicate_qid1 = sorted(list(gb_qid.groups))
# len(duplicate_qid1)

In [24]:
# stats = gb_qid['is_duplicate'].agg({np.sum, np.size})
# only_duplicates = stats.loc[stats['sum'] == stats['size']].sort_values(['size'], ascending=False)
# duplicate_df = train_set.loc[train_set['qid1'].isin(only_duplicates.index)]

In [25]:
# train_set.loc[train_set['qid1'].isin(duplicate_qid1), 'graph_root'] = 1

# train_set['graph_root'].fillna(0, inplace=True)
# train_set['graph_root'] = train_set['graph_root'].astype(int)

In [26]:
# for node in only_duplicates.index:
#     group = train_set.loc[train_set['qid1'] == node]
#     group1 = train_set.loc[train_set['qid1'].isin(group['qid2'])]
    
#     if len(group1) > 0:
#         train_set.loc[train_set['qid1'] == node, 'neighbors'] = len(group1)
        
# train_set['neighbors'].fillna(0, inplace=True)
# train_set['neighbors'] = train_set['neighbors'].astype(int)

In [27]:
#from nltk.book import *