In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import isnan, when, count, length, lit, udf, col, struct
import numpy as np
from pyspark.ml.feature import IDF, Tokenizer, CountVectorizer

In [2]:
spark = SparkSession.builder.getOrCreate()
spark

In [3]:
# set files as train and test

# sample sets for local computer
trainFileName = "./data/quora_train_1000.csv"
testFileName = "./data/quora_test_1000.csv"

# large sets for RCC
#trainFileName = "data/quora_train.csv"
#testFileName = "data/quora_test.csv"

In [4]:
# read in training set, drop na's, return count and head
sch = StructType([StructField('id',IntegerType()), \
                  StructField('qid1',IntegerType()),\
                  StructField('qid2',IntegerType()), \
                  StructField('question1',StringType()),\
                  StructField('question2',StringType()), \
                  StructField('is_duplicate',IntegerType())])
train = spark.read.csv(trainFileName, header=True, escape='"', 
                       quote='"',schema=sch, multiLine = True)
train = train.dropna()

train.cache()
print('Number of rows = %s' % train.count())
train.show(6)

Number of rows = 1000
+---+----+----+--------------------+--------------------+------------+
| id|qid1|qid2|           question1|           question2|is_duplicate|
+---+----+----+--------------------+--------------------+------------+
|  0|   1|   2|What is the step ...|What is the step ...|           0|
|  1|   3|   4|What is the story...|What would happen...|           0|
|  2|   5|   6|How can I increas...|How can Internet ...|           0|
|  3|   7|   8|Why am I mentally...|Find the remainde...|           0|
|  4|   9|  10|Which one dissolv...|Which fish would ...|           0|
|  5|  11|  12|Astrology: I am a...|I'm a triple Capr...|           1|
+---+----+----+--------------------+--------------------+------------+
only showing top 6 rows



In [5]:
# read in test set, drop nas, return count and head
test = spark.read.csv(testFileName, header=True, escape='"', \
                            encoding='utf8', multiLine = True)
test = test.dropna()
test.cache()
print('Number of rows = %s' % test.count())
test.show(6)

Number of rows = 1000
+-------+--------------------+--------------------+
|test_id|           question2|           question1|
+-------+--------------------+--------------------+
|      0|How do I show tha...|Emoticons: What g...|
|      1|What is the scope...|Does ECE have a s...|
|      2|What was the orig...|Why do prosecuted...|
|      3|How  can someone ...|How do I grow tal...|
|      4|Can weapons to pa...|What is the bigge...|
|      5|What is the if I ...|What happens when...|
+-------+--------------------+--------------------+
only showing top 6 rows



In [6]:
#Drop unnecessary columns from train
train = train.drop('qid1', 'qid2')
#Create dataframe `test` with new column `id`
maxTrainID = train.groupBy().max('id').collect()[0][0]
test = test.withColumn("id",(test.test_id+maxTrainID+1).cast("integer")).drop('test_id')
test.show(5)

+--------------------+--------------------+----+
|           question2|           question1|  id|
+--------------------+--------------------+----+
|How do I show tha...|Emoticons: What g...|1000|
|What is the scope...|Does ECE have a s...|1001|
|What was the orig...|Why do prosecuted...|1002|
|How  can someone ...|How do I grow tal...|1003|
|Can weapons to pa...|What is the bigge...|1004|
+--------------------+--------------------+----+
only showing top 5 rows



In [7]:
# Add column is duplicate with -1 as a value for test
test = test.withColumn('is_duplicate', lit(-1))
test.show(6)

+--------------------+--------------------+----+------------+
|           question2|           question1|  id|is_duplicate|
+--------------------+--------------------+----+------------+
|How do I show tha...|Emoticons: What g...|1000|          -1|
|What is the scope...|Does ECE have a s...|1001|          -1|
|What was the orig...|Why do prosecuted...|1002|          -1|
|How  can someone ...|How do I grow tal...|1003|          -1|
|Can weapons to pa...|What is the bigge...|1004|          -1|
|What is the if I ...|What happens when...|1005|          -1|
+--------------------+--------------------+----+------------+
only showing top 6 rows



In [8]:
# join the train and test:  data
data = train.union(test.select(train.columns))
print('Number of rows = %s' % data.count())
data.filter(data.id > 996).show(6)

Number of rows = 2000
+----+--------------------+--------------------+------------+
|  id|           question1|           question2|is_duplicate|
+----+--------------------+--------------------+------------+
| 997|I and my girlfrie...|Why most of the c...|           0|
| 998|Could we use cher...|Can we map the su...|           1|
| 999|What is a good so...|Diving the Blue H...|           0|
|1000|Emoticons: What g...|How do I show tha...|          -1|
|1001|Does ECE have a s...|What is the scope...|          -1|
|1002|Why do prosecuted...|What was the orig...|          -1|
+----+--------------------+--------------------+------------+
only showing top 6 rows



In [9]:
import nltk
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\JohntheGreat\AppData\Roaming\nltk_data..
[nltk_data]    |     .
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\JohntheGreat\AppData\Roaming\nltk_data..
[nltk_data]    |     .
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\JohntheGreat\AppData\Roaming\nltk_data..
[nltk_data]    |     .
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\JohntheGreat\AppData\Roaming\nltk_data..
[nltk_data]    |     .
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\JohntheGreat\AppData\Roaming\nltk_data..
[nltk_

True

In [10]:
# Define stop words
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[:15])
stop_words = set(stop_words)
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him']


In [11]:
# lemmas_nltk and udf

def lemmas_nltk(s):
    return " ".join([wordnet_lemmatizer.lemmatize(wordnet_lemmatizer.lemmatize(w,'n'),'v')
                     for w in s.lower().split() if w.isalpha() & (not w in stop_words)])
lemmas_nltk_udf = udf(lemmas_nltk, StringType())

In [12]:
# wordcount function and udf
def wordsCount(str): return str.count(' ')+1
wordsCount_udf = udf(wordsCount, IntegerType())

In [13]:
# ratio function and udf
def ratio(x,y): return abs(x-y)/(x+y+1e-15) ############## divide by zero occures!!! ###########################
ratio_udf = udf(ratio, DoubleType())

In [14]:
# Common ngrams function and udf
import re
regex = re.compile('([^\s\w]|_)+')
def commonNgrams(s1, s2, n):
    return len(set(nltk.ngrams(regex.sub('', s1).lower().split(), n))
                & set(nltk.ngrams(regex.sub('', s2).lower().split(), n)) )
commonNgrams_udf = udf(commonNgrams, IntegerType())

In [15]:
#unigram_ratio function and udf
def unigram_ratio(ngrams, n1, n2):
    return ngrams/(1+max(n1, n2))
unigram_ratio_udf = udf(unigram_ratio, DoubleType())


In [16]:
# Squared distance function
def tfidfDist(a,b): return float(a.squared_distance(b))
dist_udf = udf(tfidfDist, DoubleType())

In [17]:
#Project description
#For the first part of this project create the following set of features:
featureNames = ['lWCount1', 'lWCount2',
                'qWCount1', 'qWCount2',
                'lLen1', 'lLen2',
                'qLen1', 'qLen2',
                'lWCount_ratio', 'qWCount_ratio',
                'lLen_ratio', 'qLen_ratio',
                'qNgrams_1', 'qNgrams_2', 'qNgrams_3', 
                'lNgrams_1', 'lNgrams_2', 'lNgrams_3', 
                'qUnigram_ratio', 'lUnigram_ratio', 
                'tfidfDistance']

In [18]:
# show starting data frame
data.show(6)

+---+--------------------+--------------------+------------+
| id|           question1|           question2|is_duplicate|
+---+--------------------+--------------------+------------+
|  0|What is the step ...|What is the step ...|           0|
|  1|What is the story...|What would happen...|           0|
|  2|How can I increas...|How can Internet ...|           0|
|  3|Why am I mentally...|Find the remainde...|           0|
|  4|Which one dissolv...|Which fish would ...|           0|
|  5|Astrology: I am a...|I'm a triple Capr...|           1|
+---+--------------------+--------------------+------------+
only showing top 6 rows



In [20]:
# lemmatization
for i in ["1","2"]:
    data = data.withColumn('lemma'+i, lemmas_nltk_udf(data["question"+i]))

data.select('id','lemma1','lemma2').show(6)

+---+--------------------+--------------------+
| id|              lemma1|              lemma2|
+---+--------------------+--------------------+
|  0|step step guide i...|step step guide i...|
|  1|      story kohinoor|would happen indi...|
|  2|increase speed in...|internet speed in...|
|  3|      mentally solve|find remainder di...|
|  4|one dissolve wate...|fish would surviv...|
|  5|capricorn sun cap...|triple capricorn ...|
+---+--------------------+--------------------+
only showing top 6 rows



In [24]:
import gensim
model = gensim.models.KeyedVectors \
.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                      binary=True)

In [25]:
print(model.most_similar(positive=['capital','Iceland'])[0])
print(model.most_similar(positive=['woman', 'king'],
                         negative=['man'])[0])

('Reykjavik', 0.6050446629524231)
('queen', 0.7118192911148071)


In [30]:

print('Words: %s' %words)
M = []
for w in words:
    try: M.append(model[w])
    except: continue
M = np.array(M)
if len(M)==0: question2vec = np.zeros(300)
else: 
    question2vec = M.sum(axis=0)
    norm = np.sqrt((question2vec ** 2).sum())
    if norm>0: question2vec /= norm 

print(question2vec[:5])
len(question2vec)

Words: ['dogs', 'having', 'lower', 'traditional']
[-0.02835612 -0.01235805 -0.0893375   0.1272655  -0.03042328]


300

In [31]:
def quest2vec(words):
    M = []
    for w in words:
        try: M.append(model[w])
        except: continue
    M = np.array(M)
    if len(M)==0: question2vec = np.zeros(300)
    else: 
        question2vec = M.sum(axis=0)
        norm = np.sqrt((question2vec ** 2).sum())
        if norm>0: question2vec /= norm 
    return(question2vec[:5])
q2v_udf = udf(quest2vec, DoubleType())   

In [None]:
def questdiff(a,b): return float(a.squared_distance(b))
dist_udf = udf(questdiff, DoubleType())

AttributeError: 'list' object has no attribute 'show'

In [None]:
for i in ["1","2"]:
    data2 = data.withColumn('q2v'+i, q2v_udf(data['lemma'+i]))

In [33]:
words = ['dogs', 'pets', 'dog', 'puppy']

In [34]:
quest2vec(words)

array([ 0.01432754,  0.00739102, -0.0632403 ,  0.06878085, -0.04557976], dtype=float32)

In [34]:
# add word counts and lengths
for i in ["1","2"]:
    data = data.withColumn('lWCount'+i, wordsCount_udf(data['lemma'+i]))
    data = data.withColumn('qWCount'+i, wordsCount_udf(data['question'+i]))
    data = data.withColumn('lLen'+i, length(data['lemma'+i]))
    data = data.withColumn('qLen'+i, length(data['question'+i]))
data.select('lWCount1','lWCount2','qWCount1','qWCount2',
            'lLen1','lLen2','qLen1','qLen2').show(6)

+--------+--------+--------+--------+-----+-----+-----+-----+
|lWCount1|lWCount2|qWCount1|qWCount2|lLen1|lLen2|qLen1|qLen2|
+--------+--------+--------+--------+-----+-----+-----+-----+
|       6|       5|      14|      12|   35|   28|   66|   57|
|       2|       7|       8|      13|   14|   53|   51|   88|
|       5|       4|      14|      10|   38|   28|   73|   59|
|       2|       3|      11|       9|   14|   21|   50|   65|
|       7|       4|      13|       7|   43|   23|   76|   39|
|       6|       5|      16|      16|   30|   35|   86|   90|
+--------+--------+--------+--------+-----+-----+-----+-----+
only showing top 6 rows



In [35]:
# add ratios for counts and ratios for lengths
data = data.withColumn('lWCount_ratio', ratio_udf(data['lWCount1'],data['lWCount2']))
data = data.withColumn('qWCount_ratio', ratio_udf(data['qWCount1'],data['qWCount2']))
data = data.withColumn('lLen_ratio', ratio_udf(data['lLen1'],data['lLen2']))
data = data.withColumn('qLen_ratio', ratio_udf(data['qLen1'],data['qLen2']))
data.select('lWCount_ratio','qWCount_ratio','lLen_ratio','qLen_ratio').show(6)

+-------------------+-------------------+-------------------+--------------------+
|      lWCount_ratio|      qWCount_ratio|         lLen_ratio|          qLen_ratio|
+-------------------+-------------------+-------------------+--------------------+
| 0.0909090909090909|0.07692307692307693| 0.1111111111111111| 0.07317073170731707|
| 0.5555555555555555|0.23809523809523808|  0.582089552238806| 0.26618705035971224|
|0.11111111111111109|0.16666666666666666|0.15151515151515152| 0.10606060606060606|
|0.19999999999999996|                0.1|                0.2| 0.13043478260869565|
| 0.2727272727272727|                0.3|0.30303030303030304|  0.3217391304347826|
| 0.0909090909090909|                0.0|0.07692307692307693|0.022727272727272728|
+-------------------+-------------------+-------------------+--------------------+
only showing top 6 rows



In [39]:
#N-grams and n-gram ratios (1,2,3)
for i in [1, 2, 3]:
    data = data.withColumn("qNgrams_"+str(i),commonNgrams_udf \
                           (data['question1'],data['question2'],lit(i)))
    data = data.withColumn("lNgrams_"+str(i),commonNgrams_udf \
                           (data['lemma1'],data['lemma2'],lit(i)))
data = data.withColumn('qUnigram_ratio', unigram_ratio_udf \
                       (data.qNgrams_1,data.qWCount1,data.qWCount2))
data = data.withColumn('lUnigram_ratio', unigram_ratio_udf \
                       (data.lNgrams_1,data.lWCount1,data.lWCount2))
data.select('lNgrams_1','lNgrams_2','lNgrams_3', \
            'qNgrams_1','qNgrams_2','qNgrams_3', \
            'qUnigram_ratio','lUnigram_ratio').show(6)

+---------+---------+---------+---------+---------+---------+-------------------+-------------------+
|lNgrams_1|lNgrams_2|lNgrams_3|qNgrams_1|qNgrams_2|qNgrams_3|     qUnigram_ratio|     lUnigram_ratio|
+---------+---------+---------+---------+---------+---------+-------------------+-------------------+
|        4|        4|        3|       11|       11|       10| 0.7333333333333333| 0.5714285714285714|
|        1|        0|        0|        4|        2|        1| 0.2857142857142857|              0.125|
|        3|        0|        0|        4|        1|        0|0.26666666666666666|                0.5|
|        0|        0|        0|        0|        0|        0|                0.0|                0.0|
|        0|        0|        0|        4|        0|        0| 0.2857142857142857|                0.0|
|        3|        0|        0|        9|        4|        1| 0.5294117647058824|0.42857142857142855|
+---------+---------+---------+---------+---------+---------+-------------------+-

In [40]:
# add tokenized lemmas :words1, words2
tokenizer = Tokenizer(inputCol="lemma1", outputCol="words1")
data = tokenizer.transform(data)
tokenizer.setParams(inputCol="lemma2", outputCol="words2")
data = tokenizer.transform(data)
data.select('id','lemma1','words1','lemma2','words2').show(5)

+---+--------------------+--------------------+--------------------+--------------------+
| id|              lemma1|              words1|              lemma2|              words2|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|step step guide i...|[step, step, guid...|step step guide i...|[step, step, guid...|
|  1|      story kohinoor|   [story, kohinoor]|would happen indi...|[would, happen, i...|
|  2|increase speed in...|[increase, speed,...|internet speed in...|[internet, speed,...|
|  3|      mentally solve|   [mentally, solve]|find remainder di...|[find, remainder,...|
|  4|one dissolve wate...|[one, dissolve, w...|fish would surviv...|[fish, would, sur...|
+---+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [41]:
# Create corpus 
corpus = data.selectExpr('words1 as words').join(data.selectExpr('words2 as words'), on='words', how='full')
corpus.show(5)
# initialize CountVectorizer
cv = CountVectorizer(inputCol="words", outputCol="tf", minDF=2.0)
# Fit a countVectorizerModel to corpus
cvModel = cv.fit(corpus)
corpus = cvModel.transform(corpus)
corpus.show(5)
# Check # of wods in new vocabulary
print('CountVectorizerModel has a vocabulary of length ',len(cvModel.vocabulary))
# Apply CountVectorizerModel.transform () to question 1 and 2
res1 = cvModel.transform(data.selectExpr('id', 'words1 as words'))
res2 = cvModel.transform(data.selectExpr('id', 'words2 as words'))
res1.show(5)
res2.show(5)


+--------------------+
|               words|
+--------------------+
|[best, available,...|
|     [contact, good]|
|[convince, people...|
|        [creativity]|
|        [find, baby]|
+--------------------+
only showing top 5 rows

+--------------------+--------------------+
|               words|                  tf|
+--------------------+--------------------+
|[best, available,...|(2088,[0,273,1338...|
|     [contact, good]|(2088,[4,578],[1....|
|[convince, people...|(2088,[10,71,1088...|
|        [creativity]|        (2088,[],[])|
|        [find, baby]|(2088,[24,965],[1...|
+--------------------+--------------------+
only showing top 5 rows

CountVectorizerModel has a vocabulary of length  2088
+---+--------------------+--------------------+
| id|               words|                  tf|
+---+--------------------+--------------------+
|  0|[step, step, guid...|(2088,[178,206,22...|
|  1|   [story, kohinoor]|(2088,[522,1562],...|
|  2|[increase, speed,...|(2088,[7,75,251,3...|
|  3|

In [42]:
# Calculate idf
idf = IDF(inputCol="tf", outputCol="idf")
idfModel = idf.fit(corpus)
res1 = idfModel.transform(res1)
res2 = idfModel.transform(res2)
res1.show(5)
res2.show(5)

+---+--------------------+--------------------+--------------------+
| id|               words|                  tf|                 idf|
+---+--------------------+--------------------+--------------------+
|  0|[step, step, guid...|(2088,[178,206,22...|(2088,[178,206,22...|
|  1|   [story, kohinoor]|(2088,[522,1562],...|(2088,[522,1562],...|
|  2|[increase, speed,...|(2088,[7,75,251,3...|(2088,[7,75,251,3...|
|  3|   [mentally, solve]|(2088,[1060,1157]...|(2088,[1060,1157]...|
|  4|[one, dissolve, w...|(2088,[11,270,115...|(2088,[11,270,115...|
+---+--------------------+--------------------+--------------------+
only showing top 5 rows

+---+--------------------+--------------------+--------------------+
| id|               words|                  tf|                 idf|
+---+--------------------+--------------------+--------------------+
|  0|[step, step, guid...|(2088,[178,226,35...|(2088,[178,226,35...|
|  1|[would, happen, i...|(2088,[8,26,45,15...|(2088,[8,26,45,15...|
|  2|[int

In [43]:
# temporary dataframe containing idf1 and idf2 :res
res = res1.selectExpr('id','idf as idf1').join(res2.selectExpr('id','idf as idf2'), on='id', how='inner')
# Calculate the distance
res = res.withColumn('dist', dist_udf(res['idf1'], res['idf2']))
res.show(6)

+---+--------------------+--------------------+------------------+
| id|                idf1|                idf2|              dist|
+---+--------------------+--------------------+------------------+
|  0|(2088,[178,206,22...|(2088,[178,226,35...|31.436004150870346|
|  1|(2088,[522,1562],...|(2088,[8,26,45,15...|121.45324418057099|
|  2|(2088,[7,75,251,3...|(2088,[75,145,251...| 94.38709182458246|
|  3|(2088,[1060,1157]...|(2088,[24],[4.271...| 114.2573847509046|
|  4|(2088,[11,270,115...|(2088,[8,1294,172...| 262.3278971642576|
|  5|(2088,[81,588,998...|(2088,[81,588,172...| 48.00463629773798|
+---+--------------------+--------------------+------------------+
only showing top 6 rows



In [44]:
# Drop columns from data and join in tfidfDistance 
data = data.drop('words1', 'words2')
data = data.join(res.selectExpr('id','dist as tfidfDistance'),on='id',how='inner')
data.select('id','tfidfDistance').show(6)

+---+------------------+
| id|     tfidfDistance|
+---+------------------+
|  0|31.436004150870346|
|  1|121.45324418057099|
|  2| 94.38709182458246|
|  3| 114.2573847509046|
|  4| 262.3278971642576|
|  5| 48.00463629773798|
+---+------------------+
only showing top 6 rows



In [48]:
data.show(2)

+---+--------------------+--------------------+------------+--------------------+--------------------+--------+--------+-----+-----+--------+--------+-----+-----+------------------+-------------------+------------------+-------------------+---------+---------+---------+---------+---------+---------+------------------+------------------+------------------+
| id|           question1|           question2|is_duplicate|              lemma1|              lemma2|lWCount1|qWCount1|lLen1|qLen1|lWCount2|qWCount2|lLen2|qLen2|     lWCount_ratio|      qWCount_ratio|        lLen_ratio|         qLen_ratio|qNgrams_1|lNgrams_1|qNgrams_2|lNgrams_2|qNgrams_3|lNgrams_3|    qUnigram_ratio|    lUnigram_ratio|     tfidfDistance|
+---+--------------------+--------------------+------------+--------------------+--------------------+--------+--------+-----+-----+--------+--------+-----+-----+------------------+-------------------+------------------+-------------------+---------+---------+---------+---------+----

In [52]:
# return first row of dataframe
data = data.select(['id']+featureNames+['is_duplicate'])
data = data.cache()
data.select(['id']+featureNames[:8]).show(6)
data.select(['id']+featureNames[8:12]).show(6)
data.select(['id']+featureNames[12:18]).show(6)
data.select(['id']+featureNames[18:]).show(6)

+---+--------+--------+--------+--------+-----+-----+-----+-----+
| id|lWCount1|lWCount2|qWCount1|qWCount2|lLen1|lLen2|qLen1|qLen2|
+---+--------+--------+--------+--------+-----+-----+-----+-----+
|  0|       6|       5|      14|      12|   35|   28|   66|   57|
|  1|       2|       7|       8|      13|   14|   53|   51|   88|
|  2|       5|       4|      14|      10|   38|   28|   73|   59|
|  3|       2|       3|      11|       9|   14|   21|   50|   65|
|  4|       7|       4|      13|       7|   43|   23|   76|   39|
|  5|       6|       5|      16|      16|   30|   35|   86|   90|
+---+--------+--------+--------+--------+-----+-----+-----+-----+
only showing top 6 rows

+---+-------------------+-------------------+-------------------+--------------------+
| id|      lWCount_ratio|      qWCount_ratio|         lLen_ratio|          qLen_ratio|
+---+-------------------+-------------------+-------------------+--------------------+
|  0| 0.0909090909090909|0.07692307692307693| 0.111111

In [53]:
# Save train and test features to separate .csv files
outData = data.select(['id']+featureNames+['is_duplicate'])
outData = outData.cache()
outTrainFileName = "train_features2.csv"
outTestFileName = "test_features2.csv"
outData.filter(outData.id <= maxTrainID).\
coalesce(1).write.csv(outTrainFileName,header=True,mode='overwrite',quote="")
outData.filter(outData.id > maxTrainID).withColumn('id', outData.id-maxTrainID-1).\
coalesce(1).write.csv(outTestFileName,header=True,mode='overwrite',quote="")


