In [2]:
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import *
from pyspark.sql.functions import split, col, struct, udf
from pyspark.sql import Row
import sys
#import google_compute_engine
import gensim
from gensim import corpora,models,similarities
from gensim.matutils import softcossim 
from gensim.utils import simple_preprocess
import numpy as np
from scipy.sparse import csr_matrix

## load Spark

In [None]:
# Build the Spark entry points for this notebook.
# getOrCreate() reuses a context/session that is already running instead of
# raising when one exists, so this cell can safely be executed more than once
# (or in an environment that pre-creates a SparkContext).
conf = SparkConf().setAppName("Final")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## load word embeddings model

In [None]:
# Register the embeddings file from the bucket with Spark, then load the
# copy that SparkFiles resolves locally as gensim keyed vectors.
sc.addFile("gs://wiki_final/subword.vec")
model = models.KeyedVectors.load_word2vec_format(
    SparkFiles.get("subword.vec")
)

## load wiki dataset

In [None]:
# Read the Wikipedia dump with the 'xml' data source, one row per <page> element.
# The full dump lives at 'gs://wiki_final/big_data.xml.bz2'; the small test
# subset is what gets loaded here.
xml = (
    spark.read.format('xml')
    .option("rowTag", "page")
    .load('gs://wiki_final/Wikipedia-test-SUBSET.xml')
)

## pre-process data

In [None]:
#function used in map function
def getText(row):
    """Turn one parsed page row into a Row of title, tokenised text and id_.

    The article body is read from the nested ``row.revision.text._VALUE``
    field and tokenised with ``prepText``. The page id is stored under
    ``id_`` (kept so results can be joined back to pages).
    """
    raw_text = row.revision.text._VALUE
    tokens = prepText(raw_text)
    return Row(title=row.title, text=tokens, id_=row.id)

def prepText(s):
    """Lower-case ``s``, blank out punctuation and return its word tokens.

    Parameters
    ----------
    s : object
        Article text. Non-string values are coerced with ``str``; ``None``
        (a page without text) yields an empty token list instead of the
        spurious token ``"none"``.

    Returns
    -------
    list of str
        Non-empty tokens, split on any run of whitespace.
    """
    if s is None:
        return []
    punc = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'
    lowered = str(s).lower()
    for ch in punc:
        lowered = lowered.replace(ch, ' ')
    # split() with no separator treats tabs, carriage returns and repeated
    # spaces as delimiters and never produces empty strings, so no separate
    # empty-token filtering pass is needed.
    return lowered.split()
                            
def rmSp(x):
    """Return the items of ``x`` as a list, with every empty string dropped."""
    return [token for token in x if token != '']

# DataFrame -> RDD: map every page to a Row holding title, tokenised text and id_.
textRDD = (
    xml.select('revision', 'id', 'title')
    .rdd
    .map(getText)
)
# Back to a DataFrame so the stop-word transformer can be applied.
t = textRDD.toDF()
remover = StopWordsRemover(inputCol="text", outputCol="filtered")
stop_removed = remover.transform(t)

# Preview the result: columns id_, title, text, filtered.
stop_removed.show()

## helper functions for splitting data and calculating article vectors

In [None]:
def article_shape_fun(article):
    """Sum the embedding vectors of an article's words into one vector.

    ``model`` is the gensim keyed-vectors object loaded earlier in the
    notebook. It is looked up by word directly; it is not an RDD, so it
    has no ``filter``/``map``.

    Parameters
    ----------
    article : iterable of str
        The article's tokens.

    Returns
    -------
    list of float
        Element-wise sum of the vectors of all in-vocabulary words, with
        length ``model.vector_size``. Words missing from the vocabulary are
        skipped; an article with no known words yields an all-zero vector.
    """
    article_vector = np.zeros(model.vector_size, dtype=float)
    for word in article:
        # Guard the lookup: indexing an unknown word would raise KeyError.
        if word in model:
            # In-place ndarray addition is element-wise (list += would extend).
            article_vector += model[word]
    # Plain Python floats keep the result easy to store in a Row.
    return article_vector.tolist()

def str2arr(v):
    """Parse the numeric fields of a space-separated line into floats.

    The first space-separated field is skipped (callers pass text whose
    first field is not a vector component). Every remaining non-blank field
    is converted with ``float``, so signs, decimal points and exponents are
    preserved.

    Parameters
    ----------
    v : str
        Space-separated text.

    Returns
    -------
    list of float

    Raises
    ------
    ValueError
        If a non-blank field after the first is not a valid float.
    """
    fields = v.split(' ')[1:]
    # Blank fields come from doubled or trailing spaces; skip them rather
    # than letting float('') raise.
    return [float(field) for field in fields if field.strip()]

def split_vec(x):
    """Split one embeddings line into a Row of ``word`` and ``vec``.

    Assumes the line has the layout ``<word> <v1> <v2> ...`` (space
    separated) -- TODO confirm against the embeddings file actually used.

    The word is taken as the first space-separated field rather than "all
    alphabetic characters", so digits inside a word stay in the word and an
    exponent ``e`` inside a number stays in the number.

    Parameters
    ----------
    x : str or bytes
        One line of text; bytes are decoded as UTF-8.

    Returns
    -------
    Row
        ``Row(word=<str>, vec=<list of float>)``.
    """
    if isinstance(x, bytes):
        # Decode instead of str(bytes), which on Python 3 yields "b'...'".
        x = x.decode('utf8')
    line = x.strip()
    word = line.split(' ', 1)[0]
    # str2arr skips the first field itself, so it receives the whole line.
    return Row(word=word, vec=str2arr(line))

## df -> rdd: transform the data into SQL Rows and build the text array

In [None]:
# Same steps as the pre-processing cell: pages -> Rows of title, text, id_.
page_columns = ('revision', 'id', 'title')
textRDD = xml.select(*page_columns).rdd.map(getText)
# Rows -> DataFrame, then write the stop-word-free tokens to "filtered".
t = textRDD.toDF()
remover = StopWordsRemover(outputCol="filtered", inputCol="text")
stop_removed = remover.transform(t)

## take the first 100 articles as a subset to attempt to get some results

In [None]:
# Pull 100 articles to the driver as Rows of (id_, text), where text holds
# the stop-word-filtered tokens.
sub = (
    stop_removed.rdd
    .map(lambda record: Row(id_=record.id_, text=record.filtered))
    .take(100)
)

## generate article vectors

In [None]:
# Compute one Row(id_, shape) per sampled article on the driver, then
# distribute them in a single parallelize() call.
# Passing a bare Row to parallelize() would iterate over the Row's fields
# (a Row is a tuple) and yield the id and the vector as separate elements;
# building a list of Rows also avoids one union per article.
shape_rows = []
for article in sub:
    shape_rows.append(
        Row(id_=article.id_, shape=article_shape_fun(article.text))
    )
    print(article.id_)  # progress indicator
all_shapes = sc.parallelize(shape_rows)
print("shapes done")

## for each article:
* calculate cosine similarities
* sort similarities
* save top 10 similarities 

In [None]:
def _cosine_similarity(u, v):
    """Cosine similarity of two equal-length vectors; 0.0 if either is all zeros."""
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    if denom == 0.0:
        # An all-zero vector has no direction; score it 0.0 rather than NaN
        # so the ranking below stays well defined.
        return 0.0
    return float(np.dot(u, v) / denom)


# Loop over the article shapes and keep the 11 most similar articles for each.
# The article itself is among the candidates, so it appears in its own list.
rec_rows = []
for article in all_shapes.collect():
    # Bind the target vector as a default argument so the lambda carries this
    # iteration's value.
    recs = all_shapes.map(
        lambda x, target=article.shape: Row(
            id_=x.id_, sim=_cosine_similarity(target, x.shape)
        )
    ).top(11, key=lambda rec: rec.sim)
    # One (article id, recommendations) tuple per article.
    rec_rows.append((article.id_, recs))

# Wrap the tuples in a list so each one stays a single RDD element.
all_recs = sc.parallelize(rec_rows)

# Save the results RDD to the bucket.
all_recs.saveAsTextFile("gs://wiki_final/rec_id")
