## Read Pre-Trained Embeddings into Dictionary 

In [1]:
import nltk
# Download the NLTK data packages this notebook requests. 'stopwords' is read by
# data_preprocessing below; 'wordnet' presumably backs the WordNetLemmatizer import,
# which is not used in the cells visible here (TODO confirm it is still needed).
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/kat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from string import digits
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer
import math

In [3]:
# Parse the pre-trained GloVe text file into {word: float32 vector}.
# Every non-empty line is "<word> <v1> <v2> ...", separated by whitespace.
embeddings_dict = {}
# The encoding is explicit because the vocabulary may contain non-ASCII tokens and the
# platform default encoding is not UTF-8 everywhere (e.g. on Windows).
with open("glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline) instead of failing on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [4]:
def _filter_tokens(sentences, keep):
    """Return `sentences` keeping only the whitespace-separated tokens for which keep(token) is true."""
    return sentences.apply(lambda text: " ".join(tok for tok in text.split() if keep(tok)))


def data_preprocessing(inputData):
    """Load the headline CSV and build a cleaned `replaced_sentence` column.

    Steps, in order:
      1. drop rows whose `grades` value is 0;
      2. substitute the `edit` word for the `<.../>` tag in `original`;
      3. lowercase, remove English stop words, strip punctuation;
      4. remove the leftover fragments 's', 'nt', 'nd';
      5. keep only words present in the module-level `embeddings_dict`;
      6. remove the 10 least frequent remaining words;
      7. strip digits, then re-apply the vocabulary filter.

    Parameters
    ----------
    inputData : str or file-like
        Anything accepted by `pandas.read_csv`. The file must contain the columns
        `original`, `edit` and `grades`.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with `original`, `edit` and `grades` dropped and a
        `replaced_sentence` column added. Every token in that column is a key of
        `embeddings_dict`.

    Notes
    -----
    Reads the module-level `embeddings_dict`, so the embeddings must be loaded first.
    """
    dataset = pd.read_csv(inputData)
    # .copy() gives an independent frame, so the column assignment below does not write
    # into a slice of the frame returned by read_csv (SettingWithCopyWarning).
    dataset = dataset[dataset['grades'] != 0].copy()  # remove entries with 0 grades

    # Replace the tagged word in the news headline with the edit word. A callable
    # replacement inserts the edit word literally; passing it as a replacement string
    # would make re.sub interpret backslashes / group references inside it.
    tag_pattern = re.compile('<.*/>', flags=re.DOTALL)
    sentences = pd.Series(
        [tag_pattern.sub(lambda _match: edit, original)
         for original, edit in zip(dataset['original'], dataset['edit'])],
        index=dataset.index,
    )

    # make characters lowercase (also collapses runs of whitespace)
    sentences = sentences.apply(lambda text: " ".join(tok.lower() for tok in text.split()))

    # remove stop words; a set makes each membership test O(1)
    stop = set(stopwords.words('english'))
    sentences = _filter_tokens(sentences, lambda tok: tok not in stop)

    # Strip punctuation. regex=True is explicit: pandas >= 2.0 defaults to a literal
    # match, which would silently leave the punctuation in place.
    sentences = sentences.str.replace(r'[^\w\s]', '', regex=True)

    # fragments left behind once apostrophes are stripped (e.g. "'s", "n't")
    words_to_remove = ('s', 'nt', 'nd')
    sentences = _filter_tokens(sentences, lambda tok: tok not in words_to_remove)

    # keep only words that have a pre-trained embedding
    sentences = _filter_tokens(sentences, lambda tok: tok in embeddings_dict)

    # remove rare words: the 10 entries at the tail of the frequency table
    rare = set(pd.Series(' '.join(sentences).split()).value_counts().tail(10).index)
    sentences = _filter_tokens(sentences, lambda tok: tok not in rare)

    # Strip digits. This can turn a known token into an unknown one, so the vocabulary
    # filter is applied once more; otherwise the later embedding lookup could raise KeyError.
    sentences = sentences.str.replace(r'\d+', '', regex=True)
    sentences = _filter_tokens(sentences, lambda tok: tok in embeddings_dict)

    dataset['replaced_sentence'] = sentences
    return dataset.drop(columns=['original', 'edit', 'grades'])  # drop unnecessary cols

In [5]:
# Run the preprocessing on the training CSV and preview the first rows.
processed_data = data_preprocessing("train.csv")
processed_data.head()

Unnamed: 0,id,meanGrade,replaced_sentence
0,14530,0.2,france hunting citizens joined twins without t...
1,13034,1.6,pentagon claims increase russian trolls bowli...
2,8731,1.0,iceland pm calls snap vote pedophile furor cra...
3,76,0.4,apparent first iran israel slap militarily
5,8832,1.2,sounds trump made speech congress one chart


In [6]:
# Convert the pandas frame to a Spark DataFrame and expose it as an RDD of Rows.
# NOTE(review): `spark` is not created in the visible cells; presumably the kernel
# provides a SparkSession under that name. Confirm for fresh-kernel runs.
rdd_data = spark.createDataFrame(processed_data)\
                .rdd

## Calculating the Sentence Embeddings

In [7]:
# Pair each row's id with the list of words in its cleaned sentence: (id, [word, ...]).
id_with_sentence = rdd_data.map(lambda input: (input.id, input.replaced_sentence.split()))
#id_with_sentence.collect()

`tokenize` maps each word of a sentence to its 50-d word vector and divides that vector by the number of words in the sentence. Summing these scaled vectors per sentence id (done below with `reduceByKey`) then gives the average word vector, which is used as the sentence embedding.

In [8]:
def tokenize(input):
    """Expand one (id, words) pair into per-word, length-scaled embeddings.

    Parameters
    ----------
    input : sequence
        `input[0]` is the sentence id and `input[1]` the list of words. The name
        shadows the built-in `input`; it is kept so the signature stays unchanged.

    Returns
    -------
    list of (id, list of float)
        One entry per word: the word's vector from the module-level
        `embeddings_dict`, with every component divided by the number of words in
        the sentence. Summing the entries of one id therefore yields the mean word
        vector. An empty word list yields an empty result.

    Raises
    ------
    KeyError
        If a word has no entry in `embeddings_dict`.
    """
    sentence_id, words = input[0], input[1]
    length = len(words)  # number of words in the sentence, computed once
    return [
        # Components are scaled one at a time and converted to plain Python floats.
        (sentence_id, [float(component / length) for component in embeddings_dict[word]])
        for word in words
    ]

# flatMap flattens tokenize's per-sentence lists into a single RDD of
# (id, scaled word vector) pairs.
id_with_word = id_with_sentence.flatMap(tokenize)

In [9]:
#id_with_word.collect()

In [10]:
from functools import reduce

def myReducer(x,y):
    """Return the element-wise sum of two equal-length numeric sequences.

    Used as the combine function for `reduceByKey`, so it builds a new list and
    leaves both arguments unmodified.

    Parameters
    ----------
    x, y : sequence of numbers
        Vectors of the same length.

    Returns
    -------
    list
        `[x[0] + y[0], x[1] + y[1], ...]`.

    Raises
    ------
    ValueError
        If the lengths differ. Adding vectors of different dimensionality would
        otherwise silently truncate the result or fail with an opaque IndexError.
    """
    if len(x) != len(y):
        raise ValueError(
            "cannot add vectors of different lengths: %d vs %d" % (len(x), len(y))
        )
    return [left + right for left, right in zip(x, y)]

In [11]:
# Sum the scaled word vectors that share an id. Each vector was already divided by the
# sentence length in tokenize, so the sum is the mean word vector of that sentence.
id_groups = id_with_word.reduceByKey(myReducer)

In [12]:
# id_groups is the final product: one (id, sentence embedding) pair per headline
#id_groups.collect()

## Getting the Ratings Mapping

In [13]:
# Pair each row's id with its meanGrade value: (id, meanGrade).
id_with_grade = rdd_data.map(lambda input: (input.id, input.meanGrade))
#id_with_grade.collect()

## Prepare for Machine Learning

In [14]:
from pyspark.sql import Row

# Build a two-column DataFrame (identif, grade) from the (id, meanGrade) pairs.
grades = id_with_grade.map(lambda x: Row(identif=x[0], grade=x[1]))
# Use the same `spark` session entry point as the earlier createDataFrame call rather
# than the legacy `sqlContext` wrapper.
schemaGrades = spark.createDataFrame(grades)
schemaGrades.show(5)

+-----+-------+
|grade|identif|
+-----+-------+
|  0.2|  14530|
|  1.6|  13034|
|  1.0|   8731|
|  0.4|     76|
|  1.2|   8832|
+-----+-------+
only showing top 5 rows



In [15]:
# Build a two-column DataFrame (identif, embed) from the (id, sentence embedding) pairs.
sentence_embeddings = id_groups.map(lambda x: Row(identif=x[0], embed=x[1]))
# Use the same `spark` session entry point as the earlier createDataFrame call rather
# than the legacy `sqlContext` wrapper.
schemaEmbed = spark.createDataFrame(sentence_embeddings)
schemaEmbed.show(5)


+--------------------+-------+
|               embed|identif|
+--------------------+-------+
|[0.03392983662585...|     76|
|[0.44911583544065...|     68|
|[-0.1882088308533...|   4136|
|[-0.5004666546980...|   7080|
|[0.31943100318312...|  10224|
+--------------------+-------+
only showing top 5 rows



In [16]:
# Inner join: a sentence whose words were all filtered out has no row in schemaEmbed.
# A left join would keep it with a null `embed`, which cannot be converted to a dense
# vector later on, so only ids that have both a grade and an embedding are kept.
df = schemaGrades.join(schemaEmbed, on='identif', how='inner').select(['grade', 'embed'])

df.show(5)

+-----+--------------------+
|grade|               embed|
+-----+--------------------+
|  1.2|[-0.0015900052256...|
|  0.8|[-0.0339350104331...|
|  1.2|[-0.2413649982255...|
|  2.2|[0.05566571706107...|
|  1.2|[-0.0196700021624...|
+-----+--------------------+
only showing top 5 rows



## Convert Embeddings to Dense Vectors and Split Train/Test

In [20]:
## see https://stackoverflow.com/questions/39025707/how-to-convert-arraytype-to-densevector-in-pyspark-dataframe
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Kept for any cell that still refers to them; both are lazy, so defining them costs nothing.
grade_rdd = df.rdd.map(lambda row:row[0])
embed_rdd = df.rdd.map(lambda row:row[1])

# Convert each row in a single pass so a grade is always paired with the embedding from
# the same row. Zipping grade_rdd with embed_rdd evaluates the join lineage twice and
# relies on both evaluations producing rows in the same order, which a shuffle does not
# guarantee.
new_df = df.rdd.map(lambda row: (row[0], Vectors.dense(row[1]))).toDF(schema=['grade','embed'])

new_df.show(5)


+-----+--------------------+
|grade|               embed|
+-----+--------------------+
|  1.2|[-0.0015900052256...|
|  0.8|[-0.0339350104331...|
|  1.2|[-0.2413649982255...|
|  2.2|[0.05566571706107...|
|  1.2|[-0.0196700021624...|
+-----+--------------------+
only showing top 5 rows



In [21]:
# 70/30 train/test split. The seed is fixed so the split does not change from run to run.
# NOTE(review): the result is only repeatable if new_df is produced in the same row order
# each time; consider caching new_df before splitting.
splits = new_df.randomSplit([0.7, 0.3], seed=42)
train_df, test_df = splits

In [22]:
# Preview the first rows of the training partition.
train_df.show(5)

+-----+--------------------+
|grade|               embed|
+-----+--------------------+
|  0.2|[0.00719059556722...|
|  0.2|[0.39878216525539...|
|  0.4|[-0.1359110943973...|
|  0.4|[-0.1213597480673...|
|  0.4|[0.05640549398958...|
+-----+--------------------+
only showing top 5 rows

