In [1]:
import pandas as pd
import numpy  as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lindachen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:

#file = open('../Resources/' + filename, 'r', encoding="ISO-8859-1")
df = pd.read_csv("data/EA_CSV.csv", engine="python", encoding="UTF-8")

In [7]:
#turn the first row into column
def get_data(df):
    #combine the description
    df['description'] = df['Expertise'].fillna('') + " " + df['Title'].fillna('')
    return df
df = get_data(df)

In [43]:
def clean_text(s):
    #removeNonAscii
    s = "".join(i for i in s if ord(i)<128)

    #return all lower cases
    s = s.lower()

    #remove stop wrods
    s = s.split()
    stops = set(stopwords.words("english"))
    text = [w for w in s if not w in stops]
    text = " ".join(text)

    #remove html
    html_pattern = re.compile('<.*?>')
    text = html_pattern.sub(r'', text)

    #remove punctuation
    text = re.sub(r'[^\w\s]'," ",text)
    return text



In [64]:
def glove_model(file_path='glove.twitter.27B.200d.txt'):
  """
  input: the access to the file which contains the pre-trained glove model
  ouput: a trained model using glove
  """

  embeddings_index = {}
  f = open(file_path, encoding='utf-8')
  for line in f:
      values = line.split()
      word = values[0]
      coefs = np.asarray(values[1:], dtype='float32')
      embeddings_index[word] = coefs
  f.close()
  return embeddings_index

In [65]:
def glove2vec(text, model):
    """
    input: 
    - text: the string that you wanted to turn into vectors
    - model: the glove pre-trained model that you wanted to use

    output:
    an average vector of your input text using the glove model

    """
    
    # Creating a list for storing the vectors (description into vectors)
    global word_embeddings
    word_embeddings = []

    vectors = {}

    avgword2vec = None
    count = 0

    for word in text.split():
        if word in model:
            count += 1
            if avgword2vec is None:
                avgword2vec = model[word]
            else:
                avgword2vec = avgword2vec + model[word]
            
    if avgword2vec is not None:
        avgword2vec = avgword2vec / count
    
    word_embeddings.append(avgword2vec)

    return word_embeddings

In [85]:
def recommendations_glove(input, df=df, col='description'):
    """
    input: 
    - input: key words relating to the article
    - df: the dataframe that we are using
    - col: the column name that contains description of EAs
    - file path: the file path to the pre-trained glove model

    output:
    a list of EAs' names ranked by the most recommended / most suitable EA
    to the least
    """

    #get the model (it is too slow to compute the model every time)
    model = glove_model(file_path='glove.twitter.27B.200d.txt')

    # vecterizd the input
    vector_input = glove2vec(input, model)

    # vecterized all the descriptions of EAs
    df[col] = df[col].apply(clean_text)
    EAs_vectors = [glove2vec(EA, model) for EA in df[col]]

    #drop an EA if that EA has "None" vector
    df['vectors'] = EAs_vectors
    for i, n in enumerate(df['vectors']):
      if n[0] is None:
        df = df.drop([i+1], axis=0)


    # finding cosine similarity for the vectors
    similarity = []
    for n in df['vectors']:
      scores = cosine_similarity(vector_input, n)[0][0]
      similarity.append(scores)
    
    df['similarity'] = similarity


    # sort and find the recommended movie
    df = df.sort_values(by=['similarity'], ascending=False)
    #res_df = df.iloc[:m]


    return df['Names']

In [None]:
def recomm_engine(keywords, vec_df, model):
    unpickled_model = pd.read_pickle("models/glove_model.pkl")
    
    # vectorize the input
    input_vector = glove2vec(keywords, model)

    # finding cosine similarity for the vectors
    similarity = []
    for n in vec_df['vectors']:
        scores = cosine_similarity(input_vector, n)[0][0]
        similarity.append(scores)

    vec_df['similarity'] = similarity

    # sort and find the recommended movie
    vec_df = vec_df.sort_values(by=['similarity'], ascending=False)

    return vec_df.loc[::, :'description']

In [88]:
input = 'data collection, data ethics issues/initial assumptions'
res = recommendations_glove(input=input, df=df, col='description')

print(res)

26            Carlos Ortega
23        Sergio Betancourt
27           Julia Nikulski
20             Rohan Joseph
10               David Chen
4           Gitansh Khirbat
17              Mael Fabien
15             Hessie Jones
5           Abdullah Farouk
12          Francisco Nunes
8            Arman Didandeh
16             Lester Leong
19      Pier Paolo Ippolito
2               Sophie Mann
25        François St-Amant
1            Lowri Williams
9             Carlos Mougan
21            John Jagtiani
13         Sara A. Metwalli
31               Agni Kumar
22                 Rui Geng
7          Anton Muehlemann
14         Gerasimos Plegas
32                Dean Deng
0                Amber Teng
3                   Jingles
24             Sohaib Ahmad
11    Dimitris Panagopoulos
29          Purvanshi Mehta
6     Andrew DeCotiis-Mauro
30         Ashwin Hariharan
18            Manish Sharma
28         Marin Vlastelica
Name: Names, dtype: object
