In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# read data from tables
apis_df = pd.read_csv("../datasets/apis.csv", usecols = [0, 1, 2, 3])
mashups_df = pd.read_csv("../datasets/mashups.csv", usecols = [0, 1, 2, 3, 4])

# create text filter list of English stopwords and special characters
text_filter = list(stopwords.words("english"))
special_characters = [",", "/", "-", ".", ";"]
for char in special_characters:
  text_filter.append(char)

# initialize text stemmer
porter = PorterStemmer()

# tokenize, apply filter to, and stem text
def text_filter_function(text):
  result = ""
  tokens = word_tokenize(str(text))
  for token in tokens:
    if token not in text_filter:
      result += porter.stem(token.lower())
      result += " "
  return result

# process the description column to create new bag of words column
apis_df["description_words"] = apis_df["description"].apply(text_filter_function)
# mashups_df["description_words"] = mashups_df["description"].apply(text_filter_function)

In [2]:
tf_idf = TfidfVectorizer(analyzer = str.split)
api_description_matrix = tf_idf.fit_transform(apis_df["description_words"])

In [10]:
# recommend the top k related APIs to a given API
def recommend_apis(api_id, k = 10):
  recommendations = []
  top_k_recommendations = []

  # for all API entries
  for i in range(api_description_matrix.shape[0]):
    # if it's not the same as selected API
    if i != api_id:
      # calculate the cosine similarity between all other APIs and selected API
      cos_sim_score_i = cosine_similarity(api_description_matrix[api_id], api_description_matrix[i])[0][0]
      # append index and related cosine similarity score
      top_k_recommendations.append((i, cos_sim_score_i))

  # sort the array descending using cosine similarity score, and pick the first k elements
  top_k_recommendations = sorted(top_k_recommendations, reverse = True, key = lambda x: x[1])[:k]

  for i in top_k_recommendations:
    # retrieve API's name from apis_df
    api_name = apis_df.iloc[i[0]]["api"]
    cos_sim_score = i[1]
    # append a tuple of API name and its cosine similarity score
    recommendations.append((api_name, cos_sim_score))

  api_name = apis_df.iloc[api_id]["api"]
  print("The top " + str(k) + " recommended APIs and their cosine similarity score for " + api_name + " is:\n")

  return recommendations

# test recommendation
recommend_apis(3)

The top 10 recommended APIs and their cosine similarity score for API: Microsoft Bing Maps API is:



[('Bing Static Maps API', 0.5878079074616477),
 ('Bing Maps Location Data API', 0.4769642664039478),
 ('Bing Maps API', 0.4344878866859619),
 ('South African YellowPages Maps API', 0.42634012841653235),
 ('TomTom Maps API', 0.420441990775066),
 ('NAC Real-time Mapping API', 0.4021403497873083),
 ('Map Data Services QuickMap API', 0.3924264814192068),
 ('Google Maps for Work API', 0.3890421173426575),
 ('Google Maps Flash API', 0.3887042650391142),
 ('Ericsson Web Maps API', 0.3841654058048892)]