In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# read data from tables
apis_df = pd.read_csv('../datasets/apis.csv', usecols = [0, 1, 2, 3, 5])
mashups_df = pd.read_csv('../datasets/mashups.csv', usecols = [0, 1, 2, 3, 4])

# drop rows that contain null values
apis_df.dropna(inplace = True)
mashups_df.dropna(inplace = True)

# correct index numbers
apis_df.reset_index(drop = True, inplace = True)
mashups_df.reset_index(drop = True, inplace = True)

In [2]:
# create text filter list of English stopwords and special characters
text_filter = list(stopwords.words("english"))
special_characters = [',', '/', '-', '.', ';']
for char in special_characters:
  text_filter.append(char)

# initialize text stemmer
porter = PorterStemmer()

# tokenize, apply filter to, and stem text
def text_filter_function(text):
  result = ''
  tokens = word_tokenize(str(text))
  for token in tokens:
    if token not in text_filter:
      result += porter.stem(token.lower())
      result += ' '
  return result

# process the description column to create new bag of words column
apis_df['description_words'] = apis_df['categories'] + ' ' + apis_df['description']
apis_df['description_words'] = apis_df['description_words'].apply(text_filter_function)
mashups_df['description_words'] = mashups_df['tagList'] + ' ' + mashups_df['description']
mashups_df['description_words'] = mashups_df['description_words'].apply(text_filter_function)

In [3]:
query = 'I want some APIs with map, music and social media sharing function'

# add a new row to the dfs
new_mashups_df = mashups_df.append({ 'description_words': text_filter_function(query) }, ignore_index = True)
query_mashup_id = new_mashups_df.shape[0] - 1

new_apis_df = apis_df.append({ 'description_words': text_filter_function(query) }, ignore_index = True)
query_api_id = new_apis_df.shape[0] - 1

In [5]:
mashups_df.tail()

Unnamed: 0,id,mashup,apiList,tagList,description,description_words
5391,7860,Xbox Radar,18233;289;2;11,eCommerce;Games,Xbox Radar helps people find Xbox 360 products...,ecommerc game xbox radar help peopl find xbox ...
5392,7861,America 24 7,4,Mapping;Photos,MSN Virtual Earth used to plot photos from the...,map photo msn virtual earth use plot photo hre...
5393,7862,LinkPut,18201,Search;Wiki,LinkPut is an experiment that attempts to crea...,search wiki linkput experi attempt creat bette...
5394,7867,HotOrNot Google Maps,18305,Mapping;Dating,Find people to meet on HotOrNot.com by browsin...,map date find peopl meet hotornot.com brows go...
5395,7869,flickr graph,1,Photos;Visualizations,Social network visualization using Flickr API.,photo visual social network visual use flickr ...


In [4]:
tf_idf = TfidfVectorizer(analyzer = str.split, max_features = 10000)

mashup_description_matrix = tf_idf.fit_transform(mashups_df['description_words']).toarray()
mashup_cos_sim_matrix = cosine_similarity(mashup_description_matrix, mashup_description_matrix)

api_description_matrix = tf_idf.fit_transform(apis_df['description_words']).toarray()

In [45]:
# return APIs with top scores in the top k related mashups of the query
def api_scores_for_top_k_mashups(mashup_id, k = 15):
  # sort the top related mashups descending using cosine similarity score, and pick the first k elements
  score_series = pd.Series(mashup_cos_sim_matrix[mashup_id]).sort_values(ascending = False).iloc[1:k+1]

  apis_in_top_k_mashups = []
  for i in range(k):
    # retrieve a list of API id's from mashups_df
    api_list = mashups_df.iloc[score_series.index[i]]['apiList'].split(';')
    api_list = [int(api) for api in api_list]
    # append the list of API id's
    apis_in_top_k_mashups.append(api_list)

  # create a list of unrepeated API id's
  all_apis = []
  for i in range(k):
    for api in apis_in_top_k_mashups[i]:
      if api not in all_apis:
        all_apis.append(api)

  # for each API, if it is used in one of the top k related mashups, add its count by 1
  count = {}
  for i in range(len(all_apis)):
    count[all_apis[i]] = 0
    for j in range(k):
      if all_apis[i] in apis_in_top_k_mashups[j]:
        count[all_apis[i]] += 1

  # each element in api_scores will be API id and its ranking score (its count divided by k times 100)
  api_scores = []
  for i in range(len(all_apis)):
    api_scores.append([all_apis[i], count[all_apis[i]]])

  # sort api_scores by ranking score from highest to lowest
  api_scores = sorted(api_scores, reverse = True, key = lambda x: x[1])[:10]

  # return a dictionary of API's name, descrition and url
  recommendations = []
  for i in range(len(api_scores)):
    api = apis_df.loc[apis_df['id'] == api_scores[i][0]].values[0]
    recommendations.append({ 'name': api[1], 'description': api[3], 'url': api[4], 'count': api_scores[i][1] })

  return recommendations

In [40]:
apis_df.loc[apis_df['id'] == 2].values[0][1]

'Amazon Product Advertising API'

In [51]:
# recommend the top k related APIs to the query
def api_scores_for_top_k_apis(api_id, k = 15):
  api_scores = []

  # for all API entries except the last one (which is our query)
  for i in range(apis_df.shape[0] - 1):
    # calculate the cosine similarity between all mashups and the query
    cos_sim_score_i = cosine_similarity([api_description_matrix[api_id]], [api_description_matrix[i]])[0][0]
    # append index and related cosine similarity score
    api_scores.append([i, cos_sim_score_i])

  # sort the array descending using cosine similarity score, and pick the first k elements
  api_scores = sorted(api_scores, reverse = True, key = lambda x: x[1])[:k]

  recommendations = []
  for i in range(len(api_scores)):
    # retrieve API's name from apis_df
    api = apis_df.loc[apis_df['id'] == api_scores[i][0]].values[0]
    # append a tuple of API name and its cosine similarity score
    recommendations.append({ 'name': api[1], 'description': api[3], 'url': api[4], 'count': api_scores[i][1] })

  return recommendations

[{'name': 'OpenStack Identity API',
  'description': 'OpenStack is a provider of various open source project components that facilitate the establishment of cloud services. It has provided standard APIs that are well suited for multiple implementations. The OpenStack Identity API supports the validation of user access credentials. The API supplies the authentication tokens that users must furnish prior to gaining access permissions for OpenStack APIs and services. The API transmits HTTP-formatted requests and responses.',
  'url': 'openstack-identity',
  'count': 0.3872930461889199},
 {'name': 'hon.jp API',
  'description': 'Requires Japanese-enabled PC. This API is provided by Japanese eBook search site HON.JP. Kanji-enabled users can access our proprietary Japanese ebook metadata database, covering titles sold in PC, WAP phones, eBook readers, etc. Upgraded to ver.2.0 on 2007-11-01 and improved on reponsiveness along with new methods such as  sort ,  page  and  mode .',
  'url': 'hon