<a href="https://colab.research.google.com/github/karinboc/Wiki_Similarity/blob/main/KNN_wiki_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wikipedia

Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-cp37-none-any.whl size=11686 sha256=a84f07e5171bc9dfccbe51ec3f6b4fc619b1882a63abf371523d30827fd57dcd
  Stored in directory: /root/.cache/pip/wheels/87/2a/18/4e471fd96d12114d16fe4a446d00c3b38fb9efcb744bd31f4a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [None]:
import sys
import wikipedia
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize

#------------------------
# Wikipedia API Client
class WikiAPI():
    """Small client around the ``wikipedia`` package.

    It turns an input article into a keyword query and downloads the
    articles returned for that query as a partial Wikipedia corpus.

    Attributes:
        corpus: texts of the articles fetched by the last ``create_corpus``.
        titles: titles of those articles, in the same order as ``corpus``.
    """

    def __init__(self):
        self.corpus = []
        self.titles = []

    @staticmethod
    def _feature_names(vectorizer):
        """Return the fitted vocabulary of *vectorizer* as a list.

        Newer scikit-learn releases expose ``get_feature_names_out`` and no
        longer provide ``get_feature_names``; older releases only have the
        latter, so both are supported here.
        """
        if hasattr(vectorizer, "get_feature_names_out"):
            return list(vectorizer.get_feature_names_out())
        return list(vectorizer.get_feature_names())

    @staticmethod
    def _fetch_page(title):
        """Load the page called *title*, taking the first option if ambiguous."""
        try:
            return wikipedia.WikipediaPage(title)
        except wikipedia.exceptions.DisambiguationError as e:
            return wikipedia.WikipediaPage(e.options[0])

    def get_tokenize(self, text):
        """Split *text* into ``\\w+`` tokens and drop English stop words.

        Punctuation is removed by the tokenizer. Stop-word matching is an
        exact string comparison; ``text_to_query`` lower-cases the text
        before calling this method.

        Returns:
            The remaining tokens, in their original order.
        """
        stop_words = set(stopwords.words('english'))
        tokenizer = RegexpTokenizer(r'\w+')
        return [token for token in tokenizer.tokenize(text)
                if token not in stop_words]

    def get_article_from_keyword(self, keyword):
        """Debug helper: fetch the top search hit for *keyword* as a test article.

        Returns:
            A one-element list holding the page content.

        Raises:
            IndexError: if the search returns no results.
        """
        hits = wikipedia.search(keyword, results=1, suggestion=False)
        if not hits:
            # Same exception type as indexing the empty result list, but
            # with a message that names the cause.
            raise IndexError(
                "no Wikipedia search results for {!r}".format(keyword))
        return [self._fetch_page(hits[0]).content]

    def text_to_query(self, article, max_query_len=300):
        """Build a search query from the highest-weighted words of *article*.

        Each text in *article* is lower-cased and stripped of stop words and
        punctuation, then TF-IDF weights are summed per word over all texts.
        Words are taken in descending weight order while they still fit.

        Args:
            article: iterable of text strings (documents).
            max_query_len: maximum length of the query; the default of 300
                reflects the Wikipedia query-length limitation.

        Returns:
            The selected words, each followed by a single space.
        """
        docs = [" ".join(self.get_tokenize(txt.lower())) for txt in article]
        vectorizer = TfidfVectorizer()
        weights = pd.DataFrame(
            vectorizer.fit_transform(docs).toarray(),
            columns=self._feature_names(vectorizer),
        )
        # Sum over documents (axis=0) to get one weight per word.
        top_words = weights.sum(axis=0).sort_values(ascending=False)

        chosen = []
        query_len = 0
        for word in top_words.index:
            # A word that does not fit is skipped; shorter, lower-ranked
            # words may still be added afterwards.
            if query_len + len(word) < max_query_len:
                chosen.append(word)
                query_len += len(word) + 1  # +1 for the space char
        return "".join(word + " " for word in chosen)

    def create_corpus(self, query_words):
        """Create a partial wiki corpus from the search hits for *query_words*.

        Up to 10 search results are requested. Each page is downloaded once
        and its content and title are stored together, so ``titles`` and
        ``corpus`` always stay aligned.

        Returns:
            ``(titles, corpus)`` - the same lists stored on the instance.
        """
        search_results = wikipedia.search(
            query_words, results=10, suggestion=False)
        self.corpus = []
        self.titles = []
        for result in search_results:
            page = self._fetch_page(result)
            content = page.content
            title = page.title
            self.corpus.append(content)
            self.titles.append(title)

        return self.titles, self.corpus



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#------------------------
# Similar Article Finder
# (tf-idf public python packages)

class SimilarArticleFinder():
    """Rank a fixed corpus of articles against a query article using TF-IDF.

    Attributes:
        titles: article titles, in the same order as ``corpus``.
        corpus: raw article texts.
        vectorizer: ``TfidfVectorizer`` fitted on ``corpus``.
        corpus_cols: vocabulary of the corpus (list of words).
        matrix: DataFrame of TF-IDF weights, one row per article and one
            column per word in ``corpus_cols``.
    """

    def __init__(self, titles, corpus):
        self.titles = titles
        self.corpus = corpus

        # Corpus: chosen Wiki articles (according to the input article),
        # converted to a matrix of TF-IDF features.
        self.vectorizer = TfidfVectorizer()
        weights = self.vectorizer.fit_transform(self.corpus).toarray()
        self.corpus_cols = self._feature_names(self.vectorizer)
        self.matrix = pd.DataFrame(weights, columns=self.corpus_cols)

    @staticmethod
    def _feature_names(vectorizer):
        """Return the fitted vocabulary of *vectorizer* as a list.

        Newer scikit-learn releases expose ``get_feature_names_out`` and no
        longer provide ``get_feature_names``; older releases only have the
        latter, so both are supported here.
        """
        if hasattr(vectorizer, "get_feature_names_out"):
            return list(vectorizer.get_feature_names_out())
        return list(vectorizer.get_feature_names())

    def print_corpus(self):
        """Print the raw corpus texts."""
        print(self.corpus)

    def top_k_similarities(self, query, k=10):
        """Return the *k* corpus articles most similar to the query.

        The score of an article is the dot product between its TF-IDF
        weights and the query's TF-IDF weights, restricted to the words the
        two vocabularies share. Query words that are not in the corpus are
        ignored.

        Args:
            query: list of texts; only the first text is used for scoring.
            k: number of top matches to return.

        Returns:
            ``(scores, titles)`` where ``scores`` is a Series of the top-k
            scores in descending order (indexed by corpus position) and
            ``titles`` lists the matching article titles in that same order.
        """
        # The query gets its own vectorizer so that the corpus-fitted
        # ``self.vectorizer`` is not overwritten by every call.
        query_vectorizer = TfidfVectorizer()
        query_weights = query_vectorizer.fit_transform(query).toarray()
        query_cols = self._feature_names(query_vectorizer)
        query_matrix = pd.DataFrame(query_weights, columns=query_cols)

        # Subset of words common to the query and the corpus; sorted so the
        # column order is the same on both sides and across runs.
        shared = sorted(set(self.corpus_cols) & set(query_cols))
        sub_corpus = self.matrix[shared]
        query_subset = query_matrix[shared].values[0]

        # Dot product of every corpus row with the query vector.
        rank = sub_corpus.dot(query_subset)
        best_match_index = rank.sort_values(ascending=False).iloc[:k]

        # The Series index holds the corpus row positions, which are also
        # the positions in ``self.titles``; look titles up through it so
        # they line up with the ranked scores.
        ranked_titles = [self.titles[i] for i in best_match_index.index]
        return best_match_index, ranked_titles


In [None]:
# Fetch a sample article by keyword to use as the input (query) article.
wiki_api = WikiAPI()
article = wiki_api.get_article_from_keyword("Artlist")
# Bare expression: the notebook displays the value (a one-element list
# holding the page content).
article

['Artlist Collection: The Dog and Friends (better known as The Dog and Friends or simply The Dog) is a franchise created by Artlist in Japan in 2000. Initially known as Artlist Collection: The Dog, the franchise began as collection of calendars and postcards of dogs photographed with a fisheye lens. Afterwards, the franchise became so popular that new animals such as cats, pigs, rabbits, ducks, hamsters, and birds were added to the collection. In 2003, 4Kids Entertainment bought the rights for this franchise outside of Asia. In 2004, to celebrate 25 years of the McDonald\'s Happy Meal, toys of this franchise were sold as Happy Meals from April 2 to April 29. Select locations featured dog adoption events. They were sold once again the next year, but with "The Cat" plushes. In 2007, the series introduced its first video game, The Dog Island. In 2009, the series also released a pet simulation game on mobile phones and smart phones. In 2016, they were featured as themes for the Japanese Ni

In [None]:
# Preview the search query built from the article's top TF-IDF words
# (default limit: 300 characters).
wiki_api.text_to_query(article)

'dog franchise collection artlist happy april featured known game phones friends sold series 3ds entertainment created 2009 dogs ducks 2007 events cats external 2004 first fisheye 2003 celebrate cat 4kids calendars bought birds better began 2016 25 asia 29 animals also afterwards adoption added year '

In [None]:
  # Build the comparison corpus: start with a query of up to 300 characters
  # and allow 10 characters fewer on every retry, until the search yields at
  # least 10 articles or the length budget reaches zero.
  max_query_len = 300
  titles = []   
  corpus = []   
  input_article = article
  while(len(corpus) < 10 and max_query_len > 0):
    query_words = wiki_api.text_to_query(input_article, max_query_len)
    titles, corpus = wiki_api.create_corpus(query_words)
    max_query_len -= 10

# Bare expression: display the collected article texts.
corpus



  lis = BeautifulSoup(html).find_all('li')


['Artlist Collection: The Dog and Friends (better known as The Dog and Friends or simply The Dog) is a franchise created by Artlist in Japan in 2000. Initially known as Artlist Collection: The Dog, the franchise began as collection of calendars and postcards of dogs photographed with a fisheye lens. Afterwards, the franchise became so popular that new animals such as cats, pigs, rabbits, ducks, hamsters, and birds were added to the collection. In 2003, 4Kids Entertainment bought the rights for this franchise outside of Asia. In 2004, to celebrate 25 years of the McDonald\'s Happy Meal, toys of this franchise were sold as Happy Meals from April 2 to April 29. Select locations featured dog adoption events. They were sold once again the next year, but with "The Cat" plushes. In 2007, the series introduced its first video game, The Dog Island. In 2009, the series also released a pet simulation game on mobile phones and smart phones. In 2016, they were featured as themes for the Japanese Ni

In [None]:
# Display the text of the first article in the corpus.
corpus[0]

'Artlist Collection: The Dog and Friends (better known as The Dog and Friends or simply The Dog) is a franchise created by Artlist in Japan in 2000. Initially known as Artlist Collection: The Dog, the franchise began as collection of calendars and postcards of dogs photographed with a fisheye lens. Afterwards, the franchise became so popular that new animals such as cats, pigs, rabbits, ducks, hamsters, and birds were added to the collection. In 2003, 4Kids Entertainment bought the rights for this franchise outside of Asia. In 2004, to celebrate 25 years of the McDonald\'s Happy Meal, toys of this franchise were sold as Happy Meals from April 2 to April 29. Select locations featured dog adoption events. They were sold once again the next year, but with "The Cat" plushes. In 2007, the series introduced its first video game, The Dog Island. In 2009, the series also released a pet simulation game on mobile phones and smart phones. In 2016, they were featured as themes for the Japanese Nin

In [None]:
 # Score the corpus articles against the input article and print the
 # (scores, titles) pair returned for the top 10.
 saf = SimilarArticleFinder(titles,corpus)
 print(saf.top_k_similarities(input_article,10))

(0    0.949497
9    0.716999
6    0.681358
4    0.673584
2    0.663590
1    0.647763
8    0.644434
7    0.602841
3    0.290886
5    0.110309
dtype: float64, ['Artlist Collection: The Dog and Friends', 'Jak and Daxter', 'Jak and Daxter Collection', 'List of best-selling video game franchises', 'Naughty Dog', 'List of video game franchises', 'Uncharted: The Nathan Drake Collection', 'God of War (franchise)', 'Crash Bandicoot', 'Uncharted'])


In [None]:
### In case we wish to run this notebook as a standalone .py file

In [None]:
  #Output: 10 similar articles from our partial wiki corpus
def run(path):
    """Print the 10 corpus articles most similar to the article at *path*.

    The file is read as one document, a partial Wikipedia corpus is built
    from its top TF-IDF words, and the corpus is ranked against it.

    Args:
        path: path to a text file containing the input article.
    """
    wiki_api = WikiAPI()
    max_query_len = 300
    titles = []
    corpus = []

    # Read the whole file as a single document: the similarity ranking only
    # scores the first document of the query, so a list of lines would rank
    # against line 1 alone. The file is closed before any network call.
    with open(path) as f:
        input_article = [f.read()]

    # Shorten the query by 10 characters per attempt until the search
    # yields at least 10 articles or the length budget is used up.
    while len(corpus) < 10 and max_query_len > 0:
        query_words = wiki_api.text_to_query(input_article, max_query_len)
        # create_corpus returns (titles, corpus); both are needed below.
        titles, corpus = wiki_api.create_corpus(query_words)
        max_query_len -= 10

    if not corpus:
        # Nothing to rank; fitting TF-IDF on an empty corpus would fail.
        print('no matching Wikipedia articles found')
        return

    saf = SimilarArticleFinder(titles, corpus)
    print(saf.top_k_similarities(input_article, 10))

if __name__ == '__main__':
    # Script entry point: expects exactly one argument, the path to the
    # input article file.
    if len(sys.argv) != 2:
        print('please enter path to file')
        # Exit with a non-zero status so callers can detect the usage error.
        sys.exit(1)

    path = sys.argv[1]
    print(path)
    run(path)