# Lacrosse Page Similarity on Wikipedia



### Import packages

In [49]:
import wikipedia as wk
import os
import numpy as np 
import pandas as pd
from sklearn.manifold import MDS as mds
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial import distance_matrix
from bokeh.io import output_file, show
from bokeh.plotting import figure, save
from bokeh.models import HoverTool, TapTool, OpenURL, Label

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ocamp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Get a list of pages matching a search for 'lacrosse'

In [2]:
# Search configuration: the term sent to Wikipedia and the maximum number of
# page titles to request.
search_term = 'lacrosse'
results_limit = 100

# wk.search returns page titles; the pages themselves are fetched in a later cell.
page_list = wk.search(search_term, results=results_limit)

# Report the term that was actually searched rather than a hard-coded word, so
# the message stays accurate if search_term is changed.
print('Pages returned for ' + search_term + ':' + str(len(page_list)) + '\n')
for title in page_list:
    print(title)

Pages returned for lacrosse:100

Lacrosse
Lacrosse (disambiguation)
Box lacrosse
Buick LaCrosse
College lacrosse
List of NCAA Division II lacrosse programs
Lacrosse ball
Field lacrosse
Murder of Yeardley Love
List of NCAA Division I lacrosse programs
Crystal Mangum
Lacrosse (satellite)
National Lacrosse League
Women's lacrosse
NCAA Division I Women's Lacrosse Championship
Duke lacrosse case
NCAA Division I Men's Lacrosse Championship
Major League Lacrosse
History of lacrosse
Lacrosse stick
San Diego Seals
Myles Jones
Jordan Levine
Johns Hopkins Blue Jays men's lacrosse
Johns Hopkins Blue Jays
Steven Brooks (lacrosse)
Lacrosse in Canada
Paul Rabil
List of Major League Lacrosse awards
Joey Cupido
UMass Minutemen lacrosse
Miles Thompson
Iroquois men's national lacrosse team
NCAA Men's Lacrosse Championship
Professional Lacrosse League
Lacrosse National Hall of Fame and Museum
Dillon Ward
IMG Academy
Maverik Lacrosse
David Morrow (sports)
Blake Miller (lacrosse)
Mark Matthews (lacrosse)
Ra

### Return content and urls

In [70]:
# Parallel lists: entry k of each list describes the same successfully fetched page.
content = []
url = []
returned_pages = []

for title in page_list:
    try:
        # Fetch the page once and read both attributes inside the try, so a
        # failure on either one leaves all three lists untouched (the lists
        # must stay the same length because they are later combined by position).
        page = wk.page(title)
        page_content = page.content
        page_url = page.url
    except Exception:
        # Best-effort crawl: report the title and move on. `Exception` (rather
        # than a bare except) still lets KeyboardInterrupt stop the cell.
        print('Failure on \'' + title + '\'')
        continue

    content.append(page_content)
    url.append(page_url)
    returned_pages.append(title)

print('\n' + 'Content size: ' + str(len(content)) + '\n')
print('URL size: ' + str(len(url)) + '\n')



  lis = BeautifulSoup(html).find_all('li')


Failure on 'Lacrosse (disambiguation)'
Failure on 'Kevin Huntley (lacrosse)'

Content size: 98

URL size: 98



### Process text 


In [4]:
# Text normalisation tools: English stop-word set, a tokenizer that keeps runs
# of word characters, and a Porter stemmer.
stop_words = set(stopwords.words('english'))
rt = RegexpTokenizer(r'\w+')
ps = PorterStemmer()

# For every page: lowercase, tokenize, drop stop words and any token that is
# not purely alphabetic, then stem what is left.
cleaned_content = [
    [
        ps.stem(w)
        for w in rt.tokenize(text.lower())
        if w not in stop_words and w.isalpha()
    ]
    for text in content
]

# CountVectorizer expects one string per document, so re-join the stems.
content2 = [' '.join(tokens) for tokens in cleaned_content]

vec = CountVectorizer()
content_vec = vec.fit_transform(content2)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on versions that predate it.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# One row per page, one column per stem, values are term counts.
df = pd.DataFrame(content_vec.toarray(), columns=feature_names)


### Create Distance Matrix and Scale to 2D

In [6]:
# MDS is initialised randomly; pin the seed so the 2-D layout is the same on
# every Restart & Run All.
m = mds(n_components=2, dissimilarity='precomputed', random_state=0)

# Pairwise distances between the term-count rows of every page.
dist_mat = distance_matrix(df, df)

# Embed the precomputed distances into two dimensions.
fnl = m.fit_transform(dist_mat)

# Plot table: 2-D coordinates plus the title and URL of each fetched page.
fnl_df = pd.DataFrame(fnl, columns=['X', 'Y'])
fnl_df['Page'] = returned_pages
fnl_df['Url'] = url



### Get additional information for viz

In [22]:
# Largest marker size (in screen units) used for the longest article.
MAX_MARKER_SIZE = 50

# Raw character count of every fetched page.
len_list = [len(text) for text in content]
min_len = min(len_list, default=0)
max_len = max(len_list, default=0)

# Named `len_span` rather than `range` so the builtin range() is not shadowed
# for every later cell in the notebook.
len_span = max_len - min_len

if len_span:
    # Min-max scale the lengths onto [0, MAX_MARKER_SIZE].
    norm_len = [((n - min_len) / len_span) * MAX_MARKER_SIZE for n in len_list]
else:
    # All pages have the same length (or there is only one page): min-max
    # scaling would divide by zero, so give every marker a mid-sized dot.
    norm_len = [MAX_MARKER_SIZE / 2 for _ in len_list]

# Appending the columns yields X, Y, Page, Url, n_Article_Length, Article_Length.
fnl_df['n_Article_Length'] = norm_len
fnl_df['Article_Length'] = len_list

### Visualize with bokeh

In [52]:
viz_title = 'Wikipedia articles for \'' + search_term + '\' Hover to see Page. Click to go to the Wikipedia Page.'

# Standalone HTML file the plot is written to.
output_file('Lacrosse.html')

# `width`/`height` replace `plot_width`/`plot_height`, which were removed in
# Bokeh 3.0.
plot = figure(width=1000, height=1000, tools='tap', title=viz_title, toolbar_location='below')

# One dot per page, positioned by the MDS coordinates and sized by the
# normalised article length.
plot.circle(x='X', y='Y', color='black', size='n_Article_Length', source=fnl_df)

# Hover shows the page title, URL and raw article length.
plot.add_tools(HoverTool(
    tooltips=[
        ('Page: ', '@Page'),
        ('URL: ', '@Url'),
        ('Article Length: ', '@Article_Length')
    ]
))

# Clicking a dot opens that row's Url value.
w_url = "@Url"
taptool = plot.select(type=TapTool)
taptool.callback = OpenURL(url=w_url)

show(plot)
save(plot)

'D:\\Projects\\Python\\Wikipedia_Lacrosse_Pages\\Lacrosse.html'

### Re-process text for only the beginning of the page

In [71]:
# Number of leading characters of the cleaned text kept for each page.
MAX_CHARS = 2000

# Text normalisation tools: English stop-word set, a tokenizer that keeps runs
# of word characters, and a Porter stemmer.
stop_words = set(stopwords.words('english'))
rt = RegexpTokenizer(r'\w+')
ps = PorterStemmer()

# For every page: lowercase, tokenize, drop stop words and any token that is
# not purely alphabetic, then stem what is left.
cleaned_content = [
    [
        ps.stem(w)
        for w in rt.tokenize(text.lower())
        if w not in stop_words and w.isalpha()
    ]
    for text in content
]

# Keep only the beginning of each page so that long articles do not dominate
# the term counts.
content2 = []
for tokens in cleaned_content:
    joined = ' '.join(tokens)
    if len(joined) > MAX_CHARS:
        # A plain slice can cut the last stem in half, which would add a
        # made-up word to the vocabulary; drop that partial stem instead.
        cut_mid_token = joined[MAX_CHARS] != ' '
        joined = joined[:MAX_CHARS]
        if cut_mid_token and ' ' in joined:
            joined = joined.rsplit(' ', 1)[0]
    content2.append(joined)

vec = CountVectorizer()
content_vec = vec.fit_transform(content2)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on versions that predate it.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# One row per page, one column per stem, values are term counts.
df = pd.DataFrame(content_vec.toarray(), columns=feature_names)


### Re-process distance and scaling

In [72]:
# MDS is initialised randomly; pin the seed so the 2-D layout is the same on
# every Restart & Run All.
m = mds(n_components=2, dissimilarity='precomputed', random_state=0)

# Pairwise distances between the term-count rows of every page.
dist_mat = distance_matrix(df, df)

# Embed the precomputed distances into two dimensions.
fnl = m.fit_transform(dist_mat)

# Plot table: 2-D coordinates plus the title and URL of each fetched page.
fnl_df = pd.DataFrame(fnl, columns=['X', 'Y'])
fnl_df['Page'] = returned_pages
fnl_df['Url'] = url



### Re-process additional info

In [73]:
# Largest marker size (in screen units) used for the longest article.
MAX_MARKER_SIZE = 50

# Raw character count of every fetched page.
len_list = [len(text) for text in content]
min_len = min(len_list, default=0)
max_len = max(len_list, default=0)

# Named `len_span` rather than `range` so the builtin range() is not shadowed
# for every later cell in the notebook.
len_span = max_len - min_len

if len_span:
    # Min-max scale the lengths onto [0, MAX_MARKER_SIZE].
    norm_len = [((n - min_len) / len_span) * MAX_MARKER_SIZE for n in len_list]
else:
    # All pages have the same length (or there is only one page): min-max
    # scaling would divide by zero, so give every marker a mid-sized dot.
    norm_len = [MAX_MARKER_SIZE / 2 for _ in len_list]

# Appending the columns yields X, Y, Page, Url, n_Article_Length, Article_Length.
fnl_df['n_Article_Length'] = norm_len
fnl_df['Article_Length'] = len_list

### Visualize Again

In [74]:
viz_title = 'Wikipedia articles for \'' + search_term + '\' Hover to see Page. Click to go to the Wikipedia Page.'

# Standalone HTML file the plot is written to.
output_file('Lacrosse.html')

# `width`/`height` replace `plot_width`/`plot_height`, which were removed in
# Bokeh 3.0.
plot = figure(width=1000, height=1000, tools='tap', title=viz_title, toolbar_location='below')

# One dot per page, positioned by the MDS coordinates and sized by the
# normalised article length.
plot.circle(x='X', y='Y', color='black', size='n_Article_Length', source=fnl_df)

# Hover shows the page title, URL and raw article length.
plot.add_tools(HoverTool(
    tooltips=[
        ('Page: ', '@Page'),
        ('URL: ', '@Url'),
        ('Article Length: ', '@Article_Length')
    ]
))

# Clicking a dot opens that row's Url value.
w_url = "@Url"
taptool = plot.select(type=TapTool)
taptool.callback = OpenURL(url=w_url)

# With output_file() set, show() writes the HTML file and opens it, so no
# separate save() call is needed here.
show(plot)

### Results look much better. Page content length was significantly impacting distance metrics. 