In [22]:
# Imports
import pandas
import requests
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup as bs
import re
from stop_words import sw
from utils import *
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np

## Define the url and the BeatifulSoup parser

In [2]:
URL = "https://en.wikipedia.org/wiki/The_Strokes"

html = requests.get(URL).text
soup = bs(html, features="html.parser")

In [3]:
# Agrego div content
divContent = soup.find("div",{"id":"content"})

In [4]:
# Get all text from div with id="content"
text = divContent.get_text()
# separate all words separetly
wordsFromText = re.split(" |,|\n|\[|\]",text)
pattern = "\n|,|\. |"+" | ".join(sw)[0:-1]
corpusFromText = re.split(pattern,text)
wordsFromTextFiltered = list(filter(lambda word:
                                  isValidWord(word)
                                  ,wordsFromText))
corpusFromTextFiltered = list(filter(lambda word:
                                  isValidWord(word)
                                  ,corpusFromText))
corpusFromTextFilteredSplitted = []
for corpus in corpusFromTextFiltered:
    spplitedCorpus = corpus.split(" ")
    spplitedCorpus = list(filter(lambda word:
                                  isValidWord(word)
                                  ,spplitedCorpus))
    corpusFromTextFilteredSplitted.append(spplitedCorpus)
# wordsFromTextFiltered[0:10]

In [5]:
# Initialize vectorizer with one gram words
vectorizer = CountVectorizer(stop_words='english',lowercase=False)
X = vectorizer.fit_transform(wordsFromTextFiltered)
featuresNames = vectorizer.get_feature_names()


In [6]:
# Third approach

countsPerWordOnCorpus = []

def getCoexistanceOnCorpus(wordA,wordB):
    count = 0
    for corpus in corpusFromTextFilteredSplitted:
        count += int(wordA in corpus and wordB in corpus)
    return count

for word in featuresNames:
    wordCoex = [word]
    for word2 in featuresNames:
        wordCoex.append(getCoexistanceOnCorpus(word,word2))
    countsPerWordOnCorpus.append(wordCoex)

In [11]:
columns = ['word']
columns.extend(featuresNames)
print(columns)
df = pandas.DataFrame(countsPerWordOnCorpus,columns=columns)
df = df.set_index('word')
df.head()

['word', '00', '000', '00s', '04', '06', '10', '100', '100th', '1036704585', '107', '109', '11', '118', '11th', '12', '13', '14', '15', '15th', '16', '17', '18', '19', '1970s', '1980s', '1991', '1995', '1996', '1997', '1998', '1998RCA', '1999', '20', '2000', '2000s', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2010s', '2011', '2011Commons', '2012', '2012Articles', '2013', '2014', '2015', '2016', '2017', '2017Articles', '2018', '2019', '2020', '2020s', '2021', '21', '21st', '22', '223309', '23', '24', '25', '26', '26th', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '487', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '63rd', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '978', '978006223312

Unnamed: 0_level_0,00,000,00s,04,06,10,100,100th,1036704585,107,...,worldwide,worst,worth,writing,written,wrote,year,years,yielded,young
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
000,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00s,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
06,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
def getScore(row):
    sumRow = np.nansum(row)
    countPositives = (row != 0).sum()
    dividend = countPositives - 1 if countPositives > 1 else 1

    return dividend if dividend == 1 else sumRow/dividend

df['score'] = df.apply(lambda row: getScore(row),axis=1)
df = df.sort_values(by=['score'],ascending=False)
df.head(10)

Unnamed: 0_level_0,00,000,00s,04,06,10,100,100th,1036704585,107,...,worst,worth,writing,written,wrote,year,years,yielded,young,score
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Impressions,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11.499805
Abnormal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9.333324
Fighters,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7.992188
Foo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7.992188
Pitchfork,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7.992188
time,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7.992188
Age,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.749999
Earth,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.49989
Stone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.4
Angeles,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.994141


## This approach did not result well, the output was not tthe expected result
