In [58]:
import nltk
import pandas as pd

In [49]:
from collections import Counter

In [91]:
from nltk.corpus import stopwords
from sklearn.metrics import f1_score

In [4]:
data = pd.read_csv(r"D:\Data\wikipedia-ml\wikipedia_machine_learning.csv",sep='\t')

The format of the data is that each row is a series, and each series has 3 elements: 0:title, 1: url, 2: body text 

In [5]:
articles = data.apply(lambda x: x[2], axis=1)

In [6]:
titles = data.apply(lambda x: x[0], axis=1)

In [7]:
urls = data.apply(lambda x: x[1], axis=1)

In [8]:
df = pd.concat([titles,urls,articles],axis=1)

In [9]:
df.columns = ['title', 'url','original_article']

In [10]:
df.head()

Unnamed: 0,title,url,original_article
0,Outline of computer vision,https://en.wikipedia.org/wiki/Outline_of_compu...,The following outline is provided as an overvi...
1,Outline of natural language processing,https://en.wikipedia.org/wiki/Outline_of_natur...,The following outline is provided as an overvi...
2,Outline of robotics,https://en.wikipedia.org/wiki/Outline_of_robotics,The following outline is provided as an overvi...
3,Accuracy paradox,https://en.wikipedia.org/wiki/Accuracy_paradox,The accuracy paradox is the paradoxical findin...
4,Action model learning,https://en.wikipedia.org/wiki/Action_model_lea...,Action model learning(sometimes abbreviated ac...


The next stage is to apply typical NLP preprocessing steps before looking at what words tend to be used often:
- lowercase
- remove stopwords

In [11]:
list(set([type(a) for a in df.original_article]))

[float, str]

In [12]:
float_cols = df[df['original_article'].apply(lambda x: isinstance(x, float))]

In [13]:
float_cols.head()

Unnamed: 0,title,url,original_article
100,Category:Robotics suites,https://en.wikipedia.org/wiki/Category:Robotic...,
5449,Category:Search algorithms,https://en.wikipedia.org/wiki/Category:Search_...,


In [14]:
df['original_article'].isna().sum()

2

There's not many nulls, I'm just going to drop them, I'd check the proportion of nulls, but since the number of instances is in the thousands, and this is 2, I'm just going to drop them

In [15]:
df = df.dropna(how='any')
df.shape

(7316, 3)

In [16]:
articles = df['original_article'].apply(lambda s: s.lower())

I need to create a dataframe with a column for each word, where the row is the count of that word in that article

In [17]:
articles = articles.apply(lambda row: nltk.word_tokenize(row))

In [18]:
articles.head()

0    [the, following, outline, is, provided, as, an...
1    [the, following, outline, is, provided, as, an...
2    [the, following, outline, is, provided, as, an...
3    [the, accuracy, paradox, is, the, paradoxical,...
4    [action, model, learning, (, sometimes, abbrev...
Name: original_article, dtype: object

In [19]:
stop_words = set(stopwords.words('english')) 

I was conservative with removing the punctuation, because some of it might be important, like ? especially, and ==

In [20]:
punctuation = "()'',.:"

I need to remove punctuation as well as stopwords

In [21]:
def remove_unwanted(tokens):
    tokens = [w for w in t if w not in stop_words and w not in punctuation]
    return tokens

In [22]:
punctuation

"()'',.:"

In [23]:
a = articles.loc[0]

In [24]:
articles = articles.apply(lambda x: [w for w in x if w not in stop_words and w not in punctuation])

In [25]:
print(articles.shape)
print(df.shape)

(7316,)
(7316, 3)


In [26]:
c = Counter(a)

In [27]:
#df.drop('article',axis=1)
df['article']=articles

next: sort columns by count frequency

next make a dataframe with the most frequent words

Something like this https://www.aclweb.org/anthology/W15-1526.pdf would be good to try - supposedly outperforms LDA, but

In [58]:
df['length'] = articles.apply(lambda x: len(x))

In [59]:
df['length'].describe()

count     7316.000000
mean      1324.216785
std       1505.295536
min          4.000000
25%        346.000000
50%        808.000000
75%       1711.000000
max      17054.000000
Name: length, dtype: float64

With this, discarding articles with less than a threshold of non-stopwords will help train an algorithm

In [29]:
import topicgetter

In [30]:
model, terms, topic_dict = topicgetter.make_topics(articles, 20)

shape:  (7316, 1000)


In [None]:
type(terms)

In [None]:
len(terms)

In [None]:
len(model.components_)

In [None]:
topic_dict = {}

In [None]:
for i, comp in enumerate(model.components_):
        terms_comp = zip(terms, comp)
        sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
        #print("Topic "+str(i)+": ")
        specific_terms = []
        for t in sorted_terms:
            #print(t[0])
            #print(" ")
            specific_terms.append(t[0])
        
        topic_dict["Topic"+str(i)] = specific_terms

In [None]:
for i in range(5):
    topic_dict["Topic"+str(i)] = 0

In [None]:
topic_dict

# Get some topics per article

In [73]:
df['topics'] = df.title.copy().apply(lambda x: ''.join([c for c in x if c not in punctuation]))

In [74]:
df['topics'] = df.title.copy().apply(lambda x: x.lower())

In [75]:
df.topics = df.topics.copy().apply(lambda x: nltk.word_tokenize(x))

In [76]:
df.topics = df.topics.copy().apply(lambda x: [w for w in x if w not in stop_words])

In [78]:
df.topics =  df.topics.copy().apply(lambda x: [w.lower() for w in x])

this is going to be really simple: just see which of the terms extracted are in an article, divide by number of words in article

In [79]:
terms

['000',
 '10',
 '100',
 '11',
 '12',
 '15',
 '16',
 '20',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '3d',
 '978',
 'a_',
 'ability',
 'able',
 'academic',
 'access',
 'according',
 'account',
 'accuracy',
 'accurate',
 'act',
 'action',
 'actions',
 'active',
 'activities',
 'activity',
 'actual',
 'actually',
 'added',
 'addition',
 'additional',
 'address',
 'advanced',
 'age',
 'agent',
 'ai',
 'al',
 'algebra',
 'algorithm',
 'algorithms',
 'aligned',
 'allow',
 'allowed',
 'allowing',
 'allows',
 'alpha',
 'alternative',
 'american',
 'analysis',
 'appear',
 'application',
 'applications',
 'applied',
 'apply',
 'approach',
 'approaches',
 'appropriate',
 'approximation',
 'architecture',
 'area',
 'areas',
 'argued',
 'argument',
 'array',
 'art',
 'article',
 'artificial',
 'aspects',
 'associated',
 'association',
 'assumption',
 'attention',
 'audio',
 'auth

In [50]:
def get_terms_in_article(article):
    count = 0
    about = []
    for word in article:
        if word in terms:
            count+=1
            about.append(word)
    return about

In [81]:
def count_terms_in_article(article):
    count = 0
    about = []
    for word in article:
        if word in terms:
            count+=1
            about.append(word)
    return count

In [40]:
get_terms_in_article(articles[0])/len(articles[0])

0.5140997830802603

In [90]:
get_terms_in_article(articles[0])

['following',
 'provided',
 'computer',
 'vision',
 'computer',
 'vision',
 'field',
 'computers',
 'understanding',
 'digital',
 'images',
 'engineering',
 'tasks',
 'human',
 'visual',
 'computer',
 'vision',
 'tasks',
 'include',
 'methods',
 'digital',
 'images',
 'image',
 'image',
 'processing',
 'image',
 'analysis',
 'understanding',
 'digital',
 'images',
 'general',
 'real',
 'world',
 'order',
 'produce',
 'numerical',
 'information',
 'computer',
 'image',
 'forms',
 'video',
 'sequences',
 'multiple',
 'medical',
 'computer',
 'vision',
 'apply',
 'theories',
 'models',
 'construction',
 'computer',
 'vision',
 'systems',
 'scientific',
 'computer',
 'vision',
 'theory',
 'artificial',
 'systems',
 'information',
 'images',
 'computer',
 'computer',
 'computer',
 'history',
 'computer',
 'history',
 'computer',
 'computer',
 'vision',
 'image',
 'image',
 'image',
 'gamma',
 'fourier',
 'image',
 'image',
 'filter',
 'color',
 'visual',
 'human',
 'visual',
 'color',
 'mat

In [45]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

In [46]:
from gensim.test.utils import common_texts

In [51]:
a_terms = count_terms_in_article(articles[0])

In [52]:
c = Counter(a_terms)

In [55]:
sorted_terms = sorted(c.items(), key=lambda item: (-item[1], item[0]))

In [57]:
len(articles[0])

461

In [64]:
c.most_common(10)

[('computer', 36),
 ('vision', 22),
 ('image', 18),
 ('color', 8),
 ('visual', 6),
 ('digital', 4),
 ('images', 4),
 ('feature', 4),
 ('conference', 4),
 ('video', 3)]

from here, I'll see how many of the terms in the most common 10  are in the article titles

In [70]:
test_terms = [t[0] for t in list(c.most_common(10))]

In [72]:
test_terms

['computer',
 'vision',
 'image',
 'color',
 'visual',
 'digital',
 'images',
 'feature',
 'conference',
 'video']

In [80]:
df.topics[0]

['outline', 'computer', 'vision']

In [93]:
def get_score(row):
    article = row['article']
    terms = get_terms_in_article(article)
    c = Counter(a_terms)
    test_terms = c.most_common(10)
    
    return len(c)
    
    

0    111
1    111
2    111
3    111
4    111
dtype: int64