In [101]:
import nltk
import pandas as pd

In [102]:
from collections import Counter

In [103]:
from nltk.corpus import stopwords
from sklearn.metrics import f1_score

In [104]:
data = pd.read_csv(r"D:\Data\wikipedia-ml\wikipedia_machine_learning.csv",sep='\t')

The format of the data is that each row is a series, and each series has 3 elements: 0:title, 1: url, 2: body text 

In [105]:
articles = data.apply(lambda x: x[2], axis=1)

In [106]:
titles = data.apply(lambda x: x[0], axis=1)

In [107]:
urls = data.apply(lambda x: x[1], axis=1)

In [108]:
df = pd.concat([titles,urls,articles],axis=1)

In [109]:
df.columns = ['title', 'url','original_article']

In [110]:
df.head()

Unnamed: 0,title,url,original_article
0,Outline of computer vision,https://en.wikipedia.org/wiki/Outline_of_compu...,The following outline is provided as an overvi...
1,Outline of natural language processing,https://en.wikipedia.org/wiki/Outline_of_natur...,The following outline is provided as an overvi...
2,Outline of robotics,https://en.wikipedia.org/wiki/Outline_of_robotics,The following outline is provided as an overvi...
3,Accuracy paradox,https://en.wikipedia.org/wiki/Accuracy_paradox,The accuracy paradox is the paradoxical findin...
4,Action model learning,https://en.wikipedia.org/wiki/Action_model_lea...,Action model learning(sometimes abbreviated ac...


The next stage is to apply typical NLP preprocessing steps before looking at what words tend to be used often:
- lowercase
- remove stopwords

In [111]:
list(set([type(a) for a in df.original_article]))

[float, str]

In [112]:
float_cols = df[df['original_article'].apply(lambda x: isinstance(x, float))]

In [113]:
float_cols.head()

Unnamed: 0,title,url,original_article
100,Category:Robotics suites,https://en.wikipedia.org/wiki/Category:Robotic...,
5449,Category:Search algorithms,https://en.wikipedia.org/wiki/Category:Search_...,


In [114]:
df['original_article'].isna().sum()

2

There's not many nulls, I'm just going to drop them, I'd check the proportion of nulls, but since the number of instances is in the thousands, and this is 2, I'm just going to drop them

In [115]:
df = df.dropna(how='any')
df.shape

(7316, 3)

In [116]:
punctuation = "()'',.:-="

In [117]:
pun_trans = str.maketrans("", "", punctuation)

In [118]:
df['articles'] = df['original_article'].apply(lambda s: s.lower())

In [119]:
df.articles

0       the following outline is provided as an overvi...
1       the following outline is provided as an overvi...
2       the following outline is provided as an overvi...
3       the accuracy paradox is the paradoxical findin...
4       action model learning(sometimes abbreviated ac...
                              ...                        
7313    apollonian sphere packing is the three-dimensi...
7314    in mathematics, the hermite constant, named af...
7315    in geometry, a kissing number is defined as th...
7316    in geometry, a sphere packing is an arrangemen...
7317    random close packing(rcp) is an empirical para...
Name: articles, Length: 7316, dtype: object

In [120]:
df['articles'] = df.articles.copy().apply(lambda x: x.translate(pun_trans))

I need to create a dataframe with a column for each word, where the row is the count of that word in that article

In [121]:
df['articles'] = df.articles.copy().apply(lambda row: nltk.word_tokenize(row))

In [122]:
df.articles.head()

0    [the, following, outline, is, provided, as, an...
1    [the, following, outline, is, provided, as, an...
2    [the, following, outline, is, provided, as, an...
3    [the, accuracy, paradox, is, the, paradoxical,...
4    [action, model, learningsometimes, abbreviated...
Name: articles, dtype: object

In [123]:
stop_words = set(stopwords.words('english')) 

I was conservative with removing the punctuation, because some of it might be important, like ? especially, and ==

I need to remove punctuation as well as stopwords

In [124]:
df.articles = df.articles.copy().apply(lambda x: [w for w in x if w not in stop_words])

In [125]:
df.articles

0       [following, outline, provided, overview, topic...
1       [following, outline, provided, overview, topic...
2       [following, outline, provided, overview, topic...
3       [accuracy, paradox, paradoxical, finding, accu...
4       [action, model, learningsometimes, abbreviated...
                              ...                        
7313    [apollonian, sphere, packing, threedimensional...
7314    [mathematics, hermite, constant, named, charle...
7315    [geometry, kissing, number, defined, number, n...
7316    [geometry, sphere, packing, arrangement, nonov...
7317    [random, close, packingrcp, empirical, paramet...
Name: articles, Length: 7316, dtype: object

In [126]:
c = Counter(df.articles[0])

In [127]:
c.most_common(10)

[('computer', 36),
 ('vision', 26),
 ('image', 18),
 ('color', 8),
 ('visual', 6),
 ('digital', 4),
 ('feature', 4),
 ('optical', 4),
 ('conference', 4),
 ('outline', 3)]

Something like this https://www.aclweb.org/anthology/W15-1526.pdf would be good to try - supposedly outperforms LDA, but

In [128]:
def get_most_common(word_list):
    n = 10
    c = Counter(word_list)
    most_common = c.most_common(n)
    return [x[0] for x in most_common]

In [129]:
df['text_topics'] = df.articles.copy().apply(get_most_common)

In [130]:
df.articles

0       [following, outline, provided, overview, topic...
1       [following, outline, provided, overview, topic...
2       [following, outline, provided, overview, topic...
3       [accuracy, paradox, paradoxical, finding, accu...
4       [action, model, learningsometimes, abbreviated...
                              ...                        
7313    [apollonian, sphere, packing, threedimensional...
7314    [mathematics, hermite, constant, named, charle...
7315    [geometry, kissing, number, defined, number, n...
7316    [geometry, sphere, packing, arrangement, nonov...
7317    [random, close, packingrcp, empirical, paramet...
Name: articles, Length: 7316, dtype: object

# Get some topics per article

In [131]:
df['text_topics2'] = df.title.copy().apply(lambda x: ''.join([c for c in x if c not in punctuation]))

In [132]:
df['text_topics2'] = df.text_topics2.copy().apply(lambda x: x.lower())

In [133]:
df.text_topics2 = df.text_topics2.copy().apply(lambda x: nltk.word_tokenize(x))

In [134]:
df.text_topics2 = df.text_topics2.copy().apply(lambda x: [w for w in x if w not in stop_words])

In [135]:
df.columns

Index(['title', 'url', 'original_article', 'articles', 'text_topics',
       'text_topics2'],
      dtype='object')

## There are two Ozymandias solutions to getting topics - 

just get non-stopword words from the title 'text_topics2', or get non-stopword words from the article body 'text_topics'