In [61]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
import re

In [62]:
df = pd.read_csv('NewsCategorizer.csv')

df = df[['category', 'short_description']]

In [63]:
df

Unnamed: 0,category,short_description
0,WELLNESS,Resting is part of training. I've confirmed wh...
1,WELLNESS,Think of talking to yourself as a tool to coac...
2,WELLNESS,The clock is ticking for the United States to ...
3,WELLNESS,"If you want to be busy, keep trying to be perf..."
4,WELLNESS,"First, the bad news: Soda bread, corned beef a..."
...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...
49996,SPORTS,"Never change, young man. Never change."
49997,SPORTS,Wallace was hit with a first technical for a h...
49998,SPORTS,They believe CBD could be an alternative to po...


# Task 2

In [64]:
my_punctuation = '!"$%&#()*+,-./:;<=>?[\\]^_`{|}~•'
    
def preprocess(text_string):
    space_pattern = '\s+'
    new_line = '\n+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    non_word_char = '[^\w]'
    underscore = '_[\w]+'
    
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(new_line, ' ', parsed_text)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(non_word_char, ' ', parsed_text)
    parsed_text = re.sub(r"\bو(.*?)\b", r'\1', parsed_text)
    parsed_text = re.sub('([0-9]+)', '', parsed_text)
    parsed_text = re.sub(underscore, ' ', parsed_text)
    
    return parsed_text


In [65]:
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [66]:
from nltk.corpus import stopwords
stop_words = list(set(stopwords.words('english')))
    
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/imenekolli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/imenekolli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [67]:
df['tidy'] = np.vectorize(preprocess)(df['short_description'])
df['tidy'] = np.vectorize(deEmojify)(df['tidy'] )

df['tidy'] = df['tidy'].str.strip()
df['tidy'] = df['tidy'].apply(lambda x: str.lower(x))
df['tidy'] = df['tidy'].apply(lambda x: x.split())

df['tidy'] = df['tidy'].apply(lambda x: list(map(lambda y: lemma.lemmatize(y, pos ="n"), x)))

df['tidy'] = df['tidy'].apply(lambda x: list(filter(lambda y: not y in stop_words, x)))

df['tidy'] = df['tidy'].apply(lambda x: list(filter(lambda y: not len(y) < 3 , x)))

In [68]:
df.tidy[14583]

['drew',
 'barrymore',
 'announced',
 'joyous',
 'news',
 'week',
 'pregnant',
 'second',
 'child',
 'similar',
 'baby',
 'news',
 'gwen',
 'stefani']

In [69]:
df

Unnamed: 0,category,short_description,tidy
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[resting, part, training, confirmed, sort, alr..."
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talking, tool, coach, challenge, narra..."
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, ticking, united, state, find, cure, te..."
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, trying, perfect, want, happ..."
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corned, beef, ..."
...,...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...,"[many, fan, pissed, seeing, minor, league, tea..."
49996,SPORTS,"Never change, young man. Never change.","[never, change, young, man, never, change]"
49997,SPORTS,Wallace was hit with a first technical for a h...,"[wallace, hit, first, technical, hard, foul, l..."
49998,SPORTS,They believe CBD could be an alternative to po...,"[believe, cbd, could, alternative, potent, pai..."


# Task 3

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True
    )

In [72]:
df['tidy_untokenized'] = df['tidy'].apply(lambda x: ' '.join([w for w in x]))

In [73]:
tfidf = tfidf_vectorizer.fit_transform(df['tidy_untokenized'])
tfidf.shape

(50000, 36869)

In [74]:
bow = pd.DataFrame(tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
bow

Unnamed: 0,aaa,aafa,aam,aamer,aan,aanav,aap,aarhus,aaron,aarp,...,état,étienne,être,île,övertorneå,öztürk,über,łowicz,ﬁrst,ﬂavors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
feature_array = np.array(tfidf_vectorizer.get_feature_names_out())

In [76]:
len(list(filter(lambda x: x in stop_words, tfidf_vectorizer.get_feature_names_out())))

0

In [77]:
df

Unnamed: 0,category,short_description,tidy,tidy_untokenized
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[resting, part, training, confirmed, sort, alr...",resting part training confirmed sort already k...
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talking, tool, coach, challenge, narra...",think talking tool coach challenge narrate exp...
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, ticking, united, state, find, cure, te...",clock ticking united state find cure team work...
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, trying, perfect, want, happ...",want busy keep trying perfect want happy focus...
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corned, beef, ...",first bad news soda bread corned beef beer hig...
...,...,...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...,"[many, fan, pissed, seeing, minor, league, tea...",many fan pissed seeing minor league team offen...
49996,SPORTS,"Never change, young man. Never change.","[never, change, young, man, never, change]",never change young man never change
49997,SPORTS,Wallace was hit with a first technical for a h...,"[wallace, hit, first, technical, hard, foul, l...",wallace hit first technical hard foul luis sco...
49998,SPORTS,They believe CBD could be an alternative to po...,"[believe, cbd, could, alternative, potent, pai...",believe cbd could alternative potent painkille...


In [78]:
classes = {}
for i in range(len(df['category'].unique())):
    classes[df['category'].unique()[i]] = i 

classes

{'WELLNESS': 0,
 'POLITICS': 1,
 'ENTERTAINMENT': 2,
 'TRAVEL': 3,
 'STYLE & BEAUTY': 4,
 'PARENTING': 5,
 'FOOD & DRINK': 6,
 'WORLD NEWS': 7,
 'BUSINESS': 8,
 'SPORTS': 9}

In [79]:
df['class'] = df['category'].map(classes)

# LDA

In [88]:
from gensim.models.ldamodel import LdaModel
from gensim import corpora

In [89]:
df

Unnamed: 0,category,short_description,tidy,tidy_untokenized,class
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[resting, part, training, confirmed, sort, alr...",resting part training confirmed sort already k...,0
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talking, tool, coach, challenge, narra...",think talking tool coach challenge narrate exp...,0
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, ticking, united, state, find, cure, te...",clock ticking united state find cure team work...,0
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, trying, perfect, want, happ...",want busy keep trying perfect want happy focus...,0
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corned, beef, ...",first bad news soda bread corned beef beer hig...,0
...,...,...,...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...,"[many, fan, pissed, seeing, minor, league, tea...",many fan pissed seeing minor league team offen...,9
49996,SPORTS,"Never change, young man. Never change.","[never, change, young, man, never, change]",never change young man never change,9
49997,SPORTS,Wallace was hit with a first technical for a h...,"[wallace, hit, first, technical, hard, foul, l...",wallace hit first technical hard foul luis sco...,9
49998,SPORTS,They believe CBD could be an alternative to po...,"[believe, cbd, could, alternative, potent, pai...",believe cbd could alternative potent painkille...,9


In [92]:

texts = df['tidy']

dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]

In [96]:
lda_model = LdaModel(corpus = corpus,
                      num_topics = 10,
                      id2word = dictionary,
                      passes=15)

lda_model.show_topics()

[(0,
  '0.029*"fan" + 0.017*"record" + 0.015*"pay" + 0.015*"girl" + 0.015*"baby" + 0.014*"series" + 0.014*"photo" + 0.014*"medium" + 0.013*"daughter" + 0.012*"video"'),
 (1,
  '0.014*"economy" + 0.013*"coach" + 0.012*"nba" + 0.012*"read" + 0.010*"perfect" + 0.009*"shot" + 0.008*"team" + 0.008*"wine" + 0.007*"corporate" + 0.007*"medal"'),
 (2,
  '0.016*"state" + 0.011*"people" + 0.010*"said" + 0.008*"year" + 0.008*"american" + 0.008*"president" + 0.007*"leader" + 0.007*"company" + 0.007*"two" + 0.006*"say"'),
 (3,
  '0.022*"game" + 0.017*"football" + 0.014*"business" + 0.014*"league" + 0.012*"team" + 0.011*"athlete" + 0.010*"employee" + 0.009*"twitter" + 0.009*"olympic" + 0.007*"reuters"'),
 (4,
  '0.018*"country" + 0.017*"world" + 0.016*"nfl" + 0.015*"player" + 0.012*"win" + 0.011*"second" + 0.008*"america" + 0.008*"history" + 0.008*"major" + 0.008*"first"'),
 (5,
  '0.042*"year" + 0.021*"new" + 0.019*"old" + 0.012*"first" + 0.012*"former" + 0.010*"week" + 0.010*"game" + 0.008*"want" +

In [99]:
## Visualize the lda_model:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = Loose