In [2]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
import re

In [3]:
df = pd.read_csv('NewsCategorizer.csv')

df = df[['category', 'short_description']]

In [4]:
df

Unnamed: 0,category,short_description
0,WELLNESS,Resting is part of training. I've confirmed wh...
1,WELLNESS,Think of talking to yourself as a tool to coac...
2,WELLNESS,The clock is ticking for the United States to ...
3,WELLNESS,"If you want to be busy, keep trying to be perf..."
4,WELLNESS,"First, the bad news: Soda bread, corned beef a..."
...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...
49996,SPORTS,"Never change, young man. Never change."
49997,SPORTS,Wallace was hit with a first technical for a h...
49998,SPORTS,They believe CBD could be an alternative to po...


# Task 2

In [5]:
my_punctuation = '!"$%&#()*+,-./:;<=>?[\\]^_`{|}~•'
    
def preprocess(text_string):
    space_pattern = '\s+'
    new_line = '\n+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    non_word_char = '[^\w]'
    underscore = '_[\w]+'
    
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(new_line, ' ', parsed_text)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(non_word_char, ' ', parsed_text)
    parsed_text = re.sub(r"\bو(.*?)\b", r'\1', parsed_text)
    parsed_text = re.sub('([0-9]+)', '', parsed_text)
    parsed_text = re.sub(underscore, ' ', parsed_text)
    
    return parsed_text


In [6]:
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [7]:
from nltk.corpus import stopwords
stop_words = list(set(stopwords.words('english')))
    
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/imenekolli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/imenekolli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
df['tidy'] = np.vectorize(preprocess)(df['short_description'])
df['tidy'] = np.vectorize(deEmojify)(df['tidy'] )

df['tidy'] = df['tidy'].str.strip()
df['tidy'] = df['tidy'].apply(lambda x: str.lower(x))
df['tidy'] = df['tidy'].apply(lambda x: x.split())

df['tidy'] = df['tidy'].apply(lambda x: list(map(lambda y: lemma.lemmatize(y, pos ="n"), x)))
df['tidy'] = df['tidy'].apply(lambda x: list(map(lambda y: lemma.lemmatize(y, pos ="v"), x)))
df['tidy'] = df['tidy'].apply(lambda x: list(map(lambda y: lemma.lemmatize(y, pos ="a"), x)))

df['tidy'] = df['tidy'].apply(lambda x: list(filter(lambda y: not y in stop_words, x)))

df['tidy'] = df['tidy'].apply(lambda x: list(filter(lambda y: not len(y) < 3 , x)))

In [9]:
df.tidy[14583]

['draw',
 'barrymore',
 'announce',
 'joyous',
 'news',
 'week',
 'pregnant',
 'second',
 'child',
 'similar',
 'baby',
 'news',
 'gwen',
 'stefani']

In [10]:
df

Unnamed: 0,category,short_description,tidy
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[rest, part, train, confirm, sort, already, kn..."
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talk, tool, coach, challenge, narrate,..."
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, tick, unite, state, find, cure, team, ..."
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, try, perfect, want, happy, ..."
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corn, beef, be..."
...,...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...,"[many, fan, piss, see, minor, league, team, of..."
49996,SPORTS,"Never change, young man. Never change.","[never, change, young, man, never, change]"
49997,SPORTS,Wallace was hit with a first technical for a h...,"[wallace, hit, first, technical, hard, foul, l..."
49998,SPORTS,They believe CBD could be an alternative to po...,"[believe, cbd, could, alternative, potent, pai..."


# Task 3

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True
    )

In [13]:
df['tidy_untokenized'] = df['tidy'].apply(lambda x: ' '.join([w for w in x]))

In [14]:
tfidf = tfidf_vectorizer.fit_transform(df['tidy_untokenized'])
tfidf.shape

(50000, 31222)

In [15]:
bow = pd.DataFrame(tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
bow

Unnamed: 0,aaa,aafa,aam,aamer,aan,aanav,aap,aarhus,aaron,aarp,...,état,étienne,être,île,övertorneå,öztürk,über,łowicz,ﬁrst,ﬂavors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
feature_array = np.array(tfidf_vectorizer.get_feature_names_out())

In [17]:
len(list(filter(lambda x: x in stop_words, tfidf_vectorizer.get_feature_names_out())))

0

In [20]:
df

Unnamed: 0,category,short_description,tidy,tidy_untokenized
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[rest, part, train, confirm, sort, already, kn...",rest part train confirm sort already know buil...
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talk, tool, coach, challenge, narrate,...",think talk tool coach challenge narrate experi...
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, tick, unite, state, find, cure, team, ...",clock tick unite state find cure team work stu...
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, try, perfect, want, happy, ...",want busy keep try perfect want happy focus ma...
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corn, beef, be...",first bad news soda bread corn beef beer highl...
...,...,...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...,"[many, fan, piss, see, minor, league, team, of...",many fan piss see minor league team offensive ...
49996,SPORTS,"Never change, young man. Never change.","[never, change, young, man, never, change]",never change young man never change
49997,SPORTS,Wallace was hit with a first technical for a h...,"[wallace, hit, first, technical, hard, foul, l...",wallace hit first technical hard foul luis sco...
49998,SPORTS,They believe CBD could be an alternative to po...,"[believe, cbd, could, alternative, potent, pai...",believe cbd could alternative potent painkille...


In [21]:
classes = {}
for i in range(len(df['category'].unique())):
    classes[df['category'].unique()[i]] = i 

classes

{'WELLNESS': 0,
 'POLITICS': 1,
 'ENTERTAINMENT': 2,
 'TRAVEL': 3,
 'STYLE & BEAUTY': 4,
 'PARENTING': 5,
 'FOOD & DRINK': 6,
 'WORLD NEWS': 7,
 'BUSINESS': 8,
 'SPORTS': 9}

In [22]:
df['class'] = df['category'].map(classes)

In [23]:
from sklearn.feature_selection import SelectPercentile, chi2

new_tfidf = SelectPercentile(chi2, percentile=30).fit_transform(tfidf, df['class'])
new_tfidf.shape

(50000, 9367)

# SVD 

In [24]:
# Define the number of topics or components
num_components=10

# Create SVD object
lsa = TruncatedSVD(n_components=num_components, n_iter=100, random_state=42)

# Fit SVD model on data
lsa.fit_transform(new_tfidf)

# Get Singular values and Components 
Sigma = lsa.singular_values_ 
V_transpose = lsa.components_.T

In [25]:
# Print the topics with their terms
terms = feature_array

for index, component in enumerate(lsa.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:5]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['continent', 'expertise', 'elayna', 'bradshaw', 'church']
Topic 1:  ['armor', 'ethic', 'enculturate', 'bjergsø', 'durbin']
Topic 2:  ['expertise', 'contaminant', 'condescendingly', 'channing', 'bnbfinder']
Topic 3:  ['devine', 'couric', 'continent', 'existential', 'draft']
Topic 4:  ['bradshaw', 'devine', 'elayna', 'dazs', 'couric']
Topic 5:  ['continent', 'bradshaw', 'balmy', 'egocentric', 'encyclical']
Topic 6:  ['elayna', 'condescendingly', 'earth', 'bnbfinder', 'balmy']
Topic 7:  ['chinasmack', 'breathtaking', 'cheng', 'continent', 'devine']
Topic 8:  ['elayna', 'egon', 'expertise', 'centro', 'contaminant']
Topic 9:  ['elayna', 'church', 'devine', 'breathtaking', 'continent']


In [26]:
df['category'].unique()

array(['WELLNESS', 'POLITICS', 'ENTERTAINMENT', 'TRAVEL',
       'STYLE & BEAUTY', 'PARENTING', 'FOOD & DRINK', 'WORLD NEWS',
       'BUSINESS', 'SPORTS'], dtype=object)

# Using Genism

In [27]:
#import modules
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [28]:
def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary,doc_term_matrix


In [29]:
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel

In [30]:
df

Unnamed: 0,category,short_description,tidy,tidy_untokenized,class
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[rest, part, train, confirm, sort, already, kn...",rest part train confirm sort already know buil...,0
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talk, tool, coach, challenge, narrate,...",think talk tool coach challenge narrate experi...,0
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, tick, unite, state, find, cure, team, ...",clock tick unite state find cure team work stu...,0
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, try, perfect, want, happy, ...",want busy keep try perfect want happy focus ma...,0
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corn, beef, be...",first bad news soda bread corn beef beer highl...,0
...,...,...,...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...,"[many, fan, piss, see, minor, league, team, of...",many fan piss see minor league team offensive ...,9
49996,SPORTS,"Never change, young man. Never change.","[never, change, young, man, never, change]",never change young man never change,9
49997,SPORTS,Wallace was hit with a first technical for a h...,"[wallace, hit, first, technical, hard, foul, l...",wallace hit first technical hard foul luis sco...,9
49998,SPORTS,They believe CBD could be an alternative to po...,"[believe, cbd, could, alternative, potent, pai...",believe cbd could alternative potent painkille...,9


In [32]:
# LSA Model
number_of_topics=10
words=10
clean_text= df['tidy']
model=create_gensim_lsa_model(clean_text,number_of_topics,5)


[(0, '0.305*"one" + 0.277*"year" + 0.251*"time" + 0.236*"make" + 0.208*"get"'), (1, '0.799*"year" + -0.329*"one" + 0.222*"new" + -0.204*"make" + 0.181*"old"'), (2, '-0.850*"one" + 0.269*"make" + 0.199*"get" + -0.193*"year" + 0.179*"time"'), (3, '0.806*"make" + -0.404*"time" + -0.323*"get" + 0.197*"year" + -0.065*"day"'), (4, '0.692*"time" + -0.632*"get" + -0.209*"new" + 0.106*"make" + -0.085*"year"'), (5, '0.822*"new" + -0.307*"get" + -0.304*"year" + 0.171*"york" + -0.118*"day"'), (6, '-0.490*"get" + -0.364*"time" + 0.352*"like" + -0.335*"make" + 0.220*"say"'), (7, '-0.413*"want" + -0.362*"check" + -0.320*"sure" + -0.296*"twitter" + -0.291*"facebook"'), (8, '0.764*"day" + -0.366*"like" + 0.323*"take" + -0.195*"know" + -0.144*"time"'), (9, '-0.701*"child" + 0.398*"like" + -0.353*"parent" + -0.282*"life" + 0.185*"look"')]
