In [1]:
import pandas as pd
import re
import numpy as np

# Task 1

In [2]:
df = pd.read_csv('NewsCategorizer.csv')

df = df[['category', 'short_description']]

In [3]:
df

Unnamed: 0,category,short_description
0,WELLNESS,Resting is part of training. I've confirmed wh...
1,WELLNESS,Think of talking to yourself as a tool to coac...
2,WELLNESS,The clock is ticking for the United States to ...
3,WELLNESS,"If you want to be busy, keep trying to be perf..."
4,WELLNESS,"First, the bad news: Soda bread, corned beef a..."
...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...
49996,SPORTS,"Never change, young man. Never change."
49997,SPORTS,Wallace was hit with a first technical for a h...
49998,SPORTS,They believe CBD could be an alternative to po...


# Task 2

In [4]:
my_punctuation = '!"$%&#()*+,-./:;<=>?[\\]^_`{|}~•'
    
def preprocess(text_string):
    space_pattern = '\s+'
    new_line = '\n+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    non_word_char = '[^\w]'
    underscore = '_[\w]+'
    
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(new_line, ' ', parsed_text)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(non_word_char, ' ', parsed_text)
    parsed_text = re.sub(r"\bو(.*?)\b", r'\1', parsed_text)
    parsed_text = re.sub('([0-9]+)', '', parsed_text)
    parsed_text = re.sub(underscore, ' ', parsed_text)
    
    return parsed_text


In [5]:
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [6]:
from nltk.corpus import stopwords
stop_words = list(set(stopwords.words('english')))
    
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/imenekolli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/imenekolli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
df['tidy'] = np.vectorize(preprocess)(df['short_description'])
df['tidy'] = np.vectorize(deEmojify)(df['tidy'] )

df['tidy'] = df['tidy'].str.strip()
df['tidy'] = df['tidy'].apply(lambda x: str.lower(x))
df['tidy'] = df['tidy'].apply(lambda x: x.split())

df['tidy'] = df['tidy'].apply(lambda x: list(map(lambda y: lemma.lemmatize(y, pos ="n"), x)))
df['tidy'] = df['tidy'].apply(lambda x: list(map(lambda y: lemma.lemmatize(y, pos ="v"), x)))

df['tidy'] = df['tidy'].apply(lambda x: list(filter(lambda y: not y in stop_words, x)))

df['tidy'] = df['tidy'].apply(lambda x: list(filter(lambda y: not len(y) < 3 , x)))

In [8]:
df.tidy[14583]

['draw',
 'barrymore',
 'announce',
 'joyous',
 'news',
 'week',
 'pregnant',
 'second',
 'child',
 'similar',
 'baby',
 'news',
 'gwen',
 'stefani']

In [9]:
df

Unnamed: 0,category,short_description,tidy
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[rest, part, train, confirm, sort, already, kn..."
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talk, tool, coach, challenge, narrate,..."
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, tick, unite, state, find, cure, team, ..."
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, try, perfect, want, happy, ..."
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corn, beef, be..."
...,...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...,"[many, fan, piss, see, minor, league, team, of..."
49996,SPORTS,"Never change, young man. Never change.","[never, change, young, man, never, change]"
49997,SPORTS,Wallace was hit with a first technical for a h...,"[wallace, hit, first, technical, hard, foul, l..."
49998,SPORTS,They believe CBD could be an alternative to po...,"[believe, cbd, could, alternative, potent, pai..."


# Task 3

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True
    )

In [12]:
df['tidy_untokenized'] = df['tidy'].apply(lambda x: ' '.join([w for w in x]))

In [13]:
tfidf = tfidf_vectorizer.fit_transform(df['tidy_untokenized'])
tfidf.shape

(50000, 31222)

In [14]:
bow = pd.DataFrame(tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
bow

Unnamed: 0,aaa,aafa,aam,aamer,aan,aanav,aap,aarhus,aaron,aarp,...,état,étienne,être,île,övertorneå,öztürk,über,łowicz,ﬁrst,ﬂavors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
feature_array = np.array(tfidf_vectorizer.get_feature_names_out())

In [16]:
len(list(filter(lambda x: x in stop_words, tfidf_vectorizer.get_feature_names_out())))

0

# Task 4

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans

In [18]:
df['category'].unique()[0]

'WELLNESS'

In [19]:
classes = {}
for i in range(len(df['category'].unique())):
    classes[df['category'].unique()[i]] = i 

classes

{'WELLNESS': 0,
 'POLITICS': 1,
 'ENTERTAINMENT': 2,
 'TRAVEL': 3,
 'STYLE & BEAUTY': 4,
 'PARENTING': 5,
 'FOOD & DRINK': 6,
 'WORLD NEWS': 7,
 'BUSINESS': 8,
 'SPORTS': 9}

In [20]:
df['class'] = df['category'].map(classes)

In [21]:
df

Unnamed: 0,category,short_description,tidy,tidy_untokenized,class
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[rest, part, train, confirm, sort, already, kn...",rest part train confirm sort already know buil...,0
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talk, tool, coach, challenge, narrate,...",think talk tool coach challenge narrate experi...,0
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, tick, unite, state, find, cure, team, ...",clock tick unite state find cure team work stu...,0
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, try, perfect, want, happy, ...",want busy keep try perfect want happy focus ma...,0
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corn, beef, be...",first bad news soda bread corn beef beer highl...,0
...,...,...,...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...,"[many, fan, piss, see, minor, league, team, of...",many fan piss see minor league team offensive ...,9
49996,SPORTS,"Never change, young man. Never change.","[never, change, young, man, never, change]",never change young man never change,9
49997,SPORTS,Wallace was hit with a first technical for a h...,"[wallace, hit, first, technical, hard, foul, l...",wallace hit first technical hard foul luis sco...,9
49998,SPORTS,They believe CBD could be an alternative to po...,"[believe, cbd, could, alternative, potent, pai...",believe cbd could alternative potent painkille...,9


In [22]:
from sklearn.feature_selection import SelectPercentile, chi2

new_tfidf = SelectPercentile(chi2, percentile=30).fit_transform(tfidf, df['class'])
new_tfidf.shape


(50000, 9367)

In [30]:
parameters = {'n_clusters': [10], 'n_init': [10,15, 50], 'max_iter': [50, 100]}
model = KMeans()


kmeans = GridSearchCV(model, parameters, scoring= 'adjusted_rand_score' , refit=True)

kmeans.fit(new_tfidf, df['class'])

In [31]:
kmeans.best_params_

{'max_iter': 50, 'n_clusters': 10, 'n_init': 10}

In [32]:
kmeans.best_score_

0.011621429473942492

In [36]:
model = KMeans(n_clusters= 10, n_init= 10, max_iter= 50).fit(new_tfidf)

In [37]:
model.labels_

array([4, 1, 4, ..., 4, 4, 4], dtype=int32)

In [39]:
df['label'] = model.labels_

df

Unnamed: 0,category,short_description,tidy,tidy_untokenized,class,label
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[rest, part, train, confirm, sort, already, kn...",rest part train confirm sort already know buil...,0,4
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talk, tool, coach, challenge, narrate,...",think talk tool coach challenge narrate experi...,0,1
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, tick, unite, state, find, cure, team, ...",clock tick unite state find cure team work stu...,0,4
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, try, perfect, want, happy, ...",want busy keep try perfect want happy focus ma...,0,1
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corn, beef, be...",first bad news soda bread corn beef beer highl...,0,7
...,...,...,...,...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...,"[many, fan, piss, see, minor, league, team, of...",many fan piss see minor league team offensive ...,9,4
49996,SPORTS,"Never change, young man. Never change.","[never, change, young, man, never, change]",never change young man never change,9,1
49997,SPORTS,Wallace was hit with a first technical for a h...,"[wallace, hit, first, technical, hard, foul, l...",wallace hit first technical hard foul luis sco...,9,4
49998,SPORTS,They believe CBD could be an alternative to po...,"[believe, cbd, could, alternative, potent, pai...",believe cbd could alternative potent painkille...,9,4


In [40]:
df['label'].unique()

array([4, 1, 7, 0, 6, 2, 9, 5, 3, 8], dtype=int32)

In [41]:
def top_cluster_words(n, k):
    return feature_array[np.argsort(bow.loc[df[df['label'] == k].index].to_numpy()).flatten()[::-1]][:n]

In [42]:
top_cluster_words(5, 1)

array(['never', 'change', 'man', 'young', 'frank'], dtype=object)

In [43]:
dictionary = {}
for i in df['label'].unique():
    dictionary[i] = top_cluster_words(5, i).tolist()

dictionary

{4: ['gymnast', 'league', 'ﬂavors', 'francesca', 'fraiche'],
 1: ['never', 'change', 'man', 'young', 'frank'],
 7: ['gotta', 'sort', 'guy', 'bad', 'feel'],
 0: ['breast', 'nfl', 'cancer', 'matter', 'lose'],
 6: ['trade', 'rashid', 'hopeful', 'prospect', 'williams'],
 2: ['popovich', 'blond', 'beg', 'monday', 'fun'],
 9: ['rice', 'league', 'return', 'ready', 'ﬂavors'],
 5: ['annoy', 'busy', 'score', 'non', 'fan'],
 3: ['shrek', 'transformer', 'twilight', 'avenger', 'knight'],
 8: ['awesome', 'via', 'taste', 'tumblr', 'pinterest']}

In [44]:
len(dictionary)

10

In [45]:
df["top_words"] = df["label"].map(dictionary)

In [46]:
df

Unnamed: 0,category,short_description,tidy,tidy_untokenized,class,label,top_words
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[rest, part, train, confirm, sort, already, kn...",rest part train confirm sort already know buil...,0,4,"[gymnast, league, ﬂavors, francesca, fraiche]"
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talk, tool, coach, challenge, narrate,...",think talk tool coach challenge narrate experi...,0,1,"[never, change, man, young, frank]"
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, tick, unite, state, find, cure, team, ...",clock tick unite state find cure team work stu...,0,4,"[gymnast, league, ﬂavors, francesca, fraiche]"
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, try, perfect, want, happy, ...",want busy keep try perfect want happy focus ma...,0,1,"[never, change, man, young, frank]"
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corn, beef, be...",first bad news soda bread corn beef beer highl...,0,7,"[gotta, sort, guy, bad, feel]"
...,...,...,...,...,...,...,...
49995,SPORTS,Many fans were pissed after seeing the minor l...,"[many, fan, piss, see, minor, league, team, of...",many fan piss see minor league team offensive ...,9,4,"[gymnast, league, ﬂavors, francesca, fraiche]"
49996,SPORTS,"Never change, young man. Never change.","[never, change, young, man, never, change]",never change young man never change,9,1,"[never, change, man, young, frank]"
49997,SPORTS,Wallace was hit with a first technical for a h...,"[wallace, hit, first, technical, hard, foul, l...",wallace hit first technical hard foul luis sco...,9,4,"[gymnast, league, ﬂavors, francesca, fraiche]"
49998,SPORTS,They believe CBD could be an alternative to po...,"[believe, cbd, could, alternative, potent, pai...",believe cbd could alternative potent painkille...,9,4,"[gymnast, league, ﬂavors, francesca, fraiche]"


In [47]:
df['category'].unique()

array(['WELLNESS', 'POLITICS', 'ENTERTAINMENT', 'TRAVEL',
       'STYLE & BEAUTY', 'PARENTING', 'FOOD & DRINK', 'WORLD NEWS',
       'BUSINESS', 'SPORTS'], dtype=object)

In [48]:
dictionary

{4: ['gymnast', 'league', 'ﬂavors', 'francesca', 'fraiche'],
 1: ['never', 'change', 'man', 'young', 'frank'],
 7: ['gotta', 'sort', 'guy', 'bad', 'feel'],
 0: ['breast', 'nfl', 'cancer', 'matter', 'lose'],
 6: ['trade', 'rashid', 'hopeful', 'prospect', 'williams'],
 2: ['popovich', 'blond', 'beg', 'monday', 'fun'],
 9: ['rice', 'league', 'return', 'ready', 'ﬂavors'],
 5: ['annoy', 'busy', 'score', 'non', 'fan'],
 3: ['shrek', 'transformer', 'twilight', 'avenger', 'knight'],
 8: ['awesome', 'via', 'taste', 'tumblr', 'pinterest']}

In [50]:
pd.set_option('display.max_rows', 500)

In [52]:
df.groupby(['class', 'label']).agg( classes = ('category', 'max'), words = ('top_words', 'max'), descriptions = ('label', 'count'))

Unnamed: 0_level_0,Unnamed: 1_level_0,classes,words,descriptions
class,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,WELLNESS,"[breast, nfl, cancer, matter, lose]",249
0,1,WELLNESS,"[never, change, man, young, frank]",1790
0,2,WELLNESS,"[popovich, blond, beg, monday, fun]",104
0,4,WELLNESS,"[gymnast, league, ﬂavors, francesca, fraiche]",2639
0,5,WELLNESS,"[annoy, busy, score, non, fan]",87
0,6,WELLNESS,"[trade, rashid, hopeful, prospect, williams]",20
0,7,WELLNESS,"[gotta, sort, guy, bad, feel]",92
0,9,WELLNESS,"[rice, league, return, ready, ﬂavors]",19
1,0,POLITICS,"[breast, nfl, cancer, matter, lose]",29
1,1,POLITICS,"[never, change, man, young, frank]",564
