# News Category Finder using NLP


Please check the test, train and sample submission files
#### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import nltk

In [4]:
train = pd.read_excel('Data_Train.xlsx', sheet_name='Sheet1')
test = pd.read_excel('Data_Test.xlsx', sheet_name='Sheet1')

In [5]:
train.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3


In [6]:
test.head()

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...


In [7]:
train.describe()

Unnamed: 0,SECTION
count,7628.0
mean,1.357892
std,0.999341
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,3.0


In [8]:
train.groupby('SECTION').describe()

Unnamed: 0_level_0,STORY,STORY,STORY,STORY
Unnamed: 0_level_1,count,unique,top,freq
SECTION,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1686,1673,This story has been published from a wire agen...,4
1,2772,2731,This story has been published from a wire agen...,13
2,1924,1914,"The consensus reads, “Exciting, entertaining, ...",3
3,1246,1233,This story has been published from a wire agen...,11


#### Tokenize

In [5]:
from nltk.tokenize import word_tokenize

In [12]:
from nltk.corpus import stopwords

In [16]:
from nltk.stem.wordnet import WordNetLemmatizer 
lem = WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer 
stem = PorterStemmer()

word = "running" 
print(lem.lemmatize(word))

print(stem.stem(word))


running
run


In [17]:
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer 
lem = WordNetLemmatizer()
stem = PorterStemmer()

### Processing Tokens

In [6]:
import string

def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    token = word_tokenize(nopunc)
    
    # Now just remove any stopwords
    no_noise = [word for word in token if word.lower() not in (stopwords.words('english') and ['"',"'",'`','”','“'])]
    # Stemming
    
    
    return no_noise
#[stem.stem(word.lower()) for word in no_noise]
    

In [12]:
train['STORY'].apply(text_process)

0       [painful, huge, reversal, fee, income, unheard...
1       [formidable, opposition, alliance, among, Cong...
2       [Asian, currencies, trading, lower, today, Sou...
3       [want, answer, question, click, ‘, Answer, ’, ...
4       [global, markets, gold, prices, edged, today, ...
5       [BEIJING, Chinese, tech, giant, Huawei, announ...
6       [Mumbai, India, Incs, external, commercial, bo...
7       [Wednesday, Federal, Reserve, Chairman, Jerome...
8       [give, audience, already, done, Yeh, Hai, Aash...
9       [com, Arbaaz, Khan, spoke, getting, back, Daba...
10      [“, One, would, think, development, testing, p...
11      [far, year, rupee, gained, 07, foreign, invest...
12      [Xiaomi, however, sees, presence, Jio, rural, ...
13      [ad, reads, bells, whistles, Bezel, notch, app...
14      [Tuesday, Powell, said, healthy, US, economy, ...
15      [feature, help, make, display, responsive, int...
16      [TikTok, popular, among, children, facing, cri...
17      [compa

### Bag of words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(train['STORY'])

In [15]:
print(len(bow_transformer.vocabulary_))

44346


In [16]:
bow4 = bow_transformer.transform([train['STORY'][4]])
print(bow4)   # vectors pointing from origin
print(bow4.shape)

  (0, 98)	1
  (0, 458)	1
  (0, 4354)	1
  (0, 6127)	1
  (0, 7903)	1
  (0, 17114)	1
  (0, 19913)	1
  (0, 20711)	1
  (0, 21236)	1
  (0, 22284)	1
  (0, 23796)	1
  (0, 24786)	1
  (0, 25194)	1
  (0, 25607)	1
  (0, 26393)	1
  (0, 26400)	1
  (0, 26889)	1
  (0, 27416)	1
  (0, 28063)	1
  (0, 28740)	2
  (0, 28793)	2
  (0, 29335)	1
  (0, 32289)	3
  (0, 33750)	1
  (0, 34213)	1
  (0, 34771)	1
  (0, 35747)	1
  (0, 37546)	1
  (0, 37647)	1
  (0, 39826)	1
  (0, 41132)	2
  (0, 42868)	1
(1, 44346)


In [17]:
print(bow_transformer.get_feature_names()[32289])
print(bow_transformer.get_feature_names()[39826])

markets
stock


In [18]:
messages_bow = bow_transformer.transform(train['STORY'])

In [19]:
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

Shape of Sparse Matrix:  (7628, 44346)
Amount of Non-Zero occurences:  417825


In [20]:
sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))

print(f'sparsity: {sparsity}')

sparsity: 0.12351772521704532


In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(messages_bow)

# TEST
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 42868)	0.1653840117317503
  (0, 41132)	0.23264193296874802
  (0, 39826)	0.12464706628982292
  (0, 37647)	0.13059926000708844
  (0, 37546)	0.13533155309101896
  (0, 35747)	0.11016931263373622
  (0, 34771)	0.11197548409940615
  (0, 34213)	0.17507399610069765
  (0, 33750)	0.2329734713609702
  (0, 32289)	0.3411710195006078
  (0, 29335)	0.14164721721626908
  (0, 28793)	0.2878801405286177
  (0, 28740)	0.21695807780745277
  (0, 28063)	0.1342731302228393
  (0, 27416)	0.1771020108713445
  (0, 26889)	0.13365789947306533
  (0, 26400)	0.20124735910228528
  (0, 26393)	0.1288535100738698
  (0, 25607)	0.18575848290189373
  (0, 25194)	0.20712745650428582
  (0, 24786)	0.09579350098100424
  (0, 23796)	0.130962730415073
  (0, 22284)	0.14220199953463913
  (0, 21236)	0.09531611135230537
  (0, 20711)	0.18575848290189373
  (0, 19913)	0.14959653531880082
  (0, 17114)	0.17929921522347367
  (0, 7903)	0.15233170401746512
  (0, 6127)	0.11539264864412509
  (0, 4354)	0.14394007026430886
  (0, 458)	0.243657958

In [22]:
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])

7.74248747675401
8.33027414165613


In [23]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(7628, 44346)


In [24]:
messages_tfidf

<7628x44346 sparse matrix of type '<class 'numpy.float64'>'
	with 417825 stored elements in Compressed Sparse Row format>

In [26]:
#classifier
from sklearn.naive_bayes import MultinomialNB  
category_detect_model = MultinomialNB().fit(messages_tfidf, train['SECTION'])

In [28]:
print('predicted:', category_detect_model.predict(tfidf4)[0])
print('expected:', train['SECTION'][3])

predicted: 3
expected: 1


In [29]:
all_predictions = category_detect_model.predict(messages_tfidf)
print(all_predictions)

[3 0 3 ... 1 0 2]


In [31]:
from sklearn.metrics import classification_report
print (classification_report(train['SECTION'], all_predictions))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1686
           1       0.95      1.00      0.97      2772
           2       1.00      0.96      0.98      1924
           3       1.00      0.95      0.97      1246

    accuracy                           0.97      7628
   macro avg       0.98      0.97      0.97      7628
weighted avg       0.97      0.97      0.97      7628



### Using Pipeline

In [32]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [33]:
pipeline.fit(train['STORY'],train['SECTION'])

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x00000270C96390D0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [34]:
predictions = pipeline.predict(test['STORY'])
print(predictions)

[1 2 1 ... 1 0 1]


In [41]:
output = pd.DataFrame(predictions,columns=['SECTION'])
output

Unnamed: 0,SECTION
0,1
1,2
2,1
3,1
4,1
5,1
6,1
7,2
8,1
9,2


In [42]:
output.to_excel('output.xlsx',sheet_name='Sheet1',index=False)

In [39]:
output['STORY'] = test['STORY']
output

Unnamed: 0,SECTION,STORY
0,1,2019 will see gadgets like gaming smartphones ...
1,2,It has also unleashed a wave of changes in the...
2,1,It can be confusing to pick the right smartpho...
3,1,The mobile application is integrated with a da...
4,1,We have rounded up some of the gadgets that sh...
5,1,"""Imagine if every message you sent was kept wi..."
6,1,Positioned along the four sides of the Asus RO...
7,2,"In fact, when I applied to USC film school the..."
8,1,"As spotted by Android Police, Netflix is testi..."
9,2,Her moves were immaculately choreographed as s...


In [8]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer

In [9]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(),param_grid,verbose=3)

In [10]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', grid),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [13]:
pipeline.fit(train['STORY'],train['SECTION'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.561, total=  30.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   30.0s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.561, total=  30.8s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.0min remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.560, total=  29.4s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.391, total=  29.9s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.385, total=  29.3s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.387, total=  29.3s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.363, total=  29.2s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.363, total=  28.8s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.363, total=  29.3s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] .

[CV] ......... C=1000, gamma=1, kernel=rbf, score=0.963, total=  28.3s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV] ......... C=1000, gamma=1, kernel=rbf, score=0.958, total=  27.6s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV] ......... C=1000, gamma=1, kernel=rbf, score=0.964, total=  28.6s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV] ....... C=1000, gamma=0.1, kernel=rbf, score=0.972, total=  15.1s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV] ....... C=1000, gamma=0.1, kernel=rbf, score=0.964, total=  15.9s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV] ....... C=1000, gamma=0.1, kernel=rbf, score=0.971, total=  15.3s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV] ...... C=1000, gamma=0.01, kernel=rbf, score=0.973, total=  14.4s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 29.5min finished


Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x000001A3A6C5AF28>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w...
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
 

In [14]:
predictions = pipeline.predict(test['STORY'])
output_svm_quote = pd.DataFrame(predictions,columns=['SECTION'])
output_svm_quote

Unnamed: 0,SECTION
0,1
1,2
2,1
3,1
4,1
5,1
6,1
7,2
8,1
9,2


In [20]:
predictions = pipeline.predict(test['STORY'])
output_svm = pd.DataFrame(predictions,columns=['SECTION'])
output_svm

Unnamed: 0,SECTION
0,1
1,2
2,1
3,1
4,1
5,1
6,1
7,2
8,1
9,2


In [15]:
output_svm_quote.to_excel('output_svm_quote.xlsx',sheet_name='Sheet1',index=False)

In [26]:
output_svm_skim.to_excel('output_svm_skim.xlsx',sheet_name='Sheet1',index=False)

In [24]:
token = ['sadva','vavb','"',"'",'`','is','what']
[word for word in token if word.lower() not in (stopwords.words('english') and ['"',"'",'`','”','“'])]

['sadva', 'vavb', 'is', 'what']