###  Data set Link
#### https://archive.ics.uci.edu/ml/datasets/News+Aggregator

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('news_data.csv',nrows=10000)

In [3]:
data.shape

(10000, 8)

In [4]:
data.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [72]:
data[data['CATEGORY'] == 'e']

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
2169,2170,George Zimmerman Has an Armed Life on the Move,http://www.wltx.com/story/news/nation/2014/03/...,WLTX.com,e,d7RBEwyH92gFSrMjpl764nNfewB0M,www.wltx.com,1394517154092
2170,2171,George Zimmerman Signs Autographs At Florida G...,http://www.huffingtonpost.co.uk/2014/03/10/geo...,Huffington Post UK,e,d7RBEwyH92gFSrMjpl764nNfewB0M,www.huffingtonpost.co.uk,1394517154269
2171,2172,George Zimmerman Signed Autographs at an Orlan...,http://www.blacknews.com/news/george-zimmerman...,BlackNews.com \(press release\),e,d7RBEwyH92gFSrMjpl764nNfewB0M,www.blacknews.com,1394517154479
2172,2173,George Zimmerman back in controversy,http://www.wtxl.com/news/florida_news/george-z...,WTXL ABC 27,e,d7RBEwyH92gFSrMjpl764nNfewB0M,www.wtxl.com,1394517154639
2173,2174,George Zimmerman signs autographs at a Florida...,http://www.msnbc.com/the-last-word/zimmerman-s...,MSNBC,e,d7RBEwyH92gFSrMjpl764nNfewB0M,www.msnbc.com,1394517154831
...,...,...,...,...,...,...,...,...
7645,7646,PonoMusic to reportedly will kick off Kickstar...,http://www.talkingnewmedia.com/2014/03/10/pono...,Talking New Media,e,d4PJFa1co3Mku-MxjwCHHW7xLx1pM,www.talkingnewmedia.com,1394620447404
7646,7647,Pono music player to debut on Kickstarter,http://www.cnet.com.au/pono-music-player-to-de...,CNET Australia,e,d4PJFa1co3Mku-MxjwCHHW7xLx1pM,www.cnet.com.au,1394620447586
7647,7648,Neil Young's High Fidelity PonoPlayer To Launc...,http://www.ubergizmo.com/2014/03/neil-youngs-h...,Ubergizmo,e,d4PJFa1co3Mku-MxjwCHHW7xLx1pM,www.ubergizmo.com,1394620447780
7648,7649,Neil Young launching $399 high-fidelity audio ...,http://www.pocket-lint.com/news/127775-neil-yo...,Pocket-lint.com,e,d4PJFa1co3Mku-MxjwCHHW7xLx1pM,www.pocket-lint.com,1394620448042


In [5]:
news_data = data[['TITLE','PUBLISHER','CATEGORY']]

In [6]:
news_data.head()

Unnamed: 0,TITLE,PUBLISHER,CATEGORY
0,"Fed official says weak data caused by weather,...",Los Angeles Times,b
1,Fed's Charles Plosser sees high bar for change...,Livemint,b
2,US open: Stocks fall after Fed official hints ...,IFA Magazine,b
3,"Fed risks falling 'behind the curve', Charles ...",IFA Magazine,b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,Moneynews,b


In [7]:
news_data['CATEGORY'].value_counts()

e    3472
b    3449
t    2282
m     797
Name: CATEGORY, dtype: int64

In [8]:
news_data.isnull().sum()

TITLE        0
PUBLISHER    1
CATEGORY     0
dtype: int64

### Tokenization 

In [9]:
import string
punct = string.punctuation

In [None]:
pucnt

### Data Cleaning 

In [10]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [11]:
from spacy.lang.en.stop_words import STOP_WORDS

In [12]:
nlp = spacy.load('en')

In [13]:
stopwords = list(STOP_WORDS)

In [14]:
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    
    tokens = []
    for token in doc:
        if token.lemma_ != '-PRON-':
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
        
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [15]:
text_data_cleaning("  tis is the best in the Himansuh")

['tis', 'good', 'himansuh']

### Classification 

In [16]:
from sklearn.svm import LinearSVC

In [17]:
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
classifier = LinearSVC()

In [30]:
X = news_data['TITLE']
y = news_data['CATEGORY']

In [31]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=6)

In [32]:
X_train.shape, y_train.shape

((8000,), (8000,))

In [33]:
clf = Pipeline([('tfidf',tfidf),('clf',classifier)])

In [34]:
y_train.head()


8247    m
705     b
4451    b
4328    b
3882    e
Name: CATEGORY, dtype: object

In [35]:
clf.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

### one of: -- b : business -- t : science and technology -- e : entertainment -- m : health 

In [90]:
text = '''

'''

In [91]:
pred = clf.predict([text])
if pred == 'b':
    print("Business News")
elif pred == 't':
    print("Science and Technology")
elif pred == 'e':
    print("Entertainment")
elif pred == 'm':
    print("Health")

Health


In [51]:
accuracy_score(y_test,clf.predict(X_test))

0.9655

In [53]:
print(classification_report(y_test,clf.predict(X_test)))

              precision    recall  f1-score   support

           b       0.93      0.98      0.95       652
           e       1.00      0.98      0.99       709
           m       0.99      0.95      0.97       162
           t       0.96      0.93      0.94       477

   micro avg       0.97      0.97      0.97      2000
   macro avg       0.97      0.96      0.96      2000
weighted avg       0.97      0.97      0.97      2000



### Model Save 

In [66]:
from sklearn.externals import joblib

In [67]:
joblib.dump(clf,'news_classifier.pkl')

['news_classifier.pkl']