In [1]:
import pandas as pd 
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK stop-word corpus used during text cleaning.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hemantsah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the combined news dataset; the path is relative to the working directory.
data = pd.read_csv("combined_cat_news_data.csv")

In [3]:
# Peek at the first rows to check which columns were loaded.
data.head()

Unnamed: 0,category,headline,short_description
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,The actor and his longtime girlfriend Anna Ebe...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,The actor gives Dems an ass-kicking for not fi...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,"The ""Dietland"" actress said using the bags is ..."


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(181140, 3)

In [5]:
# Merge headline and summary into a single text field.
# Missing values are replaced with '' first: concatenating a string with NaN
# yields NaN for the whole description, and the regex cleaning further down
# expects a str for every row.
data['description'] = (
    data['headline'].fillna('') + ' ' + data['short_description'].fillna('')
)

In [6]:
# Spot-check the merged text for the first row.
data['description'][0]

'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV She left her husband. He killed their children. Just another day in America.'

In [7]:
# Work on a copy so that later changes to `df` do not modify `data`.
df = data.copy()

In [8]:
# Confirm the copy carries the merged description column.
df.head()

Unnamed: 0,category,headline,short_description,description
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 5...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...


In [9]:
# Keep only the label and the merged text. Deriving the result from `data`
# (instead of dropping in place on `df`) makes this cell safe to re-run: an
# in-place drop raises KeyError the second time because the columns are gone.
df = data.drop(columns=['headline', 'short_description'])

In [10]:
# Verify that the headline and short_description columns are gone.
df.head()

Unnamed: 0,category,description
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 5...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...


In [11]:
# Build `corpus`: one cleaned, stemmed, space-joined string per document.
ps = PorterStemmer()

# Load the stop words once and keep them in a set. Calling
# stopwords.words('english') inside the comprehension rebuilt the list for
# every single token, and testing membership in a list is a linear scan.
english_stopwords = set(stopwords.words('english'))

# Only the first quarter of the rows is processed; floor division gives the
# same count as the previous int(len(df) / 4).
n_rows = len(df) // 4

corpus = []
for text in df['description'].iloc[:n_rows]:
    # Replace everything that is not an ASCII letter with a space, then
    # lowercase and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Stop words are filtered on the surface form, before stemming.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

In [12]:
# Preview a few cleaned documents instead of echoing the entire list into the
# notebook output.
corpus[:10]

['mass shoot texa last week tv left husband kill children anoth day america',
 'smith join diplo nicki jam world cup offici song cours song',
 'hugh grant marri first time age actor longtim girlfriend anna eberstein tie knot civil ceremoni',
 'jim carrey blast castrato adam schiff democrat new artwork actor give dem ass kick fight hard enough donald trump',
 'julianna marguli use donald trump poop bag pick dog dietland actress said use bag realli cathart therapeut moment',
 'morgan freeman devast sexual harass claim could undermin legaci right equat horrif incid sexual assault misplac compliment humor said statement',
 'donald trump lovin new mcdonald jingl tonight show bit catchi right',
 'watch amazon prime new week great mini seri join week',
 'mike myer reveal like fourth austin power film myer kid may push new power film anyon',
 'watch hulu new week get recent academi award win movi',
 'justin timberlak visit texa school shoot victim pop star also wore santa fe strong shirt show 

In [13]:
# Number of cleaned documents collected in `corpus`.
len(corpus)

45285

In [14]:
# Count of distinct category labels.
df['category'].nunique()

18

In [15]:
from sklearn.preprocessing import LabelEncoder

# Map each category name to an integer code and store it next to the text.
# The fitted encoder stays available as `le` (its classes_ attribute holds the
# original names in code order).
le = LabelEncoder()
df['encoded_cat'] = le.fit_transform(df['category'])

In [16]:
# Inspect the frame, which now includes the encoded_cat column.
df

Unnamed: 0,category,description,encoded_cat
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,2
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,4
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 5...,4
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,4
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,4
...,...,...,...
181135,SCIENCE,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,9
181136,SPORTS,Maria Sharapova Stunned By Victoria Azarenka I...,10
181137,SPORTS,"Giants Over Patriots, Jets Over Colts Among M...",10
181138,SPORTS,Aldon Smith Arrested: 49ers Linebacker Busted ...,10


In [17]:
# Class distribution of the encoded labels, most frequent first.
df['encoded_cat'].value_counts()

15    30649
7     29578
4     20162
13    14962
6     12232
11    11204
17    10183
12     9408
5      8157
14     7077
1      6783
10     4221
9      3856
0      3221
16     3102
2      2675
8      1857
3      1813
Name: encoded_cat, dtype: int64

In [18]:
# Create the bag-of-words model: each document becomes a vector of token
# counts over a vocabulary capped at 17000 terms.

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=17000)
# NOTE(review): .toarray() converts the sparse count matrix into a dense array
# of len(corpus) x 17000 entries, which is memory-hungry. The sparse result of
# fit_transform may be sufficient for the steps below — confirm before changing,
# since the inspection cells that index X assume a dense array.
X = cv.fit_transform(corpus).toarray()

In [19]:
# Count vector of the first document.
X[0]

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
# Length of one document vector, i.e. the number of features kept.
len(X[0])

17000

In [21]:
# Labels for exactly the rows that were vectorized. Slicing by the row count
# of X (rather than a hard-coded 45285) keeps X and y aligned if the number of
# processed documents ever changes.
y = df['encoded_cat'].iloc[:X.shape[0]]

In [22]:
# Number of labels; this should equal the number of rows in X.
len(y)

45285

In [23]:
# cv.get_feature_names_out()  # learned vocabulary; get_feature_names() was removed in scikit-learn 1.2

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [25]:
# Release the full feature matrix, the labels and the text corpus to free
# kernel memory; the cells below work on the train/test splits.
del X, y, corpus


In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training split.
NB = MultinomialNB().fit(X_train, y_train)


In [None]:
# Predict encoded category labels for the held-out split.
y_pred=NB.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score


In [None]:
# scikit-learn's signature is accuracy_score(y_true, y_pred). Accuracy is
# symmetric, so the value is unchanged, but the documented order avoids a
# silent error if this call is later swapped for an asymmetric metric.
accuracy_score(y_test, y_pred)