In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import nltk
# from nltk.tokenize import sent_tokenize
# from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
# English stop-word list from NLTK, held as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

In [2]:
# Load the dataset; lines=True parses the file as one JSON record per line.
news_df = pd.read_json('News_Category_Dataset.json', lines=True)

# Preview the first five rows as the cell output.
news_df.head(5)

Unnamed: 0,authors,category,date,headline,link,short_description
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ..."


In [3]:
# Merge raw categories that are near-duplicates of one another.
CATEGORY_ALIASES = {
    "THE WORLDPOST": "WORLDPOST",
    "ARTS": "ARTS & CULTURE",
    "COLLEGE": "EDUCATION",
}
# Categories without an alias (and missing values) pass through unchanged.
news_df['category'] = news_df['category'].map(lambda c: CATEGORY_ALIASES.get(c, c))

# Keep only rows whose short_description is strictly longer than this many
# characters.
MIN_DESCRIPTION_CHARS = 20
long_enough = news_df['short_description'].str.len() > MIN_DESCRIPTION_CHARS
# .copy() makes the filtered frame independent of the original, so adding the
# 'text' column below does not trigger SettingWithCopyWarning.
news_df = news_df.loc[long_enough].copy()

# Model input: headline and short description joined by a single space.
news_df['text'] = news_df['headline'] + " " + news_df['short_description']

# Treat empty strings as missing, then drop incomplete rows.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): dropna() looks at every column, so a row is dropped when ANY
# field is empty or missing, not only 'text' -- confirm this is intended.
news_df = news_df.replace("", np.nan).dropna(axis=0)

In [4]:
# Resources used by clean_text(): punctuation / symbol patterns, the stop-word
# set and the lemmatizer.
import re

# Raw strings keep the backslash escapes intact without relying on Python's
# deprecated handling of unknown escapes such as '\[' in normal strings.
# Separator-like punctuation: each match is replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, a lowercase ASCII letter, a space, '+' or
# '_': each match is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z +_]')
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def clean_text(text):
    """
    Normalise one document for vectorisation.

    The text is lower-cased, characters matched by REPLACE_BY_SPACE_RE become
    spaces, characters matched by BAD_SYMBOLS_RE are deleted, tokens found in
    STOPWORDS are dropped, and every remaining token is lemmatized.

    text: a string

    return: the surviving lemmatized tokens joined by single spaces

    NOTE(review): symbols such as apostrophes are deleted before the stop-word
    filter runs, so any stop word that contains one can no longer match --
    confirm this ordering is intended.
    """
    normalized = BAD_SYMBOLS_RE.sub('', REPLACE_BY_SPACE_RE.sub(' ', text.lower()))
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in normalized.split()
        if token not in STOPWORDS
    )
    
# Clean every document.
news_df['text'] = news_df['text'].apply(clean_text)
# Remove the digit runs that clean_text keeps (its symbol filter allows 0-9).
# regex=True is explicit because pandas 2.0 changed the default of
# Series.str.replace to literal matching, which would leave digits in place;
# the raw string avoids the invalid '\d' escape.
news_df['text'] = news_df['text'].str.replace(r'\d+', '', regex=True)

In [5]:
from io import StringIO
# Keep only the label and the cleaned text, dropping rows with missing text.
# .copy() yields an independent frame so the column assignment below does not
# trigger SettingWithCopyWarning.
col = ['category', 'text']
news_df = news_df.loc[news_df['text'].notna(), col].copy()

# Integer label per category; factorize() numbers categories in order of
# first appearance.
news_df['category_id'] = news_df['category'].factorize()[0]

# One row per (category, category_id) pair, ordered by id, plus lookup
# dictionaries in both directions.
category_id_df = (
    news_df[['category', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'category']].values)

news_df.head()

Unnamed: 0,category,text,category_id
0,CRIME,mass shooting texas last week tv left husban...,0
1,ENTERTAINMENT,smith join diplo nicky jam world cup official...,1
2,ENTERTAINMENT,hugh grant marries first time age actor longt...,1
3,ENTERTAINMENT,jim carrey blast castrato adam schiff democrat...,1
4,ENTERTAINMENT,julianna margulies us donald trump poop bag pi...,1


In [6]:
# Categories removed from the data before modelling.
EXCLUDED_CATEGORIES = [
    'WORLD NEWS', 'IMPACT', 'QUEER VOICES', 'LATINO VOICES', 'BLACK VOICES', 'FIFTY',
    'WEIRD NEWS', 'ENTERTAINMENT', 'GREEN', 'GOOD NEWS', 'COMEDY', 'PARENTS', 'WOMEN',
    'POLITICS', 'MEDIA', 'HEALTHY LIVING',
]

# Index by category so the excluded labels can be dropped by index label.
news_df = news_df.set_index("category")
news_df = news_df.drop(EXCLUDED_CATEGORIES, axis=0)

# Treat empty strings (e.g. text left empty after cleaning) as missing and
# drop those rows. np.nan is used because the np.NaN alias was removed in
# NumPy 2.0.
news_df = news_df.replace("", np.nan).dropna(axis=0)

In [7]:
# Move "category" out of the index and back into an ordinary column; the
# frame gets a new default index in its place.
news_df = news_df.reset_index(level="category")

In [15]:
# Number of articles per category in the current frame, most frequent first.
news_df['category'].value_counts()

WORLDPOST         3419
BUSINESS          3059
SPORTS            2891
ARTS & CULTURE    2117
RELIGION          1738
TASTE             1735
EDUCATION         1731
CRIME             1643
TRAVEL            1555
STYLE             1221
TECH               899
SCIENCE            879
Name: category, dtype: int64

In [8]:
# TF-IDF over unigrams and bigrams: sublinear term-frequency scaling, and
# terms appearing in fewer than 5 documents are ignored.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    encoding='latin-1',
    ngram_range=(1, 2),
)

# NOTE(review): .toarray() turns the sparse matrix into a dense array, which
# is large for a (documents x terms) matrix -- consider keeping it sparse if
# every consumer of `features` accepts sparse input.
features = tfidf.fit_transform(news_df['text']).toarray()
labels = news_df['category_id']

# (n_documents, n_terms)
features.shape

(22887, 13214)

In [9]:
# Rebuild the category <-> id lookups from the current frame.
# drop_duplicates() keeps one row per category (instead of one row per
# article), so category_id_df can be used directly for axis/tick labels.
category_id_df = (
    news_df[['category', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)
category_to_id = dict(category_id_df.values)
# Keep the reverse lookup in step with category_to_id.
id_to_category = dict(category_id_df[['category_id', 'category']].values)

In [14]:
from sklearn.feature_selection import chi2
# For every category, print the N unigrams and N bigrams with the highest
# chi-squared score against the one-vs-rest label for that category.
N = 2

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
feature_name_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
# The vocabulary does not change between categories, so build it once.
all_feature_names = np.asarray(feature_name_getter())

for category, category_id in sorted(category_to_id.items()):
    # chi2() returns (scores, p-values); only the scores are used here.
    chi2_scores = chi2(features, labels == category_id)[0]
    # Ascending by score, so the strongest terms are at the end.
    ranked_names = all_feature_names[np.argsort(chi2_scores)]
    unigrams = [name for name in ranked_names if len(name.split(' ')) == 1]
    bigrams = [name for name in ranked_names if len(name.split(' ')) == 2]
    print("# '{}':".format(category))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))


# 'ARTS & CULTURE':
  . Most correlated unigrams:
. artist
. art
  . Most correlated bigrams:
. stage door
. first nighter
# 'BUSINESS':
  . Most correlated unigrams:
. company
. business
  . Most correlated bigrams:
. business qa
. woman business
# 'CRIME':
  . Most correlated unigrams:
. cop
. police
  . Most correlated bigrams:
. police said
. police say
# 'EDUCATION':
  . Most correlated unigrams:
. college
. student
  . Most correlated bigrams:
. college student
. higher education
# 'HEALTHY LIVING':
  . Most correlated unigrams:
. cancer
. health
  . Most correlated bigrams:
. mental health
. gps guide
# 'MEDIA':
  . Most correlated unigrams:
. news
. fox
  . Most correlated bigrams:
. donald trump
. fox news
# 'RELIGION':
  . Most correlated unigrams:
. pope
. christian
  . Most correlated bigrams:
. pope francis
. daily meditation
# 'SCIENCE':
  . Most correlated unigrams:
. nasa
. scientist
  . Most correlated bigrams:
. scientist say
. solar system
# 'SPORTS':
  . Most correl

In [10]:
# train the classifier
# Naive Bayes Classifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Hold out a test split of the cleaned text and its category names
# (default split size, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    news_df['text'],
    news_df['category'],
    random_state=0,
)

# Token counts -> TF-IDF weights, both fitted on the training split only.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Multinomial Naive Bayes trained on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

In [11]:
# Make a prediction for one (already cleaned) sample document.
sample_docs = [
'unusual asteroid could interstellar guest solar system supposed interstellar immigrant located near jupiter atypical orbit'
]
# clf was trained on TF-IDF features, so the sample must go through the same
# two steps (counts, then TF-IDF) rather than being fed in as raw counts.
sample_tfidf = tfidf_transformer.transform(count_vect.transform(sample_docs))
print(clf.predict(sample_tfidf))

['SCIENCE']


In [None]:
from sklearn.svm import LinearSVC
# Linear SVM on the TF-IDF feature matrix with integer category ids as labels.
# random_state is fixed so repeated runs give the same model, matching the
# seeded split below.
model = LinearSVC(random_state=0)

# NOTE: this re-binds X_train / X_test / y_train / y_test, which the Naive
# Bayes cell defined from raw text and category names; from here on they hold
# TF-IDF rows and integer category ids.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, labels, news_df.index, test_size=0.33, random_state=0
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)




In [None]:
# from sklearn.metrics import confusion_matrix
# conf_mat=confusion_matrix(y_test, y_pred)

# fig, ax = plt.subplots(figsize=(10,10))
# sns.heatmap(conf_mat, annot=True, fmt='d',
#             xticklabels=category_id_df.category.values, 
#             yticklabels=category_id_df.category.values)
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()

In [None]:
# print the classification report for each class and the accuracy score
from sklearn import metrics
from sklearn.metrics import accuracy_score
# y_test / y_pred hold integer category ids, so pair every id with its name
# explicitly instead of relying on the row order of news_df['category'].
report_classes = (
    news_df[['category_id', 'category']]
    .drop_duplicates()
    .sort_values('category_id')
)
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=report_classes['category_id'].tolist(),
    target_names=report_classes['category'].tolist(),
))
# accuracy_score expects (y_true, y_pred).
print('accuracy %s' % accuracy_score(y_test, y_pred))