In [2]:
# Imports are intentionally not wrapped in try/except. Printing the error and
# carrying on skipped every import after the failing one and only surfaced
# later as unrelated NameErrors; a failed import now stops the notebook here.
import json
import os
import re
import string

import nltk
import pandas as pd
import seaborn as sns
import spacy
# Never referenced by name; presumably required for the `.swifter` accessor
# that later cells call on Series.
import swifter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob
from tqdm import tqdm

tqdm.pandas()

In [3]:
import nltk

# Fetch each NLTK data package in turn; the package ids are unchanged.
for nltk_package in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_package)

In [4]:
# lines=True: the file is read as one JSON object per line.
dataset_file = "News_Category_Dataset_v3.json"
df = pd.read_json(dataset_file, lines=True)

In [5]:
# Heatmap of the boolean "is missing" mask of the whole DataFrame.
missing_mask = df.isnull()
sns.heatmap(
    missing_mask,
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

In [6]:
# Bar chart of the number of rows per category.
category_counts = df['category'].value_counts()
category_counts.plot(kind='bar', figsize=(15, 10))

In [7]:
# Optional: uncomment to keep only the first 6000 rows for a faster trial run.
#df = df.head(6000)


In [8]:
# Show the column labels of the loaded DataFrame.
df.columns


In [9]:
# Summary statistics as produced by DataFrame.describe().
df.describe()


In [10]:
# Number of missing values in each column.
df.isna().sum()


In [11]:
# Peek at the first two rows.
df.head(2)


In [12]:
# Distinct values found in the 'category' column.
df['category'].unique()


In [13]:
# These snippets of code are taken from
# https://github.com/ArmandDS/news_category/blob/master/News_Analysis_AO.ipynb

# Shared objects used by the text-cleaning helpers.
wn = WordNetLemmatizer()
stop_words_ = set(stopwords.words('english'))
# Extra words to drop in addition to the NLTK English stop-word list.
my_sw = [
    'make', 'amp', 'news', 'new', 'time',
    'u', 's', 'photos', 'get', 'say',
]

# Built once at definition time instead of on every call to black_txt.
_PUNCTUATION = frozenset(string.punctuation)

def black_txt(token):
    """Return True when ``token`` should be kept as a feature word.

    A token is kept only if it is longer than two characters and is not in
    ``stop_words_``, not a punctuation character from ``string.punctuation``,
    and not one of the extra words in ``my_sw``.
    """
    # Cheapest test first; none of the checks has side effects, so the order
    # does not change the result.
    return (
        len(token) > 2
        and token not in _PUNCTUATION
        and token not in stop_words_
        and token not in my_sw
    )

def clean_txt(text):
    """Normalise raw text into a space-separated string of kept lemmas.

    Steps: drop apostrophes, collapse every run of digits / non-word
    characters into a single space, lower-case, tokenize, lemmatize each kept
    token as a verb, then filter again with ``black_txt`` because the
    lemmatized form is a different string from the token that was checked.

    Returns "" for non-string input (e.g. None or NaN) and for text that is
    empty once digits and non-word characters have been stripped.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub("'", "", text)
    text = re.sub(r"(\d|\W)+", " ", text)
    if not text.strip():
        # Nothing left to tokenize.
        return ""
    lemmas = [
        wn.lemmatize(word, pos="v")
        for word in word_tokenize(text.lower())
        if black_txt(word)
    ]
    return " ".join(word for word in lemmas if black_txt(word))

In [14]:
def subj_txt(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def polarity_txt(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def len_text(text):
    """Ratio of distinct cleaned words to whitespace-separated raw words.

    Returns 0 when ``text`` contains no words.
    """
    raw_words = text.split()
    if not raw_words:
        return 0
    distinct_clean = set(clean_txt(text).split())
    return len(distinct_clean) / len(raw_words)

In [15]:
# Build one text field per row. Missing values are replaced with "" so the
# string concatenation never yields NaN.
df['text'] = df['headline'].fillna("") + " " + df['short_description'].fillna("")

df['text'] = df['text'].swifter.apply(clean_txt)
# NOTE(review): sentiment is computed on the cleaned text (the output of
# clean_txt), not on the original sentence - confirm that this is intended.
df['polarity'] = df['text'].swifter.apply(polarity_txt)
df['subjectivity'] = df['text'].swifter.apply(subj_txt)
# Character count of the cleaned text; the vectorised .str.len() replaces a
# per-row lambda around len().
df['len'] = df['text'].str.len()

In [16]:
X = df[['text', 'polarity', 'subjectivity', 'len']]
y = df['category']

# Encode the category names as integer class ids.
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# A fixed random_state makes the stratified 80/20 split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Class id -> category name. encoder.classes_ is indexed by class id, so the
# mapping does not need a pass over every row of the DataFrame.
v = dict(enumerate(encoder.classes_.tolist()))

In [17]:
# Token counts -> TF-IDF weighting -> multinomial Naive Bayes.
# The leading "..." doctest continuation prompts were removed: as plain Python
# they parse as calls on the Ellipsis object and fail at runtime.
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer="word", stop_words="english")),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB(alpha=.01)),
])

In [18]:
# Fit on the 'text' column only; the labels are the encoded class ids.
train_docs = x_train['text'].to_list()
train_labels = list(y_train)
text_clf.fit(train_docs, train_labels)


In [19]:
import numpy as np


In [20]:
# Held-out documents and their encoded labels as plain Python lists.
X_TEST, Y_TEST = x_test['text'].tolist(), list(y_test)

In [21]:
# Predict an encoded class id for every held-out document.
predicted = text_clf.predict(X_TEST)


In [22]:
# Show the first two test documents with their predicted category names.
# Slicing both sequences replaces the manual counter and early `break`.
for doc, category in zip(X_TEST[:2], predicted[:2]):
    print("-"*55)
    print(doc)
    print(v[category])
    print("-"*55)


In [23]:
# Accuracy: share of test documents whose predicted id equals the true id.
(predicted == np.asarray(Y_TEST)).mean()


In [24]:
# A single new headline to classify.
docs_new = ['Ten Months After George Floyd’s Death, Minneapolis Residents Are at War Over Policing']


In [25]:
# Run the new text through the same clean_txt preprocessing that was applied
# to the training text; the model was fitted on cleaned text, not raw text.
predicted = text_clf.predict([clean_txt(doc) for doc in docs_new])


In [26]:
# Translate the first predicted class id into its category name via `v`.
v[predicted[0]]


In [27]:
import pickle

# Persist the fitted pipeline to disk.
# NOTE(review): only the pipeline is written; the id -> category mapping `v`
# is not part of this file.
model_path = 'classify_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(text_clf, model_file)


In [28]:
# Reload the pipeline that was written to 'classify_model.pkl'.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('classify_model.pkl', 'rb') as model_file:
    clf2 = pickle.load(model_file)

In [29]:
docs_new = ['Please do your magic✨and help me find a room in a flat in Sattva Greenage, Bangalore.']
# Clean the text with clean_txt, as was done for the training text, before
# handing it to the reloaded pipeline.
predicted = clf2.predict([clean_txt(doc) for doc in docs_new])


In [30]:
# Category name for the reloaded model's first prediction.
v[predicted[0]]

In [31]:
# Print the full class id -> category name mapping.
print(v)