In [24]:
#import required libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [25]:
#import news dataset
news_data = pd.read_csv('uci-news-aggregator.csv')


In [26]:
news_data.columns

Index(['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME',
       'TIMESTAMP'],
      dtype='object')

In [27]:
#create a new data frame with only columns necessary
news_data = news_data[['TITLE','CATEGORY']].copy()
#reaname columns for ease
news_data.rename(columns={'TITLE':'headline',
                          'CATEGORY':'category',
                         }, 
                 inplace=True)

In [28]:
#using only the first 15000 data
news_data.head()

Unnamed: 0,headline,category
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b


In [29]:
#find all the unique categories
news_data['category'].unique()

array(['b', 't', 'e', 'm'], dtype=object)

In [30]:
#define le as LabelEncoder to encode the categories
le = preprocessing.LabelEncoder()

In [31]:
#encode category
news_data['category'] = le.fit_transform(news_data['category'])

In [32]:
news_data['category'].head(15)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
Name: category, dtype: int32

In [33]:
#print labels assigened to categories
list(le.inverse_transform([0,1, 2, 3]))

['b', 'e', 'm', 't']

In [34]:
print(news_data['headline'])

0         Fed official says weak data caused by weather,...
1         Fed's Charles Plosser sees high bar for change...
2         US open: Stocks fall after Fed official hints ...
3         Fed risks falling 'behind the curve', Charles ...
4         Fed's Plosser: Nasty Weather Has Curbed Job Gr...
                                ...                        
422414    Surgeons to remove 4-year-old's rib to rebuild...
422415    Boy to have surgery on esophagus after battery...
422416    Child who swallowed battery to have reconstruc...
422417    Phoenix boy undergoes surgery to repair throat...
422418    Phoenix boy undergoes surgery to repair throat...
Name: headline, Length: 422419, dtype: object


In [35]:
#seperating labels from data into new series
news_labels=news_data['category']

In [36]:
def removeNonAscii(s): return "".join(i for i in s if ord(i)>=0 and ord(i)<=128)

In [37]:
#import libraries and initiate required objects for processing
tokenizer = nltk.tokenize.TreebankWordTokenizer()
stop_words = stopwords.words("english")
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
tfidf = TfidfVectorizer(min_df =2,max_df=0.5,ngram_range=(1,2))
def process_text(text):
    #convert text to lowercase
    text= text.apply(lambda x:x.lower())
    #remove multiple spaces from text
    text= text.apply(lambda x:re.sub(' +', ' ',x))
    # remove alpha numeric characeter from text using regex
    text= text.apply(lambda x:x.lower())
    text.str.replace(r"\d+", "")
    text.str.replace('[^\w\s]','')
    text.str.replace(r"[︰-＠]", "")
    text.str.replace(r"", "")
    text.str.replace('\d+', '')
    #tokenize hadline
    text= text.apply(removeNonAscii)
    text = text.apply(tokenizer.tokenize)
    #remove english stop words form text
    text =text.apply(lambda x: [item for item in x if item not in stop_words])
    #stemming text
    text =text.apply(lambda x: [stemmer.stem(e) for e in x])
    #stripping spaces from items of array of texts
    text=text.apply(lambda x: [e.strip() for e in x])
    #remove all the characters from array of texts if the length of item is 1
    text=text.apply( lambda x: [ y for y in x if len(y)>1 ])
    #remove all the digits from the text
    text=text.apply( lambda x: [ y for y in x if not y.isdigit()] )
    # lemattizing text
    text = text.apply(lambda x: ' '.join(lemmatizer.lemmatize(token) for token in x))
    return text
    #encoding features using tfidf
    





In [38]:
#process news headline 
news_data = process_text(news_data['headline'])


In [39]:
news_data

0          fed offici say weak data caus weather slow taper
1         fed 's charl plosser see high bar chang pace t...
2           u open stock fall fed offici hint acceler taper
3               fed risk fall behind curv charl plosser say
4              fed 's plosser nasti weather curb job growth
                                ...                        
422414    surgeon remov 4-year-old 's rib rebuild damag ...
422415       boy surgeri esophagus batteri burn hole throat
422416    child swallow batteri reconstruct surgeri cinc...
422417    phoenix boy undergo surgeri repair throat dama...
422418    phoenix boy undergo surgeri repair throat dama...
Name: headline, Length: 422419, dtype: object

In [40]:
tfidf = TfidfVectorizer(min_df =2,max_df=0.5,ngram_range=(1,2))
tfidf = tfidf.fit(news_data)
news_data = tfidf.transform(news_data)
#news_data= pd.DataFrame(
    #news_data.todense(),
    #columns=tfidf.get_feature_names())

In [41]:
news_data.head()

AttributeError: head not found

In [42]:
#check the shape of labels and data if they have same shape
news_data.shape,news_labels.shape

((422419, 310528), (422419,))

In [43]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(news_data, news_labels, test_size=0.33)

In [44]:
X_train.head()

AttributeError: head not found

In [45]:
#importing required libraries
from sklearn.metrics import accuracy_score

In [None]:
#using SVM
from sklearn import svm
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train,y_train)
predictions_SVM = SVM.predict(X_test)
print("Acuuracy: ",accuracy_score(predictions_SVM, y_test)*100)

In [None]:
# using Naive Bayes
# Naive = naive_bayes.MultinomialNB()
# Naive.fit(X_train,y_train)
# # predictions_NB = Naive.predict(X_test)
# print("Acuuracy: ",accuracy_score(predictions_NB, y_test)*100)

In [46]:
#using logistic regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
predictions_clf = clf.predict(X_test)
print('Accuracy:',accuracy_score(predictions_clf, y_test)*100)




Accuracy: 94.56954497521502


In [None]:
import requests 
from bs4 import BeautifulSoup
import json
import datetime



url = "https://www.nytimes.com/"
r = requests.get(url)
now = datetime.datetime.now()
now = now.strftime('%A, %B %d, %Y  %I:%M %p')

r_html = r.text
soup = BeautifulSoup(r_html, "html.parser")

scripts = soup.find_all('script')
for script in scripts:
    if 'preloadedData' in script.text:
        jsonStr = script.text
        jsonStr = jsonStr.split('=', 1)[1].strip()
        jsonStr = jsonStr.rsplit(';', 1)[0]
        jsonObj = json.loads(jsonStr)


print ('%s\nHeadlines\n%s\n' %(url, now))
headline_array=[]
for ele, v in jsonObj['initialState'].items():
    try:
        if v['headline'] and v['__typename'] == 'PromotionalProperties':
            if (v['headline'] !='' and v['headline']!= 'none'):
                print(v)
                headline_array.append(v['headline'])
    except:
        continue
print(headline_array)

In [None]:
headline_df = pd.Series(headline_array)
headline_df.head()

In [None]:

headline_df = process_text(headline_df)

In [None]:
print(headline_df)

In [None]:
headline_df = tfidf.transform(headline_df)
headline_df= pd.DataFrame(
    headline_df.todense(),
    columns=tfidf.get_feature_names())

In [None]:
print(headline_df.shape)

In [None]:
headline_df


In [None]:
headline_df.head()

In [None]:
x = clf.predict(headline_df)

In [None]:
print(list(x))

In [None]:
names_replace_dict = {'b': 'Business',
                     'e':'Entertainment',
                     'm':'Health',
                      't':'Tech'
                }

In [None]:
def showCategory(headline_array,names_replace_dict,prediction):
    predicted_labels = le.inverse_transform(list(prediction))
    for i in range(0,len(headline_array)-1):
        print(f'{headline_array[i]} : {names_replace_dict[predicted_labels[i]]}')
    
        
    


In [None]:
showCategory(headline_array,names_replace_dict,x)