# **1- DATA COLLECTION**

**Import packages**

In [20]:
%matplotlib inline
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
# LabelEncoder allows us to assign ordinal levels to categorical data
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [3]:
df = pd.read_csv('newsdataset.csv')
df.head(10)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027
5,6,Plosser: Fed May Have to Accelerate Tapering Pace,http://www.nasdaq.com/article/plosser-fed-may-...,NASDAQ,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.nasdaq.com,1394470372212
6,7,Fed's Plosser: Taper pace may be too slow,http://www.marketwatch.com/story/feds-plosser-...,MarketWatch,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.marketwatch.com,1394470372405
7,8,Fed's Plosser expects US unemployment to fall ...,http://www.fxstreet.com/news/forex-news/articl...,FXstreet.com,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.fxstreet.com,1394470372615
8,9,US jobs growth last month hit by weather:Fed P...,http://economictimes.indiatimes.com/news/inter...,Economic Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,economictimes.indiatimes.com,1394470372792
9,10,ECB unlikely to end sterilisation of SMP purch...,http://www.iii.co.uk/news-opinion/reuters/news...,Interactive Investor,b,dPhGU51DcrolUIMxbRm0InaHGA2XM,www.iii.co.uk,1394470501265


In [4]:
df.CATEGORY.value_counts()

e    152469
b    115967
t    108344
m     45639
Name: CATEGORY, dtype: int64

In [5]:
import re
def normalize_text(s):
    s = s.lower()
    
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W\s',' ',s)
    
    # make sure we didn't introduce any double spaces
    s = re.sub('\s+',' ',s)
    
    return s

# **2- DATA PREPROCESSING**

In [6]:
df['TEXT'] = [normalize_text(s) for s in df['TITLE']]

In [7]:
df.columns

Index(['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME',
       'TIMESTAMP', 'TEXT'],
      dtype='object')

**Feature Extraction**

In [9]:
# pull the data into vectors
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df['TEXT'])

In [11]:
encoder = LabelEncoder()
y = encoder.fit_transform(df['CATEGORY'])

In [13]:
#split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# **4 - ML ALGORITHM**

In [15]:
# the Naive Bayes model
nb = MultinomialNB()
nb.fit(x_train, y_train)

MultinomialNB()

In [16]:
nb.score(x_test, y_test)

0.9257966005397472

In [21]:
# Logistic Regression
# Instantiate the classifier: clf

clf = OneVsRestClassifier(LogisticRegression())
clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

OneVsRestClassifier(estimator=LogisticRegression())

In [22]:
# Print the accuracy
clf.score(x_test, y_test)

0.9475166895506841

# **5- SUMMARY OF PERFORMANCES**

In [None]:
# Assessment of Performance on other Dataset

# **5-1 PREDICTIONS BASED ON MOST REPRESENTED CATEGORIES**

In [18]:
# Function to predict category from a direct tittle:
def predict_categories(title):
    cat_names = {'b' : 'Business', 't' : 'Science and Technology', 'e' : 'Entertainment', 'm' : 'Health'}
    codes_p = nb.predict(vectorizer.transform([title]))
    return cat_names[encoder.inverse_transform(codes_p)[0]]

In [19]:
print(predict_categories("stocks are on the rise"))
print(predict_categories("chicken eggs and cholesterol"))
print(predict_categories("corporate tax cut"))
print(predict_categories("investigations of classified leaks"))

Business
Health
Business
Science and Technology


In [29]:
raw_data = {
        'news_titles': [
            'stocks are on the rise', 
            'chicken eggs and cholesterol', 
            'corporate tax cut', 
            'investigations of leaks', 
        'Here’s What to Do If You Find an Embarrassing Video of Yourself on Instagram or tiktok',
        'Twitter Bets on New Data Business Product to Revive Revenue',
        'It was a contreversial decision to start war in Syria'
        ],
        
    
    
    'categories_actual': ['Business', 'Health', 'Business', 'Science and Technology', 'Science and Technology','Science and Technology',
               'Entertainment']}

df_a = pd.DataFrame(raw_data, columns = ['news_titles', 'categories_actual'])
df_a

Unnamed: 0,news_titles,categories_actual
0,stocks are on the rise,Business
1,chicken eggs and cholesterol,Health
2,corporate tax cut,Business
3,investigations of leaks,Science and Technology
4,Here’s What to Do If You Find an Embarrassing Video of Yourself on Instagram or tiktok,Science and Technology
5,Twitter Bets on New Data Business Product to Revive Revenue,Science and Technology
6,It was a contreversial decision to start war in Syria,Entertainment


In [30]:
#pd.set_option("display.max_columns", 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
df_a["Categories_predicted"] = df_a["news_titles"].apply(predict_categories)
df_a

  pd.set_option('max_colwidth', -1)


Unnamed: 0,news_titles,categories_actual,Categories_predicted
0,stocks are on the rise,Business,Business
1,chicken eggs and cholesterol,Health,Health
2,corporate tax cut,Business,Business
3,investigations of leaks,Science and Technology,Science and Technology
4,Here’s What to Do If You Find an Embarrassing Video of Yourself on Instagram or tiktok,Science and Technology,Entertainment
5,Twitter Bets on New Data Business Product to Revive Revenue,Science and Technology,Business
6,It was a contreversial decision to start war in Syria,Entertainment,Business


# **6- DEPLOYMENT USING STREAMLIT**