In [30]:
# Import spaCy and load the previously downloaded 'en_core_web_lg' language model.
# The resulting `nlp` pipeline object is reused by the later cells.
import spacy
nlp = spacy.load('en_core_web_lg')

In [31]:
# Import the pandas library.

import pandas as pd

# Read the news dataset into the dataframe "df".
# NOTE: the path read here is 'news_dataset (1).json' (the exercise text calls it "news_dataset.json").

df = pd.read_json('news_dataset (1).json')

# Print the shape of the data as (rows, columns).
print(df.shape)

# Show the top 5 rows.
df.head()

(7500, 2)


Unnamed: 0,text,category
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS
3,This Richard Sherman Interception Literally Sh...,SPORTS
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS


In [32]:
# Count how many rows carry each category label
df['category'].value_counts()

category
CRIME       2500
SPORTS      2500
BUSINESS    2500
Name: count, dtype: int64

In [33]:

# Add the new column "label_num", which gives a unique number to each category label.
# 'SCIENCE' is kept in the mapping for compatibility, although the value counts
# printed earlier show only BUSINESS, SPORTS and CRIME in this dataset.

label_to_num = {
    'BUSINESS': 0,
    'SPORTS': 1,
    'CRIME': 2,
    'SCIENCE': 3,
}
df['label_num'] = df.category.map(label_to_num)

# Series.map() silently yields NaN for any category missing from the mapping,
# which would only surface later as a confusing error when fitting a classifier.
# Fail fast here instead.
unmapped = df.loc[df['label_num'].isna(), 'category'].unique()
assert len(unmapped) == 0, f"categories without a numeric label: {list(unmapped)}"

# Check the results with the top 5 rows
df.head()

Unnamed: 0,text,category,label_num
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,2
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,2
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,1
3,This Richard Sherman Interception Literally Sh...,SPORTS,1
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,0


In [34]:
#1. Remove the stop words
#2. Convert to base form using lemmatisation

def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatise what is left.

    The text is run through the module-level spaCy pipeline `nlp`. Every token
    flagged as a stop word or as punctuation is dropped, and the lemmas of the
    remaining tokens are joined with single spaces, in their original order.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return ' '.join(lemmas)

In [35]:

# Build the "preprocessed_text" column by passing every entry of the "text" column
# through preprocess() (stop words and punctuation removed, remaining tokens lemmatised).

df['preprocessed_text'] = df['text'].apply(preprocess)

In [36]:
# Show the first 5 rows, now including the cleaned text
df.head(n=5)

Unnamed: 0,text,category,label_num,preprocessed_text
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,2,Larry Nassar blame victim say victimize newly ...
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,2,woman Beats Cancer die fall horse
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,1,vegas taxpayer spend Record $ 750 million New ...
3,This Richard Sherman Interception Literally Sh...,SPORTS,1,Richard Sherman Interception literally shake W...
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,0,7 thing totally kill Weed Legalization Buzz


In [37]:
# Run each cleaned text through the spaCy pipeline and keep the resulting
# document vector per row in a new "vector" column
df['vector'] = df['preprocessed_text'].apply(lambda clean_text: nlp(clean_text).vector)

# Show the first 5 rows
df.head(n=5)

Unnamed: 0,text,category,label_num,preprocessed_text,vector
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,2,Larry Nassar blame victim say victimize newly ...,"[-0.5585511, -0.29323253, -0.9253956, 0.189389..."
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,2,woman Beats Cancer die fall horse,"[-0.73039824, -0.43196002, -1.2930516, -1.0628..."
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,1,vegas taxpayer spend Record $ 750 million New ...,"[-1.9413117, 0.121578515, -3.2996283, 1.511650..."
3,This Richard Sherman Interception Literally Sh...,SPORTS,1,Richard Sherman Interception literally shake W...,"[-1.4702771, -0.685319, 0.57398, -0.31135806, ..."
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,0,7 thing totally kill Weed Legalization Buzz,"[-1.037173, -1.9495698, -1.7179357, 1.2975286,..."


In [38]:
# train_test_split
from sklearn.model_selection import train_test_split


In [39]:
# Features are the per-row embedding arrays, targets are the numeric labels
X = df['vector'].values
y = df['label_num']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2022,
)

In [43]:
(X_train.shape, y_train.shape)

((6000,), (6000,))

In [52]:
# import numpy as np

import numpy as np

# X_train / X_test are 1-D containers whose elements are the individual embedding
# arrays; numpy's 'stack' function joins them into 2-D matrices, stored in the
# new variables "X_train_2d" and "X_test_2d"

X_train_2d, X_test_2d = (np.stack(split) for split in (X_train, X_test))

In [53]:
(X_train_2d.shape, y_train.shape)

((6000, 300), (6000,))

# Attempt 1:

Use spaCy GloVe embeddings for text vectorization.

Use a Decision Tree as the classifier.

Print the classification report.

In [54]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report


#1. create a Decision Tree model object
#   random_state is fixed (same seed as the train/test split) because the tree
#   randomly permutes the features at each split; without it, every run of this
#   cell can print a slightly different report.

model = DecisionTreeClassifier(random_state=2022)

#2. fit with the stacked training embeddings (X_train_2d) and y_train

model.fit(X_train_2d, y_train)


#3. get the predictions for the stacked test embeddings (X_test_2d) and store them in d_pre

d_pre = model.predict(X_test_2d)

#4. print the classification report

print(classification_report(y_test, d_pre))

              precision    recall  f1-score   support

           0       0.77      0.74      0.76       535
           1       0.71      0.73      0.72       479
           2       0.72      0.74      0.73       486

    accuracy                           0.73      1500
   macro avg       0.73      0.73      0.73      1500
weighted avg       0.74      0.73      0.73      1500



# Attempt 2:

Use spaCy GloVe embeddings for text vectorization.

Use MultinomialNB as the classifier after applying MinMaxScaler.

Print the classification report.

In [57]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report


# MultinomialNB does not accept negative feature values, so the embeddings are
# rescaled to [0, 1]. The scaler is fitted on the training split only; clip=True
# keeps held-out values that fall outside the training min/max inside [0, 1]
# instead of letting them come out negative or above 1.
scaler = MinMaxScaler(clip=True)
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

#1. create a MultinomialNB model object
clf = MultinomialNB()

#2. fit with the scaled training embeddings and y_train
clf.fit(scaled_train_embed, y_train)


#3. get the predictions for the scaled test embeddings and store them in y_pred
y_pred = clf.predict(scaled_test_embed)


#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.81      0.83       535
           1       0.82      0.84      0.83       479
           2       0.85      0.87      0.86       486

    accuracy                           0.84      1500
   macro avg       0.84      0.84      0.84      1500
weighted avg       0.84      0.84      0.84      1500



(6000,)