# import Section

In [8]:
import requests
import nltk
import pandas as pd
import numpy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation
from sklearn.metrics import accuracy_score
from sklearn import svm

# constant Section

In [2]:
#urls and topics
arts = "https://www.newsmaker.com.au/category/view/id/7/Release_page/"      #Art & Culture
sport = "https://www.newsmaker.com.au/category/view/id/149/Release_page/"   #Sport
food = "https://www.newsmaker.com.au/category/view/id/49/Release_page/"     #Food & Cuisine
travel = "https://www.newsmaker.com.au/category/view/id/185/Release_page/"  #Travel

# code Section

In [3]:
#to get page urls
def get_urls(current,count):
    urls = {}
    for kol in range(2,3000):
        if(count==0):
            break
        curr_url = current+str(kol)
        response = requests.get(curr_url)
        text_page = response.text
        n = len(text_page)
        res = 0
        for i in range(0,n):
            if(res==10):
                break
            if(text_page[i:i+13]=='<h3><a href="'):
                i+=13
                j = i
                while(text_page[i:i+2]!='">'):
                    i+=1
                new_url = "https://www.newsmaker.com.au"+text_page[j:i]
                if(new_url not in urls):
                    urls[new_url]=new_url
                    count-=1;
                else:
                    return urls
                if(count==0):
                    break
                res+=1
    return urls

In [4]:
#to delete HTML tags
def get_text(text):
    new_text = ""
    n = len(text)
    for i in range(0,n):
        if(text[i]=='>'):
            i+=1
            while(i<n and text[i]!='<'):
                new_text+=text[i]
                i+=1
    return new_text

In [5]:
#to get news text from page
def get_urls_text(urls):
    alltext={}
    for key in urls:
        url = urls[key]
        response = requests.get(url)
        text_page = response.text
        n = len(text_page)
        beg = -1
        the_end = -1
        for i in range(0,n):
            if(text_page[i:i+19]=='<div class="span8">'):
                beg = i+19
                i+=19
                while(text_page[i:i+6]!='</div>' and text_page[i:i+4]!='href'):
                    i+=1
                the_end = i
                break
        if(beg!=-1):
            curr_text = text_page[beg:the_end]
            curr_text = get_text(curr_text)
            if(len(curr_text)>200):
                alltext[key]=curr_text
    return alltext

In [6]:
#try to get 500 urls of pages from each category
arts_urls = get_urls(arts,500)
sport_urls = get_urls(sport,500)
food_urls = get_urls(food,500)
travel_urls = get_urls(travel,500)

In [7]:
#get news texts
arts_text = get_urls_text(arts_urls)
sport_text = get_urls_text(sport_urls)
food_text = get_urls_text(food_urls)
travel_text = get_urls_text(travel_urls)

In [8]:
#size of each category
print(len(arts_text))
print(len(sport_text))
print(len(food_text))
print(len(travel_text))

64
192
393
85


In [9]:
#to get dictionary with text and label
def add_list(fdict,mydict,target,size):
    for key in mydict:
        if(size==0):
            break
        size-=1
        fdict['text'].append(mydict[key])
        fdict['target'].append(target)

In [10]:
#Get dictionaries. Since the minimum size is 60 news, we specify 80 pages so that there is no strong class imbalance
data_dict = {}
data_dict['text']=[]
data_dict['target']=[]
add_list(data_dict,arts_text,'arts',80)
add_list(data_dict,sport_text,'sport',80)
add_list(data_dict,food_text,'food',80)
add_list(data_dict,travel_text,'travel',80)

In [11]:
#to dataFrame
data = pd.DataFrame(data_dict)

In [12]:
data.to_csv('text_data.csv') 
p = pd.DataFrame(data['target'].value_counts())
p

Unnamed: 0,target
sport,80
food,80
travel,80
arts,64


In [13]:
#preprocessing
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))
nltk.download('wordnet')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text) if w not in stopWords])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [14]:
data['text_lemmatized'] = data.text.apply(lemmatize_text)

In [15]:
#Checking
#length before deleting stop words and lemmatization
len(data['text'][0])

2367

In [16]:
#length after stopword removal and lemmatization
len(data['text_lemmatized'][0])

1687

In [17]:
#get samples
x = data['text_lemmatized']
y = data['target']
X_train, X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1,stratify=y)

In [18]:
#create and fit KNN and SGD
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('sgd_clf', SGDClassifier(random_state=42))])
knb_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knb_clf', KNeighborsClassifier(n_neighbors=10))])
sgd_ppl_clf.fit(X_train, y_train)
knb_ppl_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('knb_clf', KNeighborsClassifier(n_neighbors=10))])

## SGDClassifier from sklearn

In [19]:
predicted = sgd_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted, y_test))

              precision    recall  f1-score   support

        arts       0.77      0.77      0.77        13
        food       0.94      0.79      0.86        19
       sport       0.81      0.76      0.79        17
      travel       0.50      0.67      0.57        12

    accuracy                           0.75        61
   macro avg       0.75      0.75      0.75        61
weighted avg       0.78      0.75      0.76        61



## KNeighborsClassifier from sklearn

In [20]:
predicted = knb_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted, y_test))

              precision    recall  f1-score   support

        arts       0.85      0.73      0.79        15
        food       1.00      0.76      0.86        21
       sport       0.88      0.82      0.85        17
      travel       0.44      0.88      0.58         8

    accuracy                           0.79        61
   macro avg       0.79      0.80      0.77        61
weighted avg       0.85      0.79      0.80        61



In [21]:
#GridSearch
parameters = { 
              'sgd_clf__loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
              'sgd_clf__class_weight':[None, 'balanced'],
              'sgd_clf__penalty':[None, 'l2', 'l1', 'elasticnet'],
              'tfidf__strip_accents':['ascii', 'unicode', None],
               'tfidf__ngram_range':[(1,2), (1,3), (1,4)]
              }
model = GridSearchCV(sgd_ppl_clf, parameters, cv=4, n_jobs=-1).fit(X_train, y_train)

In [22]:
print('Best score and parameter combination:')
print(model.best_score_, model.best_params_) 

Best score and parameter combination:
0.7693306010928962 {'sgd_clf__class_weight': None, 'sgd_clf__loss': 'log', 'sgd_clf__penalty': 'l2', 'tfidf__ngram_range': (1, 2), 'tfidf__strip_accents': 'ascii'}


Conclusion: the accuracy is not high, the probable reason is the dataset is too small

# CNN/RNN and datasets from HuggingFace

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[?25l[K     |█                               | 10 kB 18.3 MB/s eta 0:00:01[K     |██                              | 20 kB 10.3 MB/s eta 0:00:01[K     |███                             | 30 kB 5.8 MB/s eta 0:00:01[K     |████                            | 40 kB 5.2 MB/s eta 0:00:01[K     |█████                           | 51 kB 4.0 MB/s eta 0:00:01[K     |██████                          | 61 kB 4.8 MB/s eta 0:00:01[K     |███████                         | 71 kB 4.7 MB/s eta 0:00:01[K     |████████                        | 81 kB 4.6 MB/s eta 0:00:01[K     |█████████                       | 92 kB 5.1 MB/s eta 0:00:01[K     |██████████                      | 102 kB 5.1 MB/s eta 0:00:01[K     |███████████                     | 112 kB 5.1 MB/s eta 0:00:01[K     |████████████                    | 122 kB 5.1 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 5.1 MB/s eta 0:00:01[

In [2]:
import random
import numpy as np
import gensim.downloader as api
import torch
import torch.nn as nn
import datasets

In [3]:
dataset = datasets.load_dataset("ag_news")
dataset["train"]

Downloading builder script:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset ag_news/default (download: 29.88 MiB, generated: 30.23 MiB, post-processed: Unknown size, total: 60.10 MiB) to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 120000
})

In [4]:
SEED = 0xDEAD
random.seed(SEED)
np.random.seed(SEED)
torch.random.manual_seed(SEED)
torch.cuda.random.manual_seed_all(SEED)

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [6]:
word2vec = api.load("glove-twitter-50")



In [9]:
MAX_LENGTH=128

tokenizer = nltk.WordPunctTokenizer()

dataset = dataset.map(
    lambda item: {
        "tokenized": tokenizer.tokenize(item["text"])[:MAX_LENGTH]
    }
)

  0%|          | 0/120000 [00:00<?, ?ex/s]

  0%|          | 0/7600 [00:00<?, ?ex/s]

In [10]:
word2idx = {word: idx for idx, word in enumerate(word2vec.index2word)}

In [11]:
def encode(word):
    if word in word2idx.keys():
        return word2idx[word]
    return word2idx["unk"]
dataset = dataset.map(
    lambda item: {
        "features": [encode(word) for word in item["tokenized"]]
    }
)

  0%|          | 0/120000 [00:00<?, ?ex/s]

  0%|          | 0/7600 [00:00<?, ?ex/s]

In [12]:
dataset = dataset.remove_columns(["text", "tokenized"])

In [13]:
dataset.set_format(type='torch')

In [14]:
def collate_fn(batch):
    max_len = max(len(row["features"]) for row in batch)
    input_embeds = torch.empty((len(batch), max_len), dtype=torch.long)
    labels = torch.empty(len(batch), dtype=torch.long)
    for idx, row in enumerate(batch):
        to_pad = max_len - len(row["features"])
        input_embeds[idx] = torch.cat((row["features"], torch.zeros(to_pad)))
        labels[idx] = row["label"]
    return {"features": input_embeds, "labels": labels}

In [15]:
from torch.utils.data import DataLoader

loaders = {
    k: DataLoader(
        ds, shuffle=(k=="train"), batch_size=32, collate_fn=collate_fn
    ) for k, ds in dataset.items()
}

## CNN

In [16]:
class CNNModel(nn.Module):
    def __init__(self, embed_size, hidden_size, num_classes=4):
        super().__init__()
        self.embeddings = nn.Embedding(len(word2idx), embedding_dim=embed_size)
        self.cnn = nn.Sequential(
            nn.Conv1d(embed_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
        )
        self.cl = nn.Sequential(
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        x = self.embeddings(x)  # (batch_size, seq_len, embed_dim)
        x = x.permute(0, 2, 1)
        x = self.cnn(x)
        prediction = self.cl(x)
        return prediction

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CNNModel(word2vec.vector_size, 50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

num_epochs = 1

In [18]:
from tqdm.notebook import tqdm, trange


def training(model, criterion, optimizer, num_epochs, loaders, max_grad_norm=2):
    for e in trange(num_epochs, leave=False):
        model.train()
        num_iter = 0
        pbar = tqdm(loaders["train"], leave=False)
        for batch in pbar:
            optimizer.zero_grad()
            input_embeds = batch["features"].to(device)
            labels = batch["labels"].to(device)
            prediction = model(input_embeds)
            loss = criterion(prediction, labels)
            loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            num_iter += 1
        valid_loss = 0
        valid_acc = 0
        num_iter = 0
        model.eval()
        with torch.no_grad():
            correct = 0
            num_objs = 0
            for batch in loaders["test"]:
                input_embeds = batch["features"].to(device)
                labels = batch["labels"].to(device)
                prediction = model(input_embeds)
                valid_loss += criterion(prediction, labels)
                correct += (labels == prediction.argmax(-1)).float().sum()
                num_objs += len(labels)
                num_iter += 1

        print(f"Valid Loss: {valid_loss / num_iter}, accuracy: {correct/num_objs}")

In [19]:
training(model, criterion, optimizer, num_epochs, loaders)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3750 [00:00<?, ?it/s]

Valid Loss: 0.42392584681510925, accuracy: 0.8565788865089417
