In [109]:
#import libraries
import os, io, re, tqdm, datetime, nltk #nltk mecbur olmasaz yuklemeyin

import pandas as pd, numpy as np, matplotlib as mpl, matplotlib.pyplot as plt, seaborn as sns

from torch import nn
from torchtext import vocab
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data import get_tokenizer
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim import Adam, Adagrad, RMSprop, SGD

#ignore warnings
import warnings
warnings.filterwarnings(action= 'ignore')

#set parameters for "Matplotlib"
mpl.rcParams['figure.figsize']  = [16, 9]

In [None]:
# pip install torch torchtext -f https://download.pytorch.org/whl/torch_stable.html
# pip uninstall torch torchtext -y
# pip install torch==2.1.0 torchtext==0.16.0

### About Dataset
#### AG's News Topic Classification Dataset
### ORIGIN
AG is a collection of more than 1 million news articles. News articles have been gathered from more than 2000 news sources by ComeToMyHead in more than 1 year of activity. ComeToMyHead is an academic news search engine which has been running since July, 2004. The dataset is provided by the academic comunity for research purposes in data mining (clustering, classification, etc), information retrieval (ranking, search, etc), xml, data compression, data streaming, and any other non-commercial activity. For more information, please refer to the link http://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .

The AG's news topic classification dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the dataset above. It is used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances in Neural Information Processing Systems 28 (NIPS 2015).

### DESCRIPTION
The AG's news topic classification dataset is constructed by choosing 4 largest classes from the original corpus. Each class contains 30,000 training samples and 1,900 testing samples. The total number of training samples is 120,000 and testing 7,600.

The file classes.txt contains a list of classes corresponding to each label.

The files train.csv and test.csv contain all the training samples as comma-sparated values. There are 3 columns in them, corresponding to class index (1 to 4), title and description. The title and description are escaped using double quotes ("), and any internal double quote is escaped by 2 double quotes (""). New lines are escaped by a backslash followed with an "n" character, that is "\n".

#### About this file

This file consists of 7600 testing samples of news articles that contain 3 columns. The first column is Class Id, the second column is Title and the third column is Description. 

**The class ids are numbered 1-4 where**
* 1 ------------> World
* 2 ------------> Sports
* 3 ------------> Business
* 4 ------------> Sci/Tech.

In [67]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
df = pd.concat([train_df, test_df], axis= 0)
df.index = range(df.shape[0])
df['text'] = 'The title of news is ' + df['Title'] + ' .The news are below ' +  df['Description']
df.rename({'Class Index': 'label'}, axis= 1, inplace= True)
df.drop(['Title', 'Description'], axis= 1, inplace= True)
df = df[['text', 'label']]
# df.to_csv('Final_df.csv', index= False)

In [148]:
class GetDataset:
    def __init__(self, dataset):
        self.label = dataset['label']
        self.text = dataset['text']
        
    def __len__(self):
        return len(self.label) #tells pytorch how many rows does dataframe have
    
    def __getitem__(self, idx): #__getitem__   men: __iter__
        label = self.label.iloc[idx]
        text = self.text.iloc[idx]
        return text, label
    
dataset = GetDataset(df)
dataset

<__main__.GetDataset at 0x2cbac4430>

## 📌 Key Parameters

### 🔹 Short Explanation

- **`vocab_size`**: Number of unique words (tokens) in your dataset's vocabulary.
- **`embed_dim`**: Size of each word embedding vector (e.g., 100, 300).
- **`num_class`**: Number of output categories (labels) for classification.

---

### 🔍 Detailed Explanation

#### 1. 🧾 `vocab_size` — Vocabulary Size
- Refers to the **total number of unique tokens** (words, subwords, or characters) in your dataset.
- After tokenization, you build a vocabulary.  
- Example: If you have 10,000 unique words, then `vocab_size = 10000`.
- In `nn.Embedding`, this tells PyTorch **how many rows** to create in the embedding matrix (one for each token).

---

#### 2. 📏 `embed_dim` — Embedding Dimension
- Each token is converted into a **dense vector** of fixed size.
- `embed_dim` defines **how many features** each word is represented by.
- Common values: `50`, `100`, `300`, `768` (used in BERT).
- If `embed_dim = 100`, then:
  - "king" → `[0.12, -0.45, ..., 0.33]` (a 100-dimensional vector)

✅ Embeddings help capture **semantic meaning** — similar words have similar vectors.

---

#### 3. 🎯 `num_class` — Number of Output Classes
- Defines the number of **categories/labels** your model predicts.
  - For binary classification → `num_class = 2`
  - For sentiment classification (pos/neg/neutral) → `num_class = 3`
  - For topic classification → `num_class = N`

- The final layer is often:
  ```python
  nn.Linear(in_features, num_class)


## Embedding Dimensonlarda reqem nece yaranir? (Embedding layer)

## Define the model

In [142]:
file_path = 'Final_df.csv'
tokenizer = get_tokenizer('basic_english')
def yield_tokens(file_path):
    with io.open(file_path) as f:
        for line in f:
            yield tokenizer(line.strip())
            
vocab = build_vocab_from_iterator(yield_tokens(file_path), specials= ['<unk>'])
vocab_size = len(vocab)

In [143]:
class NewsClassificationGRU(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(NewsClassificationGRU, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings= vocab_size, embedding_dim= embed_dim)
        self.gru_layer = nn.GRU(64, hidden_size= 128, num_layers= 1)  #RNN, GRU, LSTM - sequence tipli datalarla ishlemek ucun
                                                    #istifade olunur. Sequence - NLP ve Time Series
        self.dense_layer = nn.Linear(128, num_class)
        self.softmax_act_fc = nn.Softmax()
        
    def forward(self, X):
        X = self.embedding_layer(X)
        X = self.gru_layer(X)
        X = self.dense_layer(X)
        X = self.softmax_act_fc(X)
        return X
    
model = NewsClassificationGRU(vocab_size= vocab_size, embed_dim= 64, num_class= 4)
model

NewsClassificationGRU(
  (embedding_layer): Embedding(98628, 64)
  (gru_layer): GRU(64, 128)
  (dense_layer): Linear(in_features=128, out_features=4, bias=True)
  (softmax_act_fc): Softmax(dim=None)
)

In [149]:
train_size = int(df.shape[0] * 0.9)
train_subset, test_subset = random_split(dataset, lengths= [train_size, df.shape[0] - train_size])
train_data_loader = DataLoader(train_subset)
test_data_loader = DataLoader(test_subset)

## Train the model

In [152]:
model.train()

for (X, y) in test_data_loader:
    model(X)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not tuple

In [151]:
X

('The title of news is American Express to cut 2,000 jobs .The news are below American Express Co said it will cut 2,000 jobs, or 2.5 per cent of its work force, in a restructuring designed to save more than USD 75 million a year before taxes.',)