In [15]:
# Pull down the raw CSVs from GitHub
!wget -q https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv
!wget -q https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv

In [16]:
import pandas as pd

# Column layout shared by both header-less CSV files.
AG_NEWS_COLUMNS = ["label", "title", "description"]


def load_ag_news(path):
    """Read one header-less AG News CSV and add a combined ``text`` column.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file. Its three columns are read as
        ``label``, ``title`` and ``description``.

    Returns
    -------
    pandas.DataFrame
        The three original columns plus ``text`` (title and description
        joined by a single space).
    """
    frame = pd.read_csv(path, header=None, names=AG_NEWS_COLUMNS)
    # assign() returns a new frame, so the freshly read data is not mutated
    # in place and the helper can be called repeatedly with the same result.
    return frame.assign(text=frame["title"] + " " + frame["description"])


# Same loading logic for both splits; only the file name differs.
train_df = load_ag_news("train.csv")
test_df = load_ag_news("test.csv")

In [17]:

# Sanity check: row/column counts of both splits, then the first five training rows.
print(f"{train_df.shape} {test_df.shape}")
print(train_df.head(n=5))

(120000, 4) (7600, 4)
   label                                              title  \
0      3  Wall St. Bears Claw Back Into the Black (Reuters)   
1      3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2      3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3      3  Iraq Halts Oil Exports from Main Southern Pipe...   
4      3  Oil prices soar to all-time record, posing new...   

                                         description  \
0  Reuters - Short-sellers, Wall Street's dwindli...   
1  Reuters - Private investment firm Carlyle Grou...   
2  Reuters - Soaring crude prices plus worries\ab...   
3  Reuters - Authorities have halted oil export\f...   
4  AFP - Tearaway world oil prices, toppling reco...   

                                                text  
0  Wall St. Bears Claw Back Into the Black (Reute...  
1  Carlyle Looks Toward Commercial Aerospace (Reu...  
2  Oil and Economy Cloud Stocks' Outlook (Reuters...  
3  Iraq Halts Oil Exports from Main Southe

In [18]:
from transformers import DistilBertTokenizerFast

# Name the checkpoint once so the tokenizer's source is easy to spot.
checkpoint_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)

In [19]:
!pip install --quiet nltk

In [20]:
import nltk
# Fetch the NLTK resources one after another, in the same order as before.
for _nltk_resource in ("punkt", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(_nltk_resource)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [21]:
import re
import string
from nltk.corpus import stopwords

# NLTK's English stop-word list, held as a set for fast membership tests.
english_stopword_list = stopwords.words("english")
stop_words = set(english_stopword_list)


# Built once at definition time instead of on every call.
_DIGIT_RUN = re.compile(r"\d+")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text_simple(text, stopword_set=None):
    """Lowercase ``text`` and drop digits, ASCII punctuation and stop words.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (the missing-value marker pandas
        puts in object columns) are treated as empty text; any other
        non-string value is converted with ``str()`` first.
    stopword_set : collection of str, optional
        Tokens to remove. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if nothing is left).

    Notes
    -----
    Punctuation is deleted rather than replaced by a space, so hyphenated
    words fuse together ("all-time" -> "alltime").
    """
    # Missing values would otherwise fail on .lower() with AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Only touch the global when the caller did not supply its own set.
    active_stopwords = stop_words if stopword_set is None else stopword_set

    # 1. Lowercase
    text = text.lower()
    # 2. Remove digits
    text = _DIGIT_RUN.sub("", text)
    # 3. Strip punctuation
    text = text.translate(_PUNCT_TABLE)
    # 4. Split on whitespace, 5. remove stopwords, 6. re-join
    return " ".join(tok for tok in text.split() if tok not in active_stopwords)

# Testing
import pandas as pd
# Smoke test: clean the first five rows and show raw vs. cleaned text side by side.
# assign() works on its own copy, so train_df itself is left untouched.
sample = train_df.iloc[:5].assign(
    cleaned=lambda frame: frame["text"].apply(preprocess_text_simple)
)
print(sample.loc[:, ["text", "cleaned"]])

                                                text  \
0  Wall St. Bears Claw Back Into the Black (Reute...   
1  Carlyle Looks Toward Commercial Aerospace (Reu...   
2  Oil and Economy Cloud Stocks' Outlook (Reuters...   
3  Iraq Halts Oil Exports from Main Southern Pipe...   
4  Oil prices soar to all-time record, posing new...   

                                             cleaned  
0  wall st bears claw back black reuters reuters ...  
1  carlyle looks toward commercial aerospace reut...  
2  oil economy cloud stocks outlook reuters reute...  
3  iraq halts oil exports main southern pipeline ...  
4  oil prices soar alltime record posing new mena...  


In [22]:
# Five seeded random rows: the fixed random_state keeps the draw repeatable.
sample_rand = train_df.sample(5, random_state=42).copy()
sample_rand

Unnamed: 0,label,title,description,text
71787,3,"BBC set for major shake-up, claims newspaper","London - The British Broadcasting Corporation,...","BBC set for major shake-up, claims newspaper L..."
67218,3,Marsh averts cash crunch,Embattled insurance broker #39;s banks agree t...,Marsh averts cash crunch Embattled insurance b...
54066,2,"Jeter, Yankees Look to Take Control (AP)",AP - Derek Jeter turned a season that started ...,"Jeter, Yankees Look to Take Control (AP) AP - ..."
7168,4,Flying the Sun to Safety,When the Genesis capsule comes back to Earth w...,Flying the Sun to Safety When the Genesis caps...
29618,3,Stocks Seen Flat as Nortel and Oil Weigh,NEW YORK (Reuters) - U.S. stocks were set to ...,Stocks Seen Flat as Nortel and Oil Weigh NEW ...
