In [0]:
# https://medium.com/analytics-vidhya/tutorial-on-text-classification-nlp-using-ulmfit-and-fastai-library-in-python-2f15a2aac065

In [28]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html 

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html


In [29]:
!pip install fastai



In [30]:
!pip install dataclasses



In [0]:
import fastai 
from fastai import * 
from fastai.text import * 
import pandas as pd 
import numpy as np 
from functools import partial 
import io 
import os

In [0]:
from sklearn.datasets import fetch_20newsgroups 
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove= 
                             ('headers', 'footers', 'quotes'))
documents = dataset.data

In [33]:
documents[:10]

["Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n",
 "\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap

In [34]:
df = pd.DataFrame({'label':dataset.target, 'text':dataset.data})
df.shape

(11314, 2)

In [35]:
df.head()

Unnamed: 0,label,text
0,17,Well i'm not sure about the story nad it did s...
1,0,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,17,Although I realize that principle is not one o...
3,11,Notwithstanding all the legitimate fuss about ...
4,10,"Well, I will have to change the scoring on my ..."


In [0]:
#df = df[df['label'].isin([1,10])] 
df = df[df['label']<10]
df = df.reset_index(drop = True)

In [52]:
df['label'].value_counts()

8    598
9    597
7    594
5    593
2    591
3    590
6    585
1    584
4    578
0    480
Name: label, dtype: int64

In [0]:
# data preprocessing

In [0]:
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ")

In [55]:
df.head()

Unnamed: 0,label,text
0,0,Yeah expect people read FAQ etc actually accep...
1,4,Ok I record shows IIsi without KB cache It sma...
2,1,Archive name graphics resources list part Last...
3,6,I Roberto Clemente Topps baseball card sale ne...
4,4,The title says I need know c rom versions


In [56]:
!pip install nltk



In [57]:
import nltk 
nltk.download('stopwords') 
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')
# tokenization 
tokenized_doc = df['text'].apply(lambda x: x.split()) 
# remove stop-words 
tokenized_doc = tokenized_doc.apply(lambda x:[item for item in x if 
                                    item not in stop_words]) 

# de-tokenization 
detokenized_doc = [] 
for i in range(len(df)):
    t =' '.join(tokenized_doc[i]) 
    detokenized_doc.append(t) 
df['text'] = detokenized_doc

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
from sklearn.model_selection import train_test_split 
# split data into training and validation set 
df_trn, df_val = train_test_split(df, stratify = df['label'], 
                                  test_size = 0.4, 
                                  random_state = 12)
df_trn.shape, df_val.shape

((3474, 2), (2316, 2))

In [0]:
# Language model data 
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = 
                                  df_val, path = "") 
# Classifier model data 
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, 
                                      valid_df = df_val,  
                                      vocab=data_lm.train_ds.vocab, 
                                      bs=32)

In [0]:
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103,  
                               drop_mult=0.7)

In [61]:
# train the learner object with learning rate = 1e-2 
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,6.060234,5.256199,0.246890


In [0]:
learn.save_encoder('ft_enc')

In [0]:
learn = text_classifier_learner(data_clas, drop_mult=0.7) 
learn.load_encoder('ft_enc')

In [64]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,2.113365,1.745411,0.358808


In [65]:
# get predictions 
preds, targets = learn.get_preds() 
predictions = np.argmax(preds, axis = 1) 
pd.crosstab(predictions, targets)

col_0,0,1,2,3,4,5,6,7,8,9
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,123,7,4,4,4,4,1,11,6,1
1,0,5,1,0,0,5,0,0,0,1
2,11,76,65,32,25,72,17,5,8,5
3,0,51,84,142,119,45,88,21,6,0
4,1,9,8,15,10,5,2,2,2,1
5,1,35,18,5,8,66,3,3,0,2
6,3,22,18,15,19,10,65,13,16,11
7,23,19,22,12,26,17,18,87,90,38
8,4,6,8,7,18,5,32,87,94,6
9,26,4,8,4,2,8,8,9,17,174
