<a href="https://colab.research.google.com/github/hydradon/modelling-with-pythhon/blob/master/ULMFiT_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Practice modelling with ULMFiT to classify news report

In [None]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

In [2]:
import fastai
from fastai import *
from fastai.text import *
import pandas as pd
import numpy as np

from functools import partial
import io
import os

## Using the 20 Newsgroup text dataset

In [None]:
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

In [19]:
df = pd.DataFrame({'label' : dataset.target, 'text': dataset.data})
df.shape

(11314, 2)

### Only use group 1 and 10 for classification

In [21]:
df = df[df["label"].isin([1, 10])]
df.reset_index(drop = True, inplace = True)
df

Unnamed: 0,label,text
0,10,"Well, I will have to change the scoring on my ..."
1,1,Archive-name: graphics/resources-list/part1\nL...
2,10,"\nAnd of course, Mike Ramsey was (at one time)..."
3,10,"As I promised, I would give you the name of th..."
4,10,GAME(S) OF 4/15\n---------------\nADIRONDACK 6...
...,...,...
1179,10,The Hawks win!! Jermey Roenick scored his 50 ...
1180,10,I think that NHLPA' 93 is the best video game ...
1181,1,\nI am in the market for a 24-bit graphics car...
1182,1,"Hi there,\n\nis there anybody who know a polyg..."


In [22]:
df["label"].value_counts()

10    600
1     584
Name: label, dtype: int64

### Preprocessing data

In [23]:
# Remove non alphabetical characters
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ")

In [None]:
# Download stopwords
import nltk

nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = stopwords.words("english")

In [34]:
# Tokenization
df["text_preprocessed"] = df["text"].apply(lambda x: x.split())
# Remove stopwords
df["text_preprocessed"] = df["text_preprocessed"].apply(lambda x: [token for token in x if token not in stop_words])
# Merge tokens back
df["text_preprocessed"] = df["text_preprocessed"].apply(lambda x: " ".join(x))


In [35]:
df

Unnamed: 0,label,text,text_preprocessed
0,10,Well I will have to change the scoring on my ...,Well I change scoring playoff pool Unfortunate...
1,1,Archive name graphics resources list part La...,Archive name graphics resources list part Last...
2,10,And of course Mike Ramsey was at one time ...,And course Mike Ramsey one time captain Buffal...
3,10,As I promised I would give you the name of th...,As I promised I would give name Panther presid...
4,10,GAME S OF ADIRONDACK C...,GAME S OF ADIRONDACK CDI Adirondack leads seri...
...,...,...,...
1179,10,The Hawks win Jermey Roenick scored his ...,The Hawks win Jermey Roenick scored th goal Ha...
1180,10,I think that NHLPA is the best video game ...,I think NHLPA best video game available course...
1181,1,I am in the market for a bit graphics card...,I market bit graphics card PC ISA bus wonderin...
1182,1,Hi there is there anybody who know a polygon...,Hi anybody know polygon reduction algorithm ma...


### Prepare for training

In [37]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, random_state = 12,
                                     stratify = df["label"], 
                                     test_size = 0.4)
df_train.shape, df_test.shape

((710, 3), (474, 3))

### Prepare model

In [41]:
# Language model
lang_model = TextLMDataBunch.from_df(train_df = df_train, 
                                     valid_df = df_test, 
                                     path = "")

# Classifier model
data_clas = TextClasDataBunch.from_df(path = "",
                                      train_df = df_train,
                                      valid_df = df_test,
                                      vocab = lang_model.train_ds.vocab,
                                      bs = 32)

### Fine-tuning on pre-trained model

In [45]:
learn = language_model_learner(lang_model, 
                               arch = AWD_LSTM, 
                               drop_mult=0.7)

Downloading https://s3.amazonaws.com/fast-ai-modelzoo/wt103-fwd.tgz


In [46]:
# train the learner object with learning rate = 1e-2
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,5.029365,4.356495,0.25269,00:07


In [47]:
# Saving the encoder
learn.save_encoder('ft_enc')

### Building classifier

In [49]:
learn = text_classifier_learner(data_clas, arch = AWD_LSTM, drop_mult=0.7)
learn.load_encoder("ft_enc")

RNNLearner(data=TextClasDataBunch;

Train: LabelList (710 items)
x: TextList
xxbos xxmaj it looks like the xxmaj edmonton xxmaj oilers just decided to take a xxmaj european xxunk this spring xxmaj ranford xxmaj tugnutt xxmaj benning xxmaj manson xxmaj smith xxmaj buchberger and xxmaj corson are playing for xxmaj canada xxmaj podein and xxmaj weight are playing for the xxup us xxmaj is xxmaj kravchuk playing for the xxmaj xxunk i know he had nagging injuries late in the season xxmaj podein is an interesting case because he was eligible to play in xxmaj cape xxmaj breton in the xxup ahl playoffs like xxmaj kovalev xxmaj zubov and xxmaj andersson obviously xxmaj sather and xxmaj pocklington are not the total xxunk everyone makes them out to be certainly in this case they ve massively xxunk xxmaj paramount and the xxmaj new xxmaj york xxmaj rangers,xxbos xxmaj this is what xxunk me xxmaj speaking of die hard that s what i did when i read this xxunk hard xxunk xxmaj toronto to the xxmaj cup

In [50]:
# Again train the learner object with learning rate = 1e-2
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.360449,0.195526,0.943038,00:15


In [None]:
# Get predictions
preds, targets = learn.get_preds()

predictions = np.argmax(preds, axis=1)

In [55]:
from sklearn.metrics import classification_report
# pd.crosstab(predictions, targets)
print(classification_report(predictions, targets))

              precision    recall  f1-score   support

           0       0.98      0.91      0.94       253
           1       0.90      0.98      0.94       221

    accuracy                           0.94       474
   macro avg       0.94      0.95      0.94       474
weighted avg       0.95      0.94      0.94       474

