# Tutorial - Text Mining - Classification 

We will predict the category of discussion posts in a newsgroup.

**The unit of analysis is a discussion post**

### Import common packages

In [241]:
import pandas as pd
import numpy as np

np.random_seed = 113

### Load data

In [242]:
news = pd.read_csv('news.csv')

news.shape


(597, 5)

In [243]:
news.head(5)

Unnamed: 0,TEXT,graphics,hockey,medical,newsgroup
0,I have a few reprints left of chapters from my...,1,0,0,graphics
1,"gnuplot, etc. make it easy to plot real valued...",1,0,0,graphics
2,Article-I.D.: snoopy.1pqlhnINN8k1 References: ...,1,0,0,graphics
3,"Hello, I am looking to add voice input capabil...",1,0,0,graphics
4,I recently got a file describing a library of ...,1,0,0,graphics


### Check for missing values

In [244]:
news[['TEXT']].isna().sum()

TEXT    0
dtype: int64

## Assign the input variable to X and the target variable to y

In [245]:
X = news['TEXT']

### Lemmatizing the Data 
We can use the Lemmatizer technique to reduce words to their stems.

In [246]:
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag, word_tokenize

transformed_corpus = []
wnl = WordNetLemmatizer()
for document in X:
    transformed_document = ""
    for word, tag in pos_tag(word_tokenize(document)):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            lemma = word
        else:
            lemma = wnl.lemmatize(word, wntag)
        transformed_document+= lemma + " "
    transformed_corpus += [transformed_document]

In [247]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, token_pattern="[^\W\d_]+")

X = vectorizer.fit_transform(transformed_corpus)

df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df

Unnamed: 0,aa,aalborg,aamrl,aangeboden,aantal,aaplay,aarnet,ab,abad,abandon,...,zoo,zool,zorn,zt,zu,zubov,zupancic,zurich,zyeh,zzz
0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.09234,0.0,0.0000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.1337,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
593,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
594,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
595,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [248]:
from sklearn.decomposition import TruncatedSVD

In [249]:
svd = TruncatedSVD(n_components=5, n_iter=10)

In [250]:
X_svd = svd.fit_transform(X)
X_svd

array([[ 0.06969793, -0.01013072, -0.05856669, -0.00553745, -0.02275749],
       [ 0.08003811, -0.01699631, -0.03159225,  0.01830167, -0.03414697],
       [ 0.17629362, -0.0052785 , -0.07390732,  0.04346976, -0.03203429],
       ...,
       [ 0.17703261,  0.0393084 , -0.01412099,  0.15279041, -0.11143232],
       [ 0.20013663, -0.04008023, -0.12591094, -0.01888981,  0.37090449],
       [ 0.13386535, -0.0071889 , -0.07979918, -0.03873065,  0.03543206]])

In [251]:
X_svd.shape[1]

5

In [252]:
df = pd.DataFrame(X_svd, columns=[f"svd{num:04}" for num in range(0,X_svd.shape[1])])
df

Unnamed: 0,svd0000,svd0001,svd0002,svd0003,svd0004
0,0.069698,-0.010131,-0.058567,-0.005537,-0.022757
1,0.080038,-0.016996,-0.031592,0.018302,-0.034147
2,0.176294,-0.005279,-0.073907,0.043470,-0.032034
3,0.108293,-0.030671,-0.126805,-0.013344,-0.057505
4,0.091739,-0.060481,-0.143298,0.010667,-0.051451
...,...,...,...,...,...
592,0.137000,-0.035743,-0.152590,-0.021686,0.439880
593,0.266933,-0.067205,-0.167790,-0.045026,0.501719
594,0.177033,0.039308,-0.014121,0.152790,-0.111432
595,0.200137,-0.040080,-0.125911,-0.018890,0.370904


This is a multi-class classification problem. There are three categories we will predict:<br>
Whether a post is "graphics," "hockey," or "medical" related

In [253]:
y = news['newsgroup']
y.unique()

array(['graphics', 'hockey', 'medical'], dtype=object)

In [254]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(y)
print(le.classes_)
y = le.transform(y)

y


['graphics' 'hockey' 'medical']


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

## Split the data

In [255]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3)

In [256]:
X_train.shape, y_train.shape

((417, 5), (417,))

In [257]:
X_test.shape, y_test.shape

((180, 5), (180,))

In [258]:
X_train.head(5)

Unnamed: 0,svd0000,svd0001,svd0002,svd0003,svd0004
595,0.200137,-0.04008,-0.125911,-0.01889,0.370904
354,0.113997,-0.123775,0.125871,-0.033008,-0.012466
306,0.130113,-0.122803,0.122035,-0.067902,-0.018523
143,0.11294,-0.051322,-0.117085,0.021786,-0.010703
517,0.202396,0.142427,0.002273,0.008683,-0.024995


In [259]:
y_train[:5]

array([2, 1, 1, 0, 2])

## Random Forest

In [260]:
from sklearn.ensemble import RandomForestClassifier 

rnd_clf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=16, n_jobs=-1) 
_ = rnd_clf.fit(X_train, y_train)

### Evaluating Model Performance

In [261]:
from sklearn.metrics import accuracy_score

In [262]:
#Train accuracy - Not a good measure of model performance as we are using the same data set to train and test
y_pred_train = rnd_clf.predict(X_train)
acc = accuracy_score(y_train, y_pred_train)
print(f"Train acc: {accuracy_score(y_train, y_pred_train):.4f}")

Train acc: 0.9305


In [263]:
#Test accuracy
y_pred_test = rnd_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
print(f"Train acc: {accuracy_score(y_test, y_pred_test):.4f}")

Train acc: 0.8778


In [264]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred_test)

array([[55,  1, 11],
       [ 2, 56,  2],
       [ 5,  1, 47]], dtype=int64)

## Stochastic Gradient Descent Classifier

In [265]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=100)
_ = sgd_clf.fit(X_train, y_train)

### Evaluating Model Performance

In [266]:
#Train accuracy
y_pred_train = sgd_clf.predict(X_train)
print(f"Train acc: {accuracy_score(y_train, y_pred_train):.4f}")

Train acc: 0.8777


In [267]:
#Test accuracy
y_pred_test = sgd_clf.predict(X_test)
print(f"Train acc: {accuracy_score(y_train, y_pred_train):.4f}")

Train acc: 0.8777


In [268]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred_test)

array([[62,  0,  5],
       [ 3, 57,  0],
       [12,  0, 41]], dtype=int64)