In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib
import string as st
import re
from nltk import PorterStemmer, WordNetLemmatizer
import os

In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

def twenty_newsgroup_to_csv():
    newsgroups_train = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

    df = pd.DataFrame([newsgroups_train.data, newsgroups_train.target.tolist()]).T
    df.columns = ['text', 'target']

    targets = pd.DataFrame( newsgroups_train.target_names)
    targets.columns=['title']

    out = pd.merge(df, targets, left_on='target', right_index=True)
    out['date'] = pd.to_datetime('now')
    out.to_csv('20_newsgroup.csv')
    
twenty_newsgroup_to_csv()

In [3]:
data = pd.read_csv('20_newsgroup.csv', index_col = 0)

In [4]:
data.shape

(18846, 4)

In [5]:
data.head()

Unnamed: 0,text,target,title,date
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,2022-04-02 15:59:14.676837
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey,2022-04-02 15:59:14.676837
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey,2022-04-02 15:59:14.676837
24,I don't know the exact coverage in the states....,10,rec.sport.hockey,2022-04-02 15:59:14.676837
44,Here are the NHL's alltime leaders in goals an...,10,rec.sport.hockey,2022-04-02 15:59:14.676837


In [6]:
data.columns

Index(['text', 'target', 'title', 'date'], dtype='object')

In [7]:
data = data.drop(columns= ['date'])
data.head()

Unnamed: 0,text,target,title
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey
24,I don't know the exact coverage in the states....,10,rec.sport.hockey
44,Here are the NHL's alltime leaders in goals an...,10,rec.sport.hockey


In [8]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/jpozoc/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/jpozoc/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/jpozoc/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/jpozoc/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /home/jpozoc/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_

[nltk_data]    |   Package twitter_samples is already up-to-date!
[nltk_data]    | Downloading package udhr to /home/jpozoc/nltk_data...
[nltk_data]    |   Package udhr is already up-to-date!
[nltk_data]    | Downloading package udhr2 to
[nltk_data]    |     /home/jpozoc/nltk_data...
[nltk_data]    |   Package udhr2 is already up-to-date!
[nltk_data]    | Downloading package unicode_samples to
[nltk_data]    |     /home/jpozoc/nltk_data...
[nltk_data]    |   Package unicode_samples is already up-to-date!
[nltk_data]    | Downloading package universal_tagset to
[nltk_data]    |     /home/jpozoc/nltk_data...
[nltk_data]    |   Package universal_tagset is already up-to-date!
[nltk_data]    | Downloading package universal_treebanks_v20 to
[nltk_data]    |     /home/jpozoc/nltk_data...
[nltk_data]    |   Package universal_treebanks_v20 is already up-to-
[nltk_data]    |       date!
[nltk_data]    | Downloading package vader_lexicon to
[nltk_data]    |     /home/jpozoc/nltk_data...
[nltk_dat

True

In [9]:
# Remove all punctuations from the text
text = data["text"].tolist()
remove_punc = [''.join(c for c in str(s) if c not in st.punctuation) for s in text]

In [10]:
data['removed_punc'] = remove_punc
data.head()

Unnamed: 0,text,target,title,removed_punc
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey,\nstuff deleted\n\nOk heres the solution to yo...
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey,\n\n\nYeah its the second one And I believe t...
24,I don't know the exact coverage in the states....,10,rec.sport.hockey,I dont know the exact coverage in the states ...
44,Here are the NHL's alltime leaders in goals an...,10,rec.sport.hockey,Here are the NHLs alltime leaders in goals and...


In [11]:
''' Convert text to lower case tokens. Here, split() is applied on white-spaces. But, it could be applied
    on special characters, tabs or any other string based on which text is to be seperated into tokens.
'''
def tokenize(text):
    text = re.split('\s+' ,text)
    return [x.lower() for x in text]

In [12]:
data['tokens'] = data['removed_punc'].apply(lambda msg : tokenize(msg))
data.head()

Unnamed: 0,text,target,title,removed_punc,tokens
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...,"[, i, am, sure, some, bashers, of, pens, fans,..."
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey,\nstuff deleted\n\nOk heres the solution to yo...,"[, stuff, deleted, ok, heres, the, solution, t..."
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey,\n\n\nYeah its the second one And I believe t...,"[, yeah, its, the, second, one, and, i, believ..."
24,I don't know the exact coverage in the states....,10,rec.sport.hockey,I dont know the exact coverage in the states ...,"[i, dont, know, the, exact, coverage, in, the,..."
44,Here are the NHL's alltime leaders in goals an...,10,rec.sport.hockey,Here are the NHLs alltime leaders in goals and...,"[here, are, the, nhls, alltime, leaders, in, g..."


In [13]:
# Remove tokens of length less than 3

def remove_small_words(text):
    return [x for x in text if len(x) > 3 ]

In [14]:
data['larger_tokens'] = data['tokens'].apply(lambda x : remove_small_words(x))
data.head()

Unnamed: 0,text,target,title,removed_punc,tokens,larger_tokens
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...,"[, i, am, sure, some, bashers, of, pens, fans,...","[sure, some, bashers, pens, fans, pretty, conf..."
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey,\nstuff deleted\n\nOk heres the solution to yo...,"[, stuff, deleted, ok, heres, the, solution, t...","[stuff, deleted, heres, solution, your, proble..."
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey,\n\n\nYeah its the second one And I believe t...,"[, yeah, its, the, second, one, and, i, believ...","[yeah, second, believe, that, price, been, try..."
24,I don't know the exact coverage in the states....,10,rec.sport.hockey,I dont know the exact coverage in the states ...,"[i, dont, know, the, exact, coverage, in, the,...","[dont, know, exact, coverage, states, canada, ..."
44,Here are the NHL's alltime leaders in goals an...,10,rec.sport.hockey,Here are the NHLs alltime leaders in goals and...,"[here, are, the, nhls, alltime, leaders, in, g...","[here, nhls, alltime, leaders, goals, points, ..."


In [15]:
''' Remove stopwords. Here, NLTK corpus list is used for a match. However, a customized user-defined 
    list could be created and used to limit the matches in input text. 
'''
def remove_stopwords(text):
    return [word for word in text if word not in nltk.corpus.stopwords.words('english')]

In [16]:
data['clean_tokens'] = data['larger_tokens'].apply(lambda x : remove_stopwords(x))
data.head()

Unnamed: 0,text,target,title,removed_punc,tokens,larger_tokens,clean_tokens
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...,"[, i, am, sure, some, bashers, of, pens, fans,...","[sure, some, bashers, pens, fans, pretty, conf...","[sure, bashers, pens, fans, pretty, confused, ..."
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey,\nstuff deleted\n\nOk heres the solution to yo...,"[, stuff, deleted, ok, heres, the, solution, t...","[stuff, deleted, heres, solution, your, proble...","[stuff, deleted, heres, solution, problem, mov..."
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey,\n\n\nYeah its the second one And I believe t...,"[, yeah, its, the, second, one, and, i, believ...","[yeah, second, believe, that, price, been, try...","[yeah, second, believe, price, trying, good, l..."
24,I don't know the exact coverage in the states....,10,rec.sport.hockey,I dont know the exact coverage in the states ...,"[i, dont, know, the, exact, coverage, in, the,...","[dont, know, exact, coverage, states, canada, ...","[dont, know, exact, coverage, states, canada, ..."
44,Here are the NHL's alltime leaders in goals an...,10,rec.sport.hockey,Here are the NHLs alltime leaders in goals and...,"[here, are, the, nhls, alltime, leaders, in, g...","[here, nhls, alltime, leaders, goals, points, ...","[nhls, alltime, leaders, goals, points, 19923,..."


### Lemmatization converts word to it's dictionary base form. This process takes language grammar and vocabulary into consideration while conversion. Hence, it is different from Stemming in that it does not merely truncate the suffixes to get the root word.


In [17]:
# Apply lemmatization on tokens
def lemmatize(text):
    word_net = WordNetLemmatizer()
    return [word_net.lemmatize(word) for word in text]

In [18]:
data['lemma_words'] = data['clean_tokens'].apply(lambda x : lemmatize(x))
data.head()

Unnamed: 0,text,target,title,removed_punc,tokens,larger_tokens,clean_tokens,lemma_words
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...,"[, i, am, sure, some, bashers, of, pens, fans,...","[sure, some, bashers, pens, fans, pretty, conf...","[sure, bashers, pens, fans, pretty, confused, ...","[sure, bashers, pen, fan, pretty, confused, la..."
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey,\nstuff deleted\n\nOk heres the solution to yo...,"[, stuff, deleted, ok, heres, the, solution, t...","[stuff, deleted, heres, solution, your, proble...","[stuff, deleted, heres, solution, problem, mov...","[stuff, deleted, here, solution, problem, move..."
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey,\n\n\nYeah its the second one And I believe t...,"[, yeah, its, the, second, one, and, i, believ...","[yeah, second, believe, that, price, been, try...","[yeah, second, believe, price, trying, good, l...","[yeah, second, believe, price, trying, good, l..."
24,I don't know the exact coverage in the states....,10,rec.sport.hockey,I dont know the exact coverage in the states ...,"[i, dont, know, the, exact, coverage, in, the,...","[dont, know, exact, coverage, states, canada, ...","[dont, know, exact, coverage, states, canada, ...","[dont, know, exact, coverage, state, canada, c..."
44,Here are the NHL's alltime leaders in goals an...,10,rec.sport.hockey,Here are the NHLs alltime leaders in goals and...,"[here, are, the, nhls, alltime, leaders, in, g...","[here, nhls, alltime, leaders, goals, points, ...","[nhls, alltime, leaders, goals, points, 19923,...","[nhls, alltime, leader, goal, point, 19923, se..."


In [19]:
# Create sentences to get clean text as input for vectors

def return_sentences(tokens):
    return " ".join([word for word in tokens])

In [20]:
data['clean_text'] = data['lemma_words'].apply(lambda x : return_sentences(x))
data.head()

Unnamed: 0,text,target,title,removed_punc,tokens,larger_tokens,clean_tokens,lemma_words,clean_text
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...,"[, i, am, sure, some, bashers, of, pens, fans,...","[sure, some, bashers, pens, fans, pretty, conf...","[sure, bashers, pens, fans, pretty, confused, ...","[sure, bashers, pen, fan, pretty, confused, la...",sure bashers pen fan pretty confused lack kind...
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey,\nstuff deleted\n\nOk heres the solution to yo...,"[, stuff, deleted, ok, heres, the, solution, t...","[stuff, deleted, heres, solution, your, proble...","[stuff, deleted, heres, solution, problem, mov...","[stuff, deleted, here, solution, problem, move...",stuff deleted here solution problem move canad...
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey,\n\n\nYeah its the second one And I believe t...,"[, yeah, its, the, second, one, and, i, believ...","[yeah, second, believe, that, price, been, try...","[yeah, second, believe, price, trying, good, l...","[yeah, second, believe, price, trying, good, l...",yeah second believe price trying good look bru...
24,I don't know the exact coverage in the states....,10,rec.sport.hockey,I dont know the exact coverage in the states ...,"[i, dont, know, the, exact, coverage, in, the,...","[dont, know, exact, coverage, states, canada, ...","[dont, know, exact, coverage, states, canada, ...","[dont, know, exact, coverage, state, canada, c...",dont know exact coverage state canada covered ...
44,Here are the NHL's alltime leaders in goals an...,10,rec.sport.hockey,Here are the NHLs alltime leaders in goals and...,"[here, are, the, nhls, alltime, leaders, in, g...","[here, nhls, alltime, leaders, goals, points, ...","[nhls, alltime, leaders, goals, points, 19923,...","[nhls, alltime, leader, goal, point, 19923, se...",nhls alltime leader goal point 19923 season mu...


## Prepare data for modeling

In [21]:
X = data.clean_text
Y = data.target

print(X.shape)
print(Y.shape)

(18846,)
(18846,)


In [22]:
# split X and y into training and testing sets 
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1, stratify=Y)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(15076,)
(3770,)
(15076,)
(3770,)


## TF-IDF Vectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

#encode the text

vect = TfidfVectorizer(stop_words='english')

X_train_dtm = vect.fit_transform(X_train)

In [24]:
X_train_dtm

<15076x110725 sparse matrix of type '<class 'numpy.float64'>'
	with 813953 stored elements in Compressed Sparse Row format>

In [25]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<3770x110725 sparse matrix of type '<class 'numpy.float64'>'
	with 177872 stored elements in Compressed Sparse Row format>

## Modeling

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV

#Create a svm Classifier
model = SVC()

param_grid = {'C':[0.1], 'kernel':['poly']}

# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='balanced_accuracy')
# execute the grid search
model = grid_search.fit(X_train_dtm, Y_train)
# summarize the best score and configuration
print("Best: %f using %s" % (model.best_score_, model.best_params_))
# summarize all scores that were evaluated
means = model.cv_results_['mean_test_score']
stds = model.cv_results_['std_test_score']
params = model.cv_results_['params']


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, balanced_accuracy_score

svm_acc = accuracy_score(Y_test, model.predict(X_test_dtm))
svm_f1 = f1_score(Y_test, model.predict(X_test_dtm), average ='weighted')
svm_bacc = balanced_accuracy_score(Y_test, model.predict(X_test_dtm))

print(f"Training Accuracy of Support Vector Machine is {accuracy_score(Y_train, model.predict(X_train_dtm))}")
print(f"Test Accuracy of Support Vector Machine is {svm_acc} \n")
print(f"Training Balanced Accuracy of Support Vector Machine is {balanced_accuracy_score(Y_train, model.predict(X_train_dtm))}")
print(f"Test Balanced Accuracy of Support Vector Machine is {svm_bacc} \n")
print(f"Training F1-score weighted of Support Vector Machine is {f1_score(Y_train, model.predict(X_train_dtm),average ='weighted')}")
print(f"Test F1-score weighted of Support Vector Machine is {svm_f1} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(Y_test, model.predict(X_test_dtm))}\n")
print(f"Classification Report :- \n {classification_report(Y_test, model.predict(X_test_dtm))}")