# Text Classification with TF-IDF

In [1]:
# Data processing
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer

# Visualization
from matplotlib import pyplot as plt

# Text processing
import re
import nltk

# Various
from datetime import datetime



# Exploratory Data Analysis (EDA) of the Dataset

In [2]:
# Load the dataset from a CSV file (the path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="all-data-v1.csv")

# Preview the first rows
df.head(n=5)

Unnamed: 0,sentiment,text
0,1,"According to Gran , the company has no plans t..."
1,1,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...


In [3]:
# Number of distinct labels in the "sentiment" column (kept as a variable for reuse)
nr_categories = len(df["sentiment"].unique())

# Report the corpus size, the per-label counts and the number of labels
print(f'Total number of news: {len(df)}')
print('-' * 40)
print('Split by category:')
print(df["sentiment"].value_counts())
print('-' * 40)
print(f"Number of categories: {nr_categories}")

Total number of news: 4846
----------------------------------------
Split by category:
1    2879
2    1363
0     604
Name: sentiment, dtype: int64
----------------------------------------
Number of categories: 3


# Preprocessing

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download every NLTK resource used by the preprocessing below, so the notebook
# also runs on a machine where these corpora have not been installed yet:
#   - 'stopwords': the English stop-word list loaded right below
#   - 'wordnet' / 'omw-1.4': data the WordNet lemmatizer needs when
#     utils_preprocess_text is called with flg_lemm=True
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stopWords = stopwords.words('english') # from NLTK



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ivanpua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Defining a Function to clean up the reviews 
def utils_preprocess_text(text, 
                          flg_stemm=False, 
                          flg_lemm=False, 
                          lst_stopwords=None):
    """Clean a raw text and return it as a single space-separated string.

    The function is not optimized for speed but split into various steps
    for pedagogical purpose.

    Steps, in order:
      1. cast to ``str``, lowercase, remove the punctuation characters
         ``!?.,@#$%^*()';:/~<>``, replace ``-`` with a space and ``&`` with
         the literal ``and`` (no spaces are inserted around it), then strip;
      2. tokenize on whitespace;
      3. optionally drop stop words;
      4. optionally stem each token (Porter stemmer);
      5. optionally lemmatize each token (WordNet lemmatizer), applied to the
         output of the stemming step when both flags are set;
      6. join the tokens back with single spaces.

    Parameters
    ----------
    text : object
        Input text; converted with ``str(text)``, so non-string values
        (e.g. ``None``) are processed as their string representation.
    flg_stemm : bool, default False
        Apply Porter stemming when truthy.
    flg_lemm : bool, default False
        Apply WordNet lemmatisation when truthy.
    lst_stopwords : iterable of str or None, default None
        Tokens to remove (compared against the lowercased tokens).
        ``None`` disables stop-word removal.

    Returns
    -------
    str
        The cleaned text; an empty string if no token is left.
    """
    # Clean (convert to lowercase and remove punctuations and characters and then strip)
    text = str(text).lower()
    # Raw string: "\^" inside a normal string literal is an invalid escape sequence
    text = re.sub(r"[!?.,@#$%\^*()';:/~<>]", '', text) # remove punctuations
    text = text.replace('-', ' ') # hyphen -> space, so "co-op" becomes two tokens
    text = text.replace("&", 'and') # ampersand -> "and" (e.g. "r&d" becomes "randd")
    text = text.strip()

    # Tokenize (convert from string to list)
    lst_text = text.split()

    # Remove stop words; a set gives constant-time membership tests
    if lst_stopwords is not None:
        stopword_set = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopword_set]

    # Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    # Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    # back to string from list
    return " ".join(lst_text)


In [6]:
# Clean every document of the corpus: stop words removed, then stemming and
# lemmatisation. The keyword arguments are forwarded by Series.apply to the function.
df["text_clean"] = df["text"].apply(
    utils_preprocess_text,
    flg_stemm=True,
    flg_lemm=True,
    lst_stopwords=stopWords,
)

# Compare the raw and the cleaned text on the first rows
df.head()

Unnamed: 0,sentiment,text,text_clean
0,1,"According to Gran , the company has no plans t...",accord gran compani plan move product russia a...
1,1,Technopolis plans to develop in stages an area...,technopoli plan develop stage area le 100000 s...
2,0,The international electronic industry company ...,intern electron industri compani elcoteq laid ...
3,2,With the new production plant the company woul...,new product plant compani would increas capac ...
4,2,According to the company 's updated strategy f...,accord compani updat strategi year 2009 2012 b...


In [7]:
from sklearn.model_selection import train_test_split


# Model input (cleaned text) -> X, target (sentiment label) -> y
X, y = df['text_clean'], df['sentiment']

# Hold out 30% of the rows as test set; the split is stratified on the
# sentiment label and seeded so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Model Training

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# The vocabulary is learned from the training split only
corpus = X_train

# Initialize the vectorizer: unigrams and bigrams (ngram_range=(1,2)),
# with the number of features capped at 15000
vectorizer_tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=15000,
)

# Fit the vectorizer to the training data
vectorizer_tfidf.fit(corpus)
TfidfVectorizer(max_features=15000, ngram_range=(1, 2))

In [9]:
# Chain the TF-IDF vectorizer and a logistic regression (default settings)
# into a single model that takes text as input
classifier_tfidf = LogisticRegression()
model_tfidf = Pipeline(steps=[
    ("vectorizer", vectorizer_tfidf),
    ("classifier", classifier_tfidf),
])

# Fit on the training split and record the timestamps before and after the fit
start_time = datetime.now()
model_tfidf.fit(X_train, y_train)
end_time = datetime.now()

# Elapsed fitting time in seconds
training_time_tfidf = (end_time - start_time).total_seconds()

In [10]:
# Accuracy on the data the model was fitted on
predicted_train_tfidf = model_tfidf.predict(X_train)
accuracy_train_tfidf = accuracy_score(y_train, predicted_train_tfidf)
print(f'Accuracy Training data: {accuracy_train_tfidf:.1%}')

# Accuracy on the held-out test data; also kept under the name accuracy_tfidf
predicted_test_tfidf = model_tfidf.predict(X_test)
accuracy_tfidf = accuracy_test_tfidf = accuracy_score(y_test, predicted_test_tfidf)
print(f'Accuracy Test data: {accuracy_test_tfidf:.1%}')

# Fitting time measured in the training cell
print(f'Training time: {training_time_tfidf:.1f}s')

Accuracy Training data: 87.1%
Accuracy Test data: 72.4%
Training time: 1.0s


In [11]:
# Display names for the rows of the coefficient matrix.
# NOTE(review): assumes the sentiment labels 0/1/2 stand for Negative/Neutral/Positive
# and that the coefficient rows follow that order -- confirm against classifier_tfidf.classes_
classes = ['Negative', 'Neutral', 'Positive']

print('Classes of the model: ', classes)
print(80*'-')
print('Shape of the coefficients of the model (categories x vocabulary size): ',classifier_tfidf.coef_.shape)
print(80*'-')

# Fail early with a clear message if the names do not line up with the coefficient rows
assert len(classes) == classifier_tfidf.coef_.shape[0], \
    "number of class names does not match the number of coefficient rows"

NN = 10
# Get the 10 (here: NN, which you can adjust yourself) ids of the words with highest weights per category.
# argsort is ascending, so each row lists the ids from lower to higher weight.
top_words = np.argsort(classifier_tfidf.coef_,axis=1)[:,-NN:]

# Get the vocabulary of the model (mapping of words to ids):
voc = vectorizer_tfidf.vocabulary_
# Get the inverse vocabulary to map the ids of the words to the words:
inv_voc = {v: k for k, v in voc.items()}

# Get for each category (=class) the top NN words.
# str.join puts the separator only between words (no trailing ", ") and
# iterating over the row also works when the vocabulary has fewer than NN terms.
for n, w in enumerate(classes):
    words = ', '.join(inv_voc[word_id] for word_id in top_words[n])
    print(f"{w}: {words}")
    print(80*'-')

Classes of the model:  ['Negative', 'Neutral', 'Positive']
--------------------------------------------------------------------------------
Shape of the coefficients of the model (categories x vocabulary size):  (3, 15000)
--------------------------------------------------------------------------------
Negative: cut, lay, result, compar, staff, half, drop, lower, fell, decreas, 
--------------------------------------------------------------------------------
Neutral: rang, sell, stake, design, busi, avail, disclos, valu, approxim, includ, 
--------------------------------------------------------------------------------
Positive: lead, said, posit, cooper, expand, sign, grew, improv, rose, increas, 
--------------------------------------------------------------------------------
