# SVM - Text Classification

based on: https://github.com/Gunjitbedi/Text-Classification

data: pos./neg. ratings of soundtracks of games

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/ymlgpnx/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ymlgpnx/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ymlgpnx/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ymlgpnx/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ymlgpnx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
#Set Random seed
np.random.seed(500)

# Add the Data using pandas
Corpus = pd.read_csv("./corpus.csv", encoding='latin-1')

In [13]:
pd.options.display.max_colwidth=999
Corpus=Corpus.head(1000)
Corpus.head(2)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate video game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^,__label__2
1,"The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.",__label__2


In [14]:
# Step - 1: Data Pre-processing - This will help in getting better results through the classification algorithms

# Step - 1a : Remove blank rows if any.
Corpus['text'].dropna(inplace=True)

# Step - 1b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['text'] = [ entry.lower() for entry in Corpus['text']]
#[word.lower() for word in entry]

# Step - 1c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]

# Step - 1d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index,entry in enumerate(Corpus['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

Corpus[['text_final']].head(2)

Unnamed: 0,text_final
0,"['stun', 'even', 'sound', 'track', 'beautiful', 'paint', 'senery', 'mind', 'well', 'would', 'recomend', 'even', 'people', 'hate', 'video', 'game', 'music', 'play', 'game', 'chrono', 'cross', 'game', 'ever', 'play', 'best', 'music', 'back', 'away', 'crude', 'keyboarding', 'take', 'fresh', 'step', 'grate', 'guitar', 'soulful', 'orchestra', 'would', 'impress', 'anyone', 'care', 'listen']"
1,"['best', 'soundtrack', 'ever', 'anything', 'read', 'lot', 'review', 'say', 'best', 'soundtrack', 'figure', 'write', 'review', 'disagree', 'bit', 'opinino', 'yasunori', 'mitsuda', 'ultimate', 'masterpiece', 'music', 'timeless', 'listen', 'year', 'beauty', 'simply', 'refuse', 'price', 'tag', 'pretty', 'stagger', 'must', 'say', 'go', 'buy', 'cd', 'much', 'money', 'one', 'feel', 'would', 'worth', 'every', 'penny']"


In [15]:
# Step - 2: Split the model into Train and Test Data set
X_train, X_test, y_train, y_test = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)

# Step - 3: Label encode the target variable  - This is done to transform Categorical data of string type in the data set into numerical values
# __label__2 -> 1, __label__1 -> 0 
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)  
y_test = Encoder.fit_transform(y_test)

In [16]:
# Step - 4: Vectorize the words by using TF-IDF Vectorizer - This is done to find how important a word in document is in comaprison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

In [17]:
#df_idf = pd.DataFrame(Tfidf_vect.idf_, index=Tfidf_vect.get_feature_names(),columns=["idf_weights"]) 
#df_idf.sort_values(by='idf_weights', ascending=False)

# show Tfidf values for the words in the first row in X_Test
print(X_test.iloc[0])
df_idf = pd.DataFrame(Test_X_Tfidf[0].T.todense(), index=Tfidf_vect.get_feature_names_out(), columns=["idf_weights"]) 
df_idf.sort_values(by='idf_weights', ascending=False).T

['fun', 'listen', 'second', 'best', 'cash', 'money', 'album', 'ever', 'hear', 'next', 'juvenile', 'degreez', 'definetely', 'star', 'cd', 'hot', 'boy', 'put', 'album', 'like', 'really', 'strong', 'song', 'bunch', 'average', 'little', 'good', 'song', 'really', 'horrible', 'song', 'best', 'song', 'main', 'need', 'hot', 'girl', 'fire', 'respect', 'mind', 'tuesday', 'thursday', 'song', 'song', 'listen', 'every', 'morning', 'like', 'month', 'month', 'cd', 'cd', 'must', 'good', 'huh', 'ridin', 'another', 'real', 'tight', 'one', 'song', 'blend', 'rest', 'like', 'clear', 'tha', 'set', 'ya', 'dig', 'juvenile', 'help', 'bout', 'whatever', 'turk', 'think', 'cool', 'get', 'solo', 'song', 'hot', 'nice', 'along', 'rest', 'song', 'juvenile', 'turk', 'put', 'one', 'best', 'cd', 'also', 'best', 'group', 'cd', 'come', 'long', 'real', 'good']


Unnamed: 0,song,cd,juvenile,hot,best,turk,rest,month,album,listen,...,footage,foot,fooled,fool,food,follower,follow,folktale,folk,zune
idf_weights,0.536462,0.307567,0.260814,0.217634,0.202052,0.194551,0.14509,0.136509,0.135462,0.128195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Step - 5: Now we can run different algorithms to classify out data check for accuracy

# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,y_train)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

SVM Accuracy Score ->  82.0


In [19]:
# 0=neg. rating, 1=pos. rating
new_entry = "The soundtrack is awesome" # try awesome, bad, excellent, etc
SVM.predict(Tfidf_vect.transform(pd.Series([str(word_tokenize(new_entry.lower()))])))[0]

1