In [66]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [131]:
corpus = pd.read_csv("corpus.csv", encoding="latin-1")
corpus['text']

0        Stuning even for the non-gamer: This sound tr...
1        The best soundtrack ever to anything.: I'm re...
2        Amazing!: This soundtrack is my favorite musi...
3        Excellent Soundtrack: I truly like this sound...
4        Remember, Pull Your Jaw Off The Floor After H...
                              ...                        
9995     A revelation of life in small town America in...
9996     Great biography of a very interesting journal...
9997     Interesting Subject; Poor Presentation: You'd...
9998     Don't buy: The box looked used and it is obvi...
9999     Beautiful Pen and Fast Delivery.: The pen was...
Name: text, Length: 10000, dtype: object

In [132]:
corpus

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,__label__2
1,The best soundtrack ever to anything.: I'm re...,__label__2
2,Amazing!: This soundtrack is my favorite musi...,__label__2
3,Excellent Soundtrack: I truly like this sound...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After H...",__label__2
...,...,...
9995,A revelation of life in small town America in...,__label__2
9996,Great biography of a very interesting journal...,__label__2
9997,Interesting Subject; Poor Presentation: You'd...,__label__1
9998,Don't buy: The box looked used and it is obvi...,__label__1


In [70]:
corpus = pd.read_csv("corpus.csv", encoding="latin-1")
corpus['text'].dropna(inplace=True)
corpus['text'] = [entry.lower() for entry in corpus['text']]
corpus['text'] = [word_tokenize(entry) for entry in corpus['text']]

In [72]:
np.random.seed(500)
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index, entry in enumerate(corpus['text']):
    final_words = []
    word_lemmatized = WordNetLemmatizer()

    for word, tag in pos_tag(entry):
        if word not in stopwords.words("english") and word.isalpha():
            word_final = word_lemmatized.lemmatize(word, tag_map[tag[0]])
            final_words.append(word_final)
    corpus.loc[index, "text_final"] = str(final_words)


In [88]:
corpus['text_final']

0       ['stun', 'even', 'sound', 'track', 'beautiful'...
1       ['best', 'soundtrack', 'ever', 'anything', 're...
2       ['amaze', 'soundtrack', 'favorite', 'music', '...
3       ['excellent', 'soundtrack', 'truly', 'like', '...
4       ['remember', 'pull', 'jaw', 'floor', 'hear', '...
                              ...                        
9995    ['revelation', 'life', 'small', 'town', 'ameri...
9996    ['great', 'biography', 'interesting', 'journal...
9997    ['interest', 'subject', 'poor', 'presentation'...
9998    ['buy', 'box', 'look', 'use', 'obviously', 'ne...
9999    ['beautiful', 'pen', 'fast', 'delivery', 'pen'...
Name: text_final, Length: 10000, dtype: object

In [105]:
corpus['label']

0       __label__2 
1       __label__2 
2       __label__2 
3       __label__2 
4       __label__2 
           ...     
9995    __label__2 
9996    __label__2 
9997    __label__1 
9998    __label__1 
9999    __label__2 
Name: label, Length: 10000, dtype: object

In [121]:
# training the data set into two sets: training and testing set
# out of the entire data set, 70% is training set and 30% is testing set
# the training set X (70%) is associated with the training test Y for identification / label purpose
# the testing set X (30%) is associated with the testing set Y for identification / label purpose
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(corpus['text_final'],corpus['label'],test_size=0.3)


In [128]:
Train_X

3690    ['best', 'show', 'ever', 'love', 'three', 'com...
2577    ['good', 'track', 'great', 'trackes', 'shame',...
1431    ['incredible', 'young', 'female', 'singer', 'q...
7482    ['horrible', 'man', 'game', 'horrible', 'mean'...
5456    ['great', 'movie', 'mind', 'setting', 'movie',...
                              ...                        
5874    ['vein', 'chocolat', 'delightful', 'work', 'ar...
2314    ['interesting', 'interesting', 'idea', 'operat...
3806    ['verrrrryyyyyyy', 'good', 'movie', 'freaking'...
6471    ['overall', 'purchase', 'sure', 'think', 'movi...
4600    ['catastrophe', 'cave', 'advetising', 'box', '...
Name: text_final, Length: 7000, dtype: object

In [108]:
len(Train_Y)

7000

In [110]:
len(Test_X)

3000

In [109]:
len(Test_Y)

3000

In [122]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [93]:
Train_Y

array([0, 0, 1, ..., 1, 0, 0])

In [94]:
Test_Y

array([0, 1, 1, ..., 1, 0, 0])

In [125]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [129]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  83.33333333333334


In [130]:
predictions_NB

array([0, 0, 0, ..., 1, 0, 0])

In [None]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

In [23]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [24]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

Naive Bayes Accuracy Score ->  83.33333333333334


SVM Accuracy Score ->  84.66666666666667


In [99]:
X = list(range(10))
print(X)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [100]:
y = [x * x for x in X]

In [104]:
import sklearn.model_selection as model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.3)
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[2, 7, 5]
[1, 9, 4, 6, 0, 3, 8]
[4, 49, 25]
[1, 81, 16, 36, 0, 9, 64]


In [117]:
example = ['strong', 'weak', 'normal', 'weak', 'strong']
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit_transform(example)


array([1, 2, 0, 2, 1])