In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import spacy
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score

In [2]:
#Data import 

import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import bz2
train = bz2.BZ2File('../input/amazonreviews/train.ft.txt.bz2')
test = bz2.BZ2File('../input/amazonreviews/test.ft.txt.bz2')        

/kaggle/input/amazonreviews/test.ft.txt.bz2
/kaggle/input/amazonreviews/train.ft.txt.bz2


In [4]:
#Read text from dataset

training_data = train.readlines()
test_data = test.readlines()

training_data = [x.decode('utf-8') for x in training_data]
test_data = [x.decode('utf-8') for x in test_data]

In [42]:
#Sample data from training set
training_data[:2]

['__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n',
 "__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n"]

In [43]:
#Sample data from test set
test_data[:2]

['__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"\n',
 "__label__2 One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too 

In [8]:
#Extracting text labels
label_train = [0 if line.split()[0] == '__label__1' else 1 for line in training_data]
label_test = [0 if line.split()[0] == '__label__1' else 1 for line in test_data]

In [13]:
sns.set_style('whitegrid')

In [14]:
sns.countplot(label_train)

In [15]:
sns.countplot(label_test)

In [16]:
#Extracting text and removing labels
   
training_set = []

for line in training_data:
    training_set.append(' '.join(line.split()[1:]))
    
testing_set = []

for line in test_data:
    testing_set.append(' '.join(line.split()[1:]))    


In [44]:
#Sample data
training_set[:1]

['Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^']

In [21]:
#Text preprocessing

nlp = spacy.load('en_core_web_lg')
spacy_stopwords = nlp.Defaults.stop_words
nltk_stopwords = stopwords.words('english')
stopwords_list = list(spacy_stopwords) + nltk_stopwords
punct = string.punctuation


In [22]:
#Function to tokenize,lemmatize,remove stopwords & punctuation

def text_processing(data):

    #Tokenization
    token_train = list(map(str.split,data))
    length = np.cumsum([0] + list(map(len, token_train)))
        
    all_words = [item for sublist in token_train for item in sublist]
    doc = spacy.tokens.Doc(nlp.vocab, words=all_words)
    
    #Lemmatization
    token_lemma = []

    for i in range(1,len(length)):
        slice_doc = doc[length[i-1]:length[i]]
        token_lemma.append([token.lemma_ for token in slice_doc])
    
    #Remove Punctuation
    token_no_punc = []
    for line in token_lemma:
        inner_token = []
        for token in line:
            inner_token.append(''.join(char for char in token if char not in punct))
        token_no_punc.append(inner_token)
        
    #Remove Stopwords
    token_no_stop = []

    for line in token_no_punc:
        token_no_stop.append([word for word in line if word if not word in stopwords_list])
    
    clean_data = [' '.join(line) for line in token_no_stop]
    return clean_data

    

In [23]:
#Preprocesing Training set
train_process = text_processing(training_set[:25000])

In [24]:
#Preprocesing Test set
test_process = text_processing(testing_set[:25000])

In [25]:
#TF-IDF 
tfidf = TfidfVectorizer(min_df=2,max_df=0.9)
train_dtm = tfidf.fit_transform(train_process)
test_dtm = tfidf.transform(test_process)

In [28]:
train_dtm

<25000x24754 sparse matrix of type '<class 'numpy.float64'>'
	with 781128 stored elements in Compressed Sparse Row format>

In [30]:
#Topic Modeling 

nmf = NMF(n_components=12)
nmf.fit(train_dtm)

In [33]:
for index,topic in enumerate(nmf.components_):
    print(f"Top 10 words for topic # {index}")
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 20 words for topic # 0
['man', 'line', 'is', 'little', 'to', 'way', 'author', 'life', 'write', 'interest', 'series', 'in', 'of', 'end', 'and', 'novel', 'plot', 'character', 'story', 'the']


Top 20 words for topic # 1
['finish', 'lot', 'want', 'help', 'life', 'books', 'learn', 'people', 'reader', 'think', 'interest', 'understand', 'information', 'find', 'page', 'recommend', 'write', 'author', 'this', 'book']


Top 20 words for topic # 2
['actor', 'plot', 'end', 'comedy', 'horror', 'special', 'stupid', 'scene', 'action', 'seen', 'enjoy', 'act', 'wrong', 'movies', 'batman', 'bad', 'this', 'funny', 'watch', 'movie']


Top 20 words for topic # 3
['day', 'send', 'the', 'recommend', 'look', 'service', 'hair', 'quality', 'fit', 'ship', 'return', 'price', 'size', 'amazon', 'purchase', 'receive', 'use', 'item', 'order', 'product']


Top 20 words for topic # 4
['cruisers', 'these', 'rash', 'wet', 'son', 'they', 'we', 'brand', 'size', 'huggies', 'night', 'pamper', 'swaddlers', 'use', 'leak', 

In [None]:
#Text Classification using SVC

svc = LinearSVC()

svc.fit(train_dtm,label_train[:25000])


svc_predict = svc.predict(test_dtm)

print(f"The accuracy of SVC is {accuracy_score(label_test[:25000],svc_predict)*100}%")

In [None]:
#Text Classification using Naive Bayes


mnb_model = MultinomialNB()

mnb_model.fit(train_dtm,label_train[:25000])


mnb_predict = mnb_model.predict(test_dtm)

print(f"The accuracy of Naive Bayes is {accuracy_score(label_test[:25000],mnb_predict)*100}%")


In [34]:
#Sentiment analysis

vader = SentimentIntensityAnalyzer()

vader_label = []

def sentiment_analyser(data):
    for text in data:
        vader_label.append(1 if vader.polarity_scores(text)['compound'] >=0.5 else 0)

sentiment_analyser(training_set[:10000])    



In [38]:
#Accuracy using Vader Sentiment Analyzer
print(f"The accuracy of Vader Sentiment Analyzer is {accuracy_score(label_train[:10000],vader_label)*100}%")

The accuracy of Vader Sentiment Analyzer is 73.89%
