In [1]:
"""
Created on Tue Apr  2 17:03:53 2019

@author: jesskim
"""

import numpy as np 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer # TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.tokenize.api import TokenizerI
from nltk.tokenize import PunktSentenceTokenizer

In [2]:
dataset = pd.read_csv('googleplaystore_user_reviews.csv')
dataset = dataset.dropna()
dataset['Sentiment'].value_counts()

Positive    23998
Negative     8271
Neutral      5158
Name: Sentiment, dtype: int64

## Preprocess

In [3]:
dataset.Sentiment[dataset.Sentiment =='Positive'] = 0
dataset.Sentiment[dataset.Sentiment =='Neutral'] = 1
dataset.Sentiment[dataset.Sentiment =='Negative'] = 2

dataset['Sentiment'] = dataset['Sentiment'].astype('int')
#dataset["index"] = range(0,37427)
dataset['index'] = range(dataset.shape[0])
dataset = dataset.set_index("index")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
# Sort by most positive reviews
ranked_app = dataset.groupby('App').mean()['Sentiment_Polarity'].sort_values(ascending=False)
##ranked_app

In [6]:
stemmer = PorterStemmer() 

def clean_text(review):
    review = re.sub('[^a-zA-Z]', ' ', review) 
    review = re.sub('[/(){}\[\]\|@!,;]', ' ', review)
    review = re.sub('[^0-9a-z #+_♥️]', ' ', review) #Remove bad symbols
    review = review.lower()
    review = review.split()
   
    review = [stemmer.stem(token) for token in review if token not in set(stopwords.words('english'))]
    review =' '.join(review)
    
    return review

corpus = dataset.Translated_Review.apply(clean_text).tolist()

## Feature Extraction

#### Feature Extraction Method 1: TF-IDF

In [7]:
#cv = TfidfVectorizer(max_features = 16000)
cv = TfidfVectorizer(max_features = 32000, max_df=0.8, min_df=0.0001, ngram_range=[1, 2]) 
X_tfidf = cv.fit_transform(corpus).toarray()
y_tfidf = dataset.iloc[:, 2].values

#### Feature Extraction Method 2: Doc2Vec

In [9]:
"""
Download Doc2Vec model from https://ibm.ent.box.com/s/3f160t4xpuya9an935k84ig465gvymm2
"""

model_path = '/Users/jesskim/Downloads/enwiki_dbow/doc2vec.bin'

tokenized_corpus = [word_tokenize(doc) for doc in corpus]
dv_model = Doc2Vec.load(model_path)

X_dv = np.array([dv_model.infer_vector(tokenized_doc) for tokenized_doc in tokenized_corpus])
y_dv = dataset.iloc[:, 2].values

In [10]:
def extract_features(mode):
    if mode == 'tfidf':
        return X_tfidf, y_tfidf
    elif mode == 'doc2vec':
        return X_dv, y_dv

In [11]:
#CHANGE this to change method for feature extraction
X, y = extract_features('doc2vec')

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Classifier

In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

CLASSIFIERS = {
    'gaussian': GaussianNB(),
    'rf': RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0),
    'dt': DecisionTreeClassifier(criterion = 'entropy', random_state = 0),
    'svc': SVC(kernel = 'linear', random_state = 0)
}

In [None]:
classifier = CLASSIFIERS['dt']
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)


In [None]:
np.trace(cm)