# Mobile App Review Rating Predictor
## Building a predictive model from 140,000+ Google Play Store app reviews spanning 10 different categories

#### Imports

In [66]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import joblib

#### Loading the dataset - trying another type of file (.tsv)
##### reviews.tsv holds the same data as reviews.csv, only tab-separated

In [67]:
# Read the tab-separated reviews file into a DataFrame
df = pd.read_csv('reviews.tsv', sep='\t')

#### Drop null values

In [68]:
# Keep only the rows whose review text ('content') is present
df = df[df['content'].notna()]

#### Reset the DataFrame index so that later loops can look rows up by position

In [69]:
# Renumber the rows 0..n-1 so that lookups by row number line up with the rows
# that are left in the frame.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column; without it, re-running this cell raises
# "cannot insert index, already exists".
df = df.reset_index(drop=True)

#### Cleaning the reviews and creating a corpus ready for vectorization

In [72]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string per review.

# The stop-word set and the stemmer are the same for every review, so create
# them once instead of once per word / per review.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []

# Iterate over the review texts themselves rather than a hard-coded row count,
# so the cell keeps working when the dataset size (or the index) changes.
for text in df['content']:
    # Replace every non-letter character with a space.
    # str() keeps re.sub from failing on a value that is not already a string.
    review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=str(text))

    # Convert the review to lower case
    review = review.lower()

    # Tokenize the review into words
    review_words = review.split()

    # Remove English stop words ('the', 'a', 'if', 'be', ...) that carry little
    # meaning for predicting the rating
    review_words = [word for word in review_words if word not in stop_words]

    # Stem the words - e.g. "retrieval", "retrieved", "retrieves" all reduce
    # to the common stem "retriev"
    review = [ps.stem(word) for word in review_words]

    # Join the stemmed words back into a single string
    review = ' '.join(review)

    # Add the cleaned review to the corpus
    corpus.append(review)

#### Creating Bag of Words model

In [73]:
# Bag-of-words features: counts of the 160 most frequent corpus terms
cv = CountVectorizer(max_features=160)
bow_matrix = cv.fit_transform(corpus)

# Dense feature matrix and the target column (the review's star rating)
X = bow_matrix.toarray()
y = df['score']

#### Creating a pickle file for the CountVectorizer model

In [82]:
# Persist the fitted vectorizer; the returned list of written files is the cell output
joblib.dump(value=cv, filename="assets/cv.pkl")

['assets/cv.pkl']

#### Model Building

In [75]:
# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable
splits = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = splits

#### Fitting Naive Bayes to the Training set

In [76]:
# Multinomial Naive Bayes with smoothing alpha=0.2, trained on the training split
classifier = MultinomialNB(alpha=0.2).fit(X_train, y_train)
# Show the fitted estimator as the cell output
classifier

MultinomialNB(alpha=0.2)

#### Creating a pickle file for the Multinomial Naive Bayes model

In [77]:
# Persist the trained classifier; the returned list of written files is the cell output
joblib.dump(value=classifier, filename="assets/model.pkl")

['assets/model.pkl']