In [1]:
import numpy as np 
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
import re

In [2]:
# Importing the dataset

test_raw = pd.read_csv('input/test.csv')

test_raw # 60427 reviews imported

Unnamed: 0,review_id,review
0,1,"Great danger, cool, motif and cantik2 jg model..."
1,2,One of the shades don't fit well
2,3,Very comfortable
3,4,Fast delivery. Product expiry is on Dec 2022. ...
4,5,it's sooooo cute! i like playing with the glit...
...,...,...
60422,60423,Product has been succesfully ordered and shipp...
60423,60424,Opening time a little scared. Fear dalemnya de...
60424,60425,The product quality is excellent. The origina...
60425,60426,They 're holding up REALLY well also .


In [3]:
# Extract the labels & sentiments from the training data

test_features = test_raw.iloc[:, 1].values

# Check features
i = 0

while i < 5:
    print(test_features[i], "\n")
    i += 1

Great danger, cool, motif and cantik2 jg models. Delivery cepet. Tp packing less okay krn only wear clear plastic nerawang klihtan contents jd 

One of the shades don't fit well 

Very comfortable 

Fast delivery. Product expiry is on Dec 2022. Product wrap properly. No damage on the item. 

it's sooooo cute! i like playing with the glitters better than browsing on my phone now. item was also deliered earlier than i expected. thank you seller! may you have more buyers to come. 😊😊😊 



In [4]:
import spacy

nlp_en = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])

In [5]:
processed_test_features = []

for sentence in range(0, len(test_features)):
    
    # Remove all special characters
    processed_feature = re.sub(r'\W', ' ', str(test_features[sentence]))
    
    # Remove all words that include digits / numbers
    processed_feature = re.sub(r'\w*\d\w*', ' ', processed_feature)
    
    # Remove all single characters
    processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    
    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)
    
    # NEW: Applying English - spaCy Lemmatization
    processed_feature = nlp_en(processed_feature)
    processed_feature = " ".join([token.lemma_ for token in processed_feature])
    
    # NEW: Applying Bahasa Indonesia - spaCy Lemmatization
    # processed_feature = nlp_id(processed_feature)
    # processed_feature = " ".join([token.lemma_ for token in processed_feature])
    
    # Substituing multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags = re.I)
    
    # Converting to lowercase
    processed_featured = processed_feature.lower()
    
    # Append cleaned review to processed list
    processed_test_features.append(processed_feature)

In [6]:
# Checking if labels / reviews have been cleaned up
i = 0

while i < 5:
    print(processed_test_features[i], "\n")
    i += 1

great danger cool motif and jg model Delivery cepet Tp packing less okay krn only wear clear plastic nerawang klihtan content jd 

one of the shade don fit well 

very comfortable 

fast delivery product expiry be on Dec product wrap properly no damage on the item 

-PRON- sooooo cute like play with the glitter well than browse on -PRON- phone now item be also deliere early than expect thank -PRON- seller may -PRON- have more buyer to come  



In [7]:
# Load Indonesian & English library from SpaCy
from spacy.lang.id.stop_words import STOP_WORDS as id_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

# Compiling stopwords list
final_stopwords_list = list(id_stop) + list(en_stop)

In [14]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features = 3500, min_df = 7, max_df = 0.75, stop_words = final_stopwords_list)
processed_test_features1 = vectorizer.fit_transform(processed_test_features).toarray()

vectorizer2 = TfidfVectorizer(max_features = 2500, min_df = 7, max_df = 0.75, stop_words = final_stopwords_list)
processed_test_features2 = vectorizer2.fit_transform(processed_test_features).toarray()



In [9]:
# UnPickle our earlier trained model

# New Model 1 : Accuracy of 45.17% on y-test

filename_01 = "rfc_new_model_01.pkl"
with open(filename_01, 'rb') as file1:
    pickle_rfc_01 = pickle.load(file1)
    
# New Model 2 : Accuracy of 44.19% on y-test
filename_02 = "rfc_new_model_02.pkl"
with open(filename_02, 'rb') as file2:
    pickle_rfc_02 = pickle.load(file2)

In [10]:
# Time to predict!

predictions_01 = pickle_rfc_01.predict(processed_test_features1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  1.2min finished


In [15]:
predictions_02 = pickle_rfc_02.predict(processed_test_features2)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    6.6s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:   15.8s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:   17.3s finished


In [16]:
# Checking the prediction output accurate & is between 1 to 5
# Check length to ensure input features = output predicted labels

print(min(predictions_01), min(predictions_02))
print(max(predictions_01), max(predictions_02))
print(len(predictions_01), len(predictions_02), "\nOriginal:", len(test_raw))

1 1
5 5
60427 60427 
Original: 60427


In [17]:
# Creating copies for ease of reusibility later

submission_03 = test_raw.copy()

submission_04 = test_raw.copy()

# Checking if copy is successful
submission_03.head()

Unnamed: 0,review_id,review
0,1,"Great danger, cool, motif and cantik2 jg model..."
1,2,One of the shades don't fit well
2,3,Very comfortable
3,4,Fast delivery. Product expiry is on Dec 2022. ...
4,5,it's sooooo cute! i like playing with the glit...


In [18]:
# Attaching the predictions to it's review_id
submission_03['rating'] = predictions_01
submission_04['rating'] = predictions_02

# Checking if columns are correctly attached
submission_03.head()

Unnamed: 0,review_id,review,rating
0,1,"Great danger, cool, motif and cantik2 jg model...",5
1,2,One of the shades don't fit well,1
2,3,Very comfortable,5
3,4,Fast delivery. Product expiry is on Dec 2022. ...,5
4,5,it's sooooo cute! i like playing with the glit...,1


In [19]:
# Removing the review column

submission_03 = submission_03.drop("review", axis = 1)
submission_04 = submission_04.drop("review", axis = 1)

# Checks if column has been dropped correctly
submission_03.head()

Unnamed: 0,review_id,rating
0,1,5
1,2,1
2,3,5
3,4,5
4,5,1


In [20]:
# Last check before saving to csv.

print("submission doc 3:", submission_03.shape)
print("submission doc 4:", submission_04.shape)

submission doc 3: (60427, 2)
submission doc 4: (60427, 2)


In [22]:
# Output results in csv format

submission_03.to_csv("shopeecodeleague_SentimentAnalysis_TeamJnny_03.csv", index = False, encoding = 'utf8')
submission_04.to_csv("shopeecodeleague_SentimentAnalysis_TeamJnny_04.csv", index = False, encoding = 'utf8')