Imports

In [1]:
import json

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Seed reused by the train/test splits, the randomized searches and the forest grid.
RANDOM_STATE: int = 4
# Worker count passed to scikit-learn through ``n_jobs``.
N_JOBS: int = 8

In [3]:
# Keys read from every review record.
text_label: str = 'stopwords_removal_nltk'  # review text (name suggests stop words were already removed)
senti_label: str = 'sentiScore'  # numeric sentiment score of the review
label_name: str = 'label'  # class label column

# Column order of the DataFrames built from the JSON files.
features: list = [text_label, senti_label, label_name]


Parameters to tune for SVC

In [4]:
# Regularisation strength.
# NOTE(review): the sequence jumps from 0.001 to 0.1 (no 0.01) - confirm this is intentional.
C = [1e-4, 1e-3, 1e-1, 1]

# Kernel coefficient; 'auto' lets scikit-learn derive it from the number of features.
gamma = [1e-4, 1e-3, 1e-2, 1, 10, 100, 'auto']

# Polynomial degree; only the 'poly' kernel makes use of it.
degree = [1, 2, 3, 4]

# Kernels to try.
kernel = ['rbf', 'poly']

# Probability estimates must be on: OVRClassifier calls predict_proba on the fitted model.
probability = [True]

# Candidate values sampled by RandomizedSearchCV for the SVC models.
random_grid_svc = dict(C=C,
                       kernel=kernel,
                       gamma=gamma,
                       degree=degree,
                       probability=probability)


Parameters to tune for Random Forest

In [5]:
# Candidate values sampled by RandomizedSearchCV for the random forest models.
# Single-element lists (n_jobs, random_state) pin that value on every candidate.
# NOTE(review): n_jobs is set here AND on RandomizedSearchCV, which may oversubscribe
# the CPU (N_JOBS x N_JOBS workers) - confirm this is wanted.
random_grid_forest = dict(
    bootstrap=[True, False],
    max_depth=[*range(10, 101, 10), None],  # 10, 20, ..., 100 and "unlimited"
    max_features=['log2', 'sqrt'],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
    n_jobs=[N_JOBS],
    random_state=[RANDOM_STATE],
    criterion=['gini', 'entropy', 'log_loss'],
    n_estimators=[*range(200, 2001, 200)],  # 200, 400, ..., 2000
)


Tfidf parameters

In [6]:
# Use unigrams and bigrams as TF-IDF terms.
NGRAM_RANGE: tuple = (1, 2)
# Ignore terms that appear in less than 1% of the documents.
MIN_DF: float = .01
# Ignore terms that appear in more than 80% of the documents.
MAX_DF: float = .8
# Upper bound on the vocabulary size kept by the vectorizer.
MAX_FEATURES: int = 300

#### Data manipulation

In [7]:
def load_file(file_path, columns=None):
    """
    Load a JSON file of review records into a DataFrame.

    :param file_path: path to the json file; it must hold a list of objects
    :param columns: keys to extract from every record, in column order;
        defaults to the module-level ``features`` list

    :return: a ``pd.DataFrame`` with one row per record and one column per
        requested key

    :raises KeyError: if a record lacks one of the requested keys
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(file_path, encoding='utf-8') as json_file:
        raw_data = json.load(json_file)
    return convert_data(raw_data, columns)


def convert_data(raw_data, columns=None):
    """
    Turn a list of record dicts into a DataFrame restricted to ``columns``.

    :param raw_data: iterable of dict-like records
    :param columns: keys to extract from every record, in column order;
        defaults to the module-level ``features`` list

    :return: a ``pd.DataFrame`` whose columns are exactly ``columns``

    :raises KeyError: if a record lacks one of the requested keys
    """
    if columns is None:
        columns = features
    columns = list(columns)
    extracted = [[elem[col] for col in columns] for elem in raw_data]
    return pd.DataFrame(extracted, columns=columns)

In [8]:
# One JSON file per review class; each becomes its own DataFrame.
data_bug, data_feat, data_rating, data_ux = map(
    load_file,
    ('Bug_tt.json', 'Feature_tt.json', 'Rating_tt.json', 'UserExperience_tt.json'),
)

In [9]:
def transform_label(data: pd.DataFrame, class_name, inplace=False):
    """
    Binarise the label column: 1 where it equals ``class_name``, 0 elsewhere.

    :param data: frame holding the ``label_name`` column
    :param class_name: label value to treat as the positive class
    :param inplace: overwrite the column of ``data`` itself instead of a copy

    :return: the relabelled frame (``data`` itself when ``inplace`` is true)

    Note: applying this to an already binarised frame compares the 0/1 values
    with ``class_name`` again, so a string class name then yields all zeros.
    """
    is_positive = data[label_name] == class_name
    target = data if inplace else data.copy()
    target[label_name] = np.where(is_positive, 1, 0)
    return target

In [10]:
# Binarise each dataset against its own positive class (1 = that class, 0 = anything else).
# WARNING: these calls overwrite the label column in place, so this cell is not
# idempotent - running it a second time compares the 0/1 values with the class
# names and turns every label into 0. Re-run the loading cell first.
transform_label(data_bug, 'Bug', inplace=True)
transform_label(data_feat, 'Feature', inplace=True)
transform_label(data_rating, 'Rating', inplace=True)
# Last expression of the cell: the returned frame (data_ux) is what gets displayed.
transform_label(data_ux, 'UserExperience', inplace=True)


Unnamed: 0,stopwords_removal_nltk,sentiScore,label
0,"still need check each, think need improve tech...",2,1
1,app helpful!!!! truly amazing!!!! paid bills,4,1
2,awesome!!!,4,1
3,love app takes less ten seconds let know batte...,3,1
4,it's app pretty much everything making list of...,3,1
...,...,...,...
735,crashes lenovo tablet app keeps crashing loses...,-2,0
736,terrible took forever download,-4,0
737,norefund wtf seems like nice game working sgs2...,-3,0
738,need speed wanted would good game would load i...,2,0


In [11]:
def _split_dataset(data):
    """Split one binary dataset into X_train, X_test, y_train, y_test with 15% held out."""
    return train_test_split(data[[text_label, senti_label]],
                            data[label_name],
                            test_size=.15,
                            random_state=RANDOM_STATE)


# Same split settings and seed for all four binary datasets.
X_train_bug, X_test_bug, y_train_bug, y_test_bug = _split_dataset(data_bug)
X_train_feat, X_test_feat, y_train_feat, y_test_feat = _split_dataset(data_feat)
X_train_rating, X_test_rating, y_train_rating, y_test_rating = _split_dataset(data_rating)
X_train_ux, X_test_ux, y_train_ux, y_test_ux = _split_dataset(data_ux)


## Native Multiclass Algorithm (Random Forest)

In [12]:
def vectorized_text(train, test):
    """
    Build dense feature matrices: TF-IDF of the text column plus the sentiment score.

    The vectorizer is fitted on ``train`` only and then applied to ``test``.

    :param train: frame with the ``text_label`` and ``senti_label`` columns
    :param test: frame with the same two columns
    :return: ``(features_train, features_test)`` numpy arrays; the sentiment
        score is the last column of each
    """
    vectorizer = TfidfVectorizer(encoding='utf-8',
                                 ngram_range=NGRAM_RANGE,
                                 stop_words=None,
                                 lowercase=False,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 max_features=MAX_FEATURES,
                                 norm='l2',
                                 sublinear_tf=True)
    tfidf_train = vectorizer.fit_transform(train[text_label]).toarray()
    tfidf_test = vectorizer.transform(test[text_label]).toarray()
    # Double brackets keep the score 2-D so it can be joined column-wise.
    senti_train = train[[senti_label]].to_numpy()
    senti_test = test[[senti_label]].to_numpy()
    return (np.concatenate((tfidf_train, senti_train), axis=1),
            np.concatenate((tfidf_test, senti_test), axis=1))


def drop_non_class(X, y):
    """
    Keep only the positive examples (label equal to 1).

    :param X: feature frame
    :param y: labels matching the rows of ``X``
    :return: ``(X_pos, y_pos)`` - the rows of ``X`` whose label is 1 and their labels
    """
    labelled = X.assign(**{label_name: y})
    positives = labelled.loc[y == 1]
    return positives[X.columns], positives[label_name]


def combine_datasets(X_bug, y_bug, X_feat, y_feat, X_rating, y_rating, X_ux, y_ux):
    """
    Merge the positives of the four binary datasets into one multiclass dataset.

    Negative rows of every dataset are dropped; the remaining rows are stacked and
    labelled by origin: 0 = bug, 1 = feature, 2 = rating, 3 = user experience.

    :return: ``(X, y)`` - the stacked feature frame (original row indices are kept,
        so they may repeat) and a numpy array of class ids
    """
    pairs = ((X_bug, y_bug), (X_feat, y_feat), (X_rating, y_rating), (X_ux, y_ux))
    positives = [drop_non_class(X_part, y_part) for X_part, y_part in pairs]
    X = pd.concat([X_part for X_part, _ in positives], axis=0)
    # The position of a dataset in ``pairs`` is its class id.
    y = np.concatenate([np.ones_like(y_part) * class_id
                        for class_id, (_, y_part) in enumerate(positives)])
    return X, y


In [13]:
# Multiclass training set built from the positives of the four training splits.
X_train_combined, y_train_combined = combine_datasets(
    X_bug=X_train_bug, y_bug=y_train_bug,
    X_feat=X_train_feat, y_feat=y_train_feat,
    X_rating=X_train_rating, y_rating=y_train_rating,
    X_ux=X_train_ux, y_ux=y_train_ux,
)
X_train_combined

Unnamed: 0,stopwords_removal_nltk,sentiScore
65,love nook ipad love many books always me somet...,3
276,crashes update,-1
70,please fix it's often used function,2
337,"really wanted note system could trust, evernot...",-1
62,important add landscape sheet orientation word...,-1
...,...,...
109,love photos backed automatically keep phone,3
58,"aside occasional glitch, pretty good awesome g...",-3
360,everything wanted craigslist never thought of!...,3
174,exactly says tin effecient enough check respon...,-1


In [14]:
# Multiclass test set built from the positives of the four test splits.
X_test_combined, y_test_combined = combine_datasets(
    X_bug=X_test_bug, y_bug=y_test_bug,
    X_feat=X_test_feat, y_feat=y_test_feat,
    X_rating=X_test_rating, y_rating=y_test_rating,
    X_ux=X_test_ux, y_ux=y_test_ux,
)
X_test_combined

Unnamed: 0,stopwords_removal_nltk,sentiScore
104,"used used app, it's pain find hotels searching...",-4
83,need improve functionality app can't vote revi...,2
17,liked much upgrade pdfs (divisions search) how...,3
139,finebt cud b improved!! d popular app smartphn...,3
303,update makes harder slower use icons different...,-1
...,...,...
253,far best organizational app around use dozens ...,2
249,best! really like features especially stickers ?,3
146,enjoy sending verses friends iphone,3
358,"good ,i like it accurate high tech",2


In [15]:
# TF-IDF (fitted on the training text only) plus the sentiment score as last column.
X_train_combined_vec_senti, X_test_combined_vec_senti = vectorized_text(
    train=X_train_combined, test=X_test_combined)
X_train_combined_vec_senti.shape

(1183, 301)

In [16]:
# Randomized hyper-parameter search for the native multiclass random forest:
# 50 sampled candidates, each scored by 3-fold cross-validated accuracy.
clf_forest = RandomForestClassifier()
random_cv_forest = RandomizedSearchCV(
    clf_forest,
    random_grid_forest,
    n_iter=50,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)
random_cv_forest.fit(X_train_combined_vec_senti, y_train_combined)


Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [17]:
# Evaluate the tuned multiclass forest on the held-out combined test set.
# average=None yields one score per class, ordered by class id as assigned in
# combine_datasets: 0 = bug, 1 = feature, 2 = rating, 3 = user experience.
predicted_combined = random_cv_forest.predict(X_test_combined_vec_senti)
print(f'''Testing Forest Multiclass
Accuracy: {accuracy_score(y_test_combined, predicted_combined)}
F1 Score: {f1_score(y_test_combined, predicted_combined, average=None)}
Precision: {precision_score(y_test_combined, predicted_combined, average=None)}
Recall: {recall_score(y_test_combined, predicted_combined, average=None)}
''')

Testing Forest Multiclass
Accuracy: 0.5765765765765766
F1 Score: [0.60465116 0.31428571 0.63492063 0.63865546]
Precision: [0.55714286 0.44       0.59701493 0.63333333]
Recall: [0.66101695 0.24444444 0.6779661  0.6440678 ]



## One vs Rest method

An implementation of a one-vs-rest classifier

In [18]:
class OVRClassifier:
    """
    One-vs-rest review classifier made of four independent binary models.

    One model each is kept for the bug, feature, rating and user-experience
    classes. Every model owns its TF-IDF vectorizer and ``StandardScaler`` and is
    tuned with ``RandomizedSearchCV`` on its own binary dataset. ``predict``
    returns, per row, the index of the model that reports the highest
    probability for its positive class (0 = bug, 1 = feature, 2 = rating, 3 = ux).

    :param clf_type: ``'svc'`` or ``'forest'`` - estimator family of all four models
    :raises ValueError: if ``clf_type`` is not one of the supported names
    """

    def __init__(self, clf_type='svc') -> None:
        self.clf_type = clf_type
        self.bug_scaler = StandardScaler()
        self.bug_clf = self._make_clf()
        self.bug_vectorizer = self._make_tfidf()
        self.feat_scaler = StandardScaler()
        self.feat_clf = self._make_clf()
        self.feat_vectorizer = self._make_tfidf()
        self.rating_scaler = StandardScaler()
        self.rating_clf = self._make_clf()
        self.rating_vectorizer = self._make_tfidf()
        self.ux_scaler = StandardScaler()
        self.ux_clf = self._make_clf()
        self.ux_vectorizer = self._make_tfidf()
        # Search space per estimator family, keyed by clf_type.
        self.random_grid = {'svc': random_grid_svc,
                            'forest': random_grid_forest}

    def _make_tfidf(self, ngram_range=NGRAM_RANGE, max_df=MAX_DF, min_df=MIN_DF, max_features=MAX_FEATURES):
        """Create an unfitted TF-IDF vectorizer with the notebook-wide settings."""
        return TfidfVectorizer(encoding='utf-8',
                               ngram_range=ngram_range,
                               stop_words=None,
                               lowercase=False,
                               max_df=max_df,
                               min_df=min_df,
                               max_features=max_features,
                               norm='l2',
                               sublinear_tf=True)

    def _make_random_searcher_svc(self, clf):
        """
        Wrap ``clf`` in a randomized search over the grid of ``self.clf_type``.

        Despite the name, this is used for both the SVC and the forest models.
        """
        return RandomizedSearchCV(estimator=clf,
                                  param_distributions=self.random_grid[self.clf_type],
                                  n_iter=50,
                                  scoring='accuracy',
                                  cv=3,
                                  verbose=1,
                                  n_jobs=N_JOBS,
                                  random_state=RANDOM_STATE)

    def _make_clf(self):
        """Create an unfitted, seeded estimator of the configured family."""
        if self.clf_type == 'svc':
            # The search grid turns on probability=True, which makes SVC run an
            # internal, randomly shuffled cross-validation. Without a seed the
            # probabilities (and therefore predict) change from run to run.
            return SVC(random_state=RANDOM_STATE)
        if self.clf_type == 'forest':
            # The forest grid pins random_state as well; seeding here keeps the
            # estimator deterministic even if it is used outside the search.
            return RandomForestClassifier(random_state=RANDOM_STATE)
        raise ValueError(f'Unknown classifier type: {self.clf_type}')

    def _fit(self, X, y, vectorizer, scaler, clf):
        """
        Fit ``vectorizer`` and ``scaler`` on ``X``, then tune ``clf`` on the result.

        :param X: frame with the text and sentiment columns
        :param y: binary labels for ``X``
        :return: the best estimator found by the randomized search
        """
        X_vec = vectorizer.fit_transform(X[text_label]).toarray()
        X_vec_senti = np.append(
            X_vec, X[[senti_label]].to_numpy(), axis=1)
        # NOTE(review): vectorizer and scaler see the whole training set before the
        # search splits it into folds, so the cross-validated score below is likely
        # a little optimistic.
        X_normalized = scaler.fit_transform(X_vec_senti)
        random_cv = self._make_random_searcher_svc(clf)
        random_cv.fit(X_normalized, y)
        # best_score_ is the mean cross-validated accuracy of the best candidate,
        # not accuracy measured on the training data itself.
        print(f'Training accuracy: {random_cv.best_score_}')
        return random_cv.best_estimator_

    def _fit_bug(self, X, y):
        """Fit the bug-vs-rest model."""
        self.bug_clf = self._fit(X, y, self.bug_vectorizer,
                                 self.bug_scaler, self.bug_clf)

    def _fit_feat(self, X, y):
        """Fit the feature-vs-rest model."""
        self.feat_clf = self._fit(X, y, self.feat_vectorizer,
                                  self.feat_scaler, self.feat_clf)

    def _fit_rating(self, X, y):
        """Fit the rating-vs-rest model."""
        self.rating_clf = self._fit(X, y, self.rating_vectorizer,
                                    self.rating_scaler, self.rating_clf)

    def _fit_ux(self, X, y):
        """Fit the user-experience-vs-rest model."""
        self.ux_clf = self._fit(X, y, self.ux_vectorizer,
                                self.ux_scaler, self.ux_clf)

    def _predict_proba(self, X, vectorizer, scaler, clf):
        """Return ``clf.predict_proba`` for ``X`` after applying the fitted vectorizer and scaler."""
        X_vec = vectorizer.transform(X[text_label]).toarray()
        X_vec_senti = np.append(X_vec, X[[senti_label]].to_numpy(), axis=1)
        X_normalized = scaler.transform(X_vec_senti)
        return clf.predict_proba(X_normalized)

    def _predict(self, X, vectorizer, scaler, clf):
        """Return the most probable class label of one binary model for every row of ``X``."""
        predicted_proba = self._predict_proba(X, vectorizer, scaler, clf)
        # Columns of predict_proba follow clf.classes_; map the winning column back
        # to its label instead of assuming the labels are 0..k-1.
        return clf.classes_[np.argmax(predicted_proba, axis=1)]

    @staticmethod
    def _positive_column(proba, clf):
        """
        Pick the probability of the positive class (label 1) out of ``proba``.

        The column is looked up in ``clf.classes_`` rather than hard-coded. A model
        that never saw a positive example contributes probability 0 for every row.
        """
        classes = list(clf.classes_)
        if 1 not in classes:
            return np.zeros(proba.shape[0])
        return proba[:, classes.index(1)]

    def _predict_proba_bug(self, X):
        """Class probabilities of the bug model."""
        return self._predict_proba(X, self.bug_vectorizer, self.bug_scaler, self.bug_clf)

    def _predict_proba_feat(self, X):
        """Class probabilities of the feature model."""
        return self._predict_proba(X, self.feat_vectorizer, self.feat_scaler, self.feat_clf)

    def _predict_proba_rating(self, X):
        """Class probabilities of the rating model."""
        return self._predict_proba(X, self.rating_vectorizer, self.rating_scaler, self.rating_clf)

    def _predict_proba_ux(self, X):
        """Class probabilities of the user-experience model."""
        return self._predict_proba(X, self.ux_vectorizer, self.ux_scaler, self.ux_clf)

    def _predict_bug(self, X):
        """Binary prediction of the bug model alone."""
        return self._predict(X, self.bug_vectorizer, self.bug_scaler, self.bug_clf)

    def fit(self, X_bug, y_bug, X_feat, y_feat, X_rating, y_rating, X_ux, y_ux):
        """
        Fit the four binary models, each on its own dataset.

        Every ``X_*`` is a frame with the text and sentiment columns and every
        ``y_*`` holds the matching binary labels (1 = that class).
        """
        self._fit_bug(X_bug, y_bug)
        self._fit_feat(X_feat, y_feat)
        self._fit_rating(X_rating, y_rating)
        self._fit_ux(X_ux, y_ux)

    def predict(self, X):
        """
        Predict the class id of every row of ``X``.

        :param X: frame with the text and sentiment columns
        :return: array of class ids (0 = bug, 1 = feature, 2 = rating, 3 = ux); ties
            go to the lowest id
        """
        bug_proba = self._positive_column(self._predict_proba_bug(X), self.bug_clf)
        feat_proba = self._positive_column(self._predict_proba_feat(X), self.feat_clf)
        rating_proba = self._positive_column(self._predict_proba_rating(X), self.rating_clf)
        ux_proba = self._positive_column(self._predict_proba_ux(X), self.ux_clf)
        return np.argmax([bug_proba, feat_proba, rating_proba, ux_proba], axis=0)


In [19]:
# One-vs-rest ensemble of four tuned SVC models, one per review class.
ova_svc = OVRClassifier(clf_type='svc')
ova_svc.fit(X_bug=X_train_bug, y_bug=y_train_bug,
            X_feat=X_train_feat, y_feat=y_train_feat,
            X_rating=X_train_rating, y_rating=y_train_rating,
            X_ux=X_train_ux, y_ux=y_train_ux)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Training accuracy: 0.7312903470798208
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Training accuracy: 0.7532348111658456
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Training accuracy: 0.7630819472924736
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Training accuracy: 0.7790081263765475


In [20]:
# One-vs-rest ensemble of four tuned random forests, one per review class.
ova_forest = OVRClassifier(clf_type='forest')
ova_forest.fit(X_bug=X_train_bug, y_bug=y_train_bug,
               X_feat=X_train_feat, y_feat=y_train_feat,
               X_rating=X_train_rating, y_rating=y_train_rating,
               X_ux=X_train_ux, y_ux=y_train_ux)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Training accuracy: 0.7503227766385662
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Training accuracy: 0.8048823207443897
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Training accuracy: 0.7694615326194274
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Training accuracy: 0.7869294448241817


In [21]:
# Evaluate the SVC one-vs-rest ensemble on the combined multiclass test set.
# average=None yields one score per class, ordered by class id as assigned in
# combine_datasets: 0 = bug, 1 = feature, 2 = rating, 3 = user experience.
pred_test_svc = ova_svc.predict(X_test_combined)
print(f'''Testing SVC one-vs-rest
Accuracy: {accuracy_score(y_test_combined, pred_test_svc)}
F1 Score: {f1_score(y_test_combined, pred_test_svc, average=None)}
Precision: {precision_score(y_test_combined, pred_test_svc, average=None)}
Recall: {recall_score(y_test_combined, pred_test_svc, average=None)}
''')


Testing SVC one-vs-rest
Accuracy: 0.5585585585585585
F1 Score: [0.55045872 0.51546392 0.57142857 0.58928571]
Precision: [0.6        0.48076923 0.53731343 0.62264151]
Recall: [0.50847458 0.55555556 0.61016949 0.55932203]



In [22]:
# Evaluate the random forest one-vs-rest ensemble on the combined multiclass test set.
# average=None yields one score per class, ordered by class id as assigned in
# combine_datasets: 0 = bug, 1 = feature, 2 = rating, 3 = user experience.
pred_test_forest = ova_forest.predict(X_test_combined)
print(f'''Testing Random Forest one-vs-rest
Accuracy: {accuracy_score(y_test_combined, pred_test_forest)}
F1 Score: {f1_score(y_test_combined, pred_test_forest, average=None)}
Precision: {precision_score(y_test_combined, pred_test_forest, average=None)}
Recall: {recall_score(y_test_combined, pred_test_forest, average=None)}
''')


Testing Random Forest one-vs-rest
Accuracy: 0.527027027027027
F1 Score: [0.3956044  0.48739496 0.61538462 0.57692308]
Precision: [0.5625     0.39189189 0.56338028 0.66666667]
Recall: [0.30508475 0.64444444 0.6779661  0.50847458]

