Imports

In [79]:
import json

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [14]:
# Column names shared by the cells below.
label_name = "label"                    # target column
senti_label = "sentiScore"              # numeric sentiment score, appended to the TF-IDF matrix
text_label = "stopwords_removal_nltk"   # review text that is fed to the TF-IDF vectorizer

# Keys pulled out of every raw JSON record, in the column order of the resulting DataFrame.
features = [text_label, senti_label, label_name]


In [4]:
def load_file(file_path):
    """
    Read one JSON file of labelled reviews and return it as a DataFrame.

    :param file_path: path to the json file

    :return: the value built by ``convert_data`` from the parsed JSON, i.e. a
             ``pd.DataFrame`` with one row per record and one column per
             entry of ``features``

    """
    # Decode explicitly as UTF-8: without ``encoding`` Python falls back to the
    # platform locale, so the same file could fail to decode (or decode
    # differently) on another machine.
    with open(file_path, encoding='utf-8') as json_file:
        raw_data = json.load(json_file)
    return convert_data(raw_data)


def convert_data(raw_data):
    """Project every raw record onto the columns named in ``features``.

    :param raw_data: iterable of mappings, each containing every key in ``features``
    :return: ``pd.DataFrame`` with one row per record and ``features`` as its columns
    """
    rows = []
    for record in raw_data:
        # Keep only the wanted keys, in the same order as the column list.
        rows.append([record[column] for column in features])
    return pd.DataFrame(rows, columns=features)

In [6]:
# Load one DataFrame per review category; each JSON file is read independently
# through load_file, so the four frames share the same column layout.
data_bug = load_file('Bug_tt.json')
data_feat = load_file('Feature_tt.json')
data_rating = load_file('Rating_tt.json')
data_ux = load_file('UserExperience_tt.json')

In [9]:
def transform_label(data: pd.DataFrame, label_name, inplace=False, label_column='label'):
    """Binarise the class column: 1 where it equals ``label_name``, 0 elsewhere.

    :param data: frame that holds the class column
    :param label_name: class value to treat as the positive class (e.g. ``'Bug'``)
    :param inplace: when True the column is overwritten on ``data`` itself,
                    otherwise a copy is modified and ``data`` is left untouched
    :param label_column: name of the column holding the class values
    :return: the frame carrying the binarised column (``data`` itself when
             ``inplace`` is True)

    Note: the result is not idempotent. Applying it a second time to an already
    binarised column compares 0/1 against ``label_name`` and yields all zeros.
    """
    # The class *value* and the class *column* are different things: indexing the
    # frame with ``label_name`` would look for a column called e.g. 'Bug'.
    binary = np.where(data[label_column] == label_name, 1, 0)
    target = data if inplace else data.copy()
    target[label_column] = binary
    return target

In [83]:
# Binarise each dataset against its own category. inplace=True makes
# transform_label write the new column onto the loaded frame itself.
# The cell's displayed value is the return of the last call (data_ux).
# NOTE(review): because the frames are overwritten in place, re-running this
# cell without re-running the loading cell operates on already-converted
# labels -- confirm that is acceptable or reload first.
transform_label(data_bug, 'Bug', inplace=True)
transform_label(data_feat, 'Feature', inplace=True)
transform_label(data_rating, 'Rating', inplace=True)
transform_label(data_ux, 'UserExperience', inplace=True)


Unnamed: 0,stopwords_removal_nltk,sentiScore,label
0,"still need check each, think need improve tech...",2,1
1,app helpful!!!! truly amazing!!!! paid bills,4,1
2,awesome!!!,4,1
3,love app takes less ten seconds let know batte...,3,1
4,it's app pretty much everything making list of...,3,1
...,...,...,...
735,crashes lenovo tablet app keeps crashing loses...,-2,0
736,terrible took forever download,-4,0
737,norefund wtf seems like nice game working sgs2...,-3,0
738,need speed wanted would good game would load i...,2,0


In [84]:
def _split_features_and_target(frame):
    """Hold out 15% of ``frame`` with a fixed seed.

    :param frame: dataset with the text, sentiment and label columns
    :return: ``X_train, X_test, y_train, y_test`` as produced by ``train_test_split``
    """
    inputs = frame[[text_label, senti_label]]
    target = frame[label_name]
    return train_test_split(inputs, target, test_size=.15, random_state=42)


# Same split settings for every category; each dataset is split independently.
X_train_bug, X_test_bug, y_train_bug, y_test_bug = _split_features_and_target(data_bug)
X_train_feat, X_test_feat, y_train_feat, y_test_feat = _split_features_and_target(data_feat)
X_train_rating, X_test_rating, y_train_rating, y_test_rating = _split_features_and_target(data_rating)
X_train_ux, X_test_ux, y_train_ux, y_test_ux = _split_features_and_target(data_ux)


TF-IDF Parameters

In [86]:
# Parameter selection (all four values are passed to TfidfVectorizer below)
NGRAM_RANGE = (1, 2)
MAX_FEATURES = 300
MIN_DF = 0.01  # ignore terms that appear in less than 1% of the documents
MAX_DF = 0.8  # ignore terms that appear in more than 80% of the documents

def vectorized_text(train, test):
    """Turn review text into dense TF-IDF matrices.

    The vectorizer is fitted on ``train`` only; ``test`` is then transformed
    with that already-fitted vectorizer.

    :param train: training documents
    :param test: test documents
    :return: tuple ``(train_matrix, test_matrix)`` of dense arrays
    """
    settings = dict(
        encoding='utf-8',
        ngram_range=NGRAM_RANGE,
        stop_words=None,
        lowercase=False,
        max_df=MAX_DF,
        min_df=MIN_DF,
        max_features=MAX_FEATURES,
        norm='l2',
        sublinear_tf=True,
    )
    vectorizer = TfidfVectorizer(**settings)
    train_matrix = vectorizer.fit_transform(train).toarray()
    test_matrix = vectorizer.transform(test).toarray()
    return train_matrix, test_matrix


In [91]:
# TF-IDF for the Bug task: the text column of the training split is used for
# fitting, the text column of the test split is only transformed.
X_train_vec_bug, X_test_vec_bug = vectorized_text(X_train_bug[text_label], X_test_bug[text_label])

In [92]:
# Add the sentiment score as one extra column to the right of the TF-IDF matrix.
# The double brackets keep the score two-dimensional so it can be joined along axis 1.
X_train_bug_vec_senti = np.concatenate(
    (X_train_vec_bug, X_train_bug[[senti_label]].to_numpy()), axis=1)
X_test_bug_vec_senti = np.concatenate(
    (X_test_vec_bug, X_test_bug[[senti_label]].to_numpy()), axis=1)


In [93]:
# Baseline for the Bug task: standardise the TF-IDF + sentiment matrix, then fit an SVM.
# probability=True makes SVC fit its probability model with an internal,
# randomly shuffled cross-validation; the fixed random_state keeps the
# probabilities shown below reproducible from run to run.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True, random_state=42))
clf.fit(X_train_bug_vec_senti, y_train_bug)
clf.predict_proba(X_test_bug_vec_senti)


array([[0.43021057, 0.56978943],
       [0.41888333, 0.58111667],
       [0.23149543, 0.76850457],
       [0.33602197, 0.66397803],
       [0.816044  , 0.183956  ],
       [0.94183234, 0.05816766],
       [0.33185109, 0.66814891],
       [0.65935388, 0.34064612],
       [0.28939056, 0.71060944],
       [0.22292645, 0.77707355],
       [0.24464754, 0.75535246],
       [0.97980484, 0.02019516],
       [0.66367196, 0.33632804],
       [0.90534764, 0.09465236],
       [0.91987781, 0.08012219],
       [0.82447043, 0.17552957],
       [0.60620484, 0.39379516],
       [0.14492329, 0.85507671],
       [0.17488222, 0.82511778],
       [0.20226863, 0.79773137],
       [0.93543808, 0.06456192],
       [0.2572643 , 0.7427357 ],
       [0.38319337, 0.61680663],
       [0.31948872, 0.68051128],
       [0.86817738, 0.13182262],
       [0.83570107, 0.16429893],
       [0.22692273, 0.77307727],
       [0.91646753, 0.08353247],
       [0.63491253, 0.36508747],
       [0.90207814, 0.09792186],
       [0.

In [120]:
class OVRClassifier:
    """Four independent binary SVM classifiers, one per review category.

    Every category (bug, feature, rating, user experience) owns its own TF-IDF
    vectorizer, scaler and SVC, so no vocabulary or scaling statistics are
    shared between them. Only per-category prediction is implemented; nothing
    here combines the four models into a single multi-class decision.
    """

    def __init__(self, random_state=42) -> None:
        """Create the unfitted vectorizer / scaler / classifier triple of each category.

        :param random_state: seed passed to every SVC. With ``probability=True``
                             the probability model is fitted with randomly
                             shuffled data, so an unseeded SVC gives slightly
                             different probabilities (and therefore possibly
                             different predictions) on every run. Pass ``None``
                             to get the unseeded behaviour.
        """
        # Must be set before the first _make_svc() call below.
        self.random_state = random_state
        self.bug_scaler = StandardScaler()
        self.bug_clf = self._make_svc()
        self.bug_vectorizer = self._make_tfidf()
        self.feat_scaler = StandardScaler()
        self.feat_clf = self._make_svc()
        self.feat_vectorizer = self._make_tfidf()
        self.rating_scaler = StandardScaler()
        self.rating_clf = self._make_svc()
        self.rating_vectorizer = self._make_tfidf()
        self.ux_scaler = StandardScaler()
        self.ux_clf = self._make_svc()
        self.ux_vectorizer = self._make_tfidf()

    def _make_svc(self):
        """Build one probability-enabled SVC seeded with ``self.random_state``."""
        return SVC(gamma='auto', probability=True, random_state=self.random_state)

    def _make_tfidf(self, ngram_range=NGRAM_RANGE, max_df=MAX_DF, min_df=MIN_DF, max_features=MAX_FEATURES):
        """Build an unfitted TF-IDF vectorizer; defaults come from the module-level constants."""
        return TfidfVectorizer(encoding='utf-8',
                               ngram_range=ngram_range,
                               stop_words=None,
                               lowercase=False,
                               max_df=max_df,
                               min_df=min_df,
                               max_features=max_features,
                               norm='l2',
                               sublinear_tf=True)

    def _featurize(self, X, vectorizer, fit=False):
        """Build the dense feature matrix: TF-IDF columns followed by the sentiment score.

        :param X: DataFrame containing the ``text_label`` and ``senti_label`` columns
        :param vectorizer: TF-IDF vectorizer of the category
        :param fit: when True the vectorizer is fitted on ``X`` before transforming
        :return: 2-D array with one row per row of ``X``
        """
        text = X[text_label]
        tfidf = vectorizer.fit_transform(text) if fit else vectorizer.transform(text)
        # Double brackets keep the score 2-D so it can be appended as a column.
        return np.append(tfidf.toarray(), X[[senti_label]].to_numpy(), axis=1)

    def _fit(self, X, y, vectorizer, scaler, clf):
        """Fit vectorizer, scaler and classifier on ``X`` / ``y`` and print the training accuracy."""
        X_vec_senti = self._featurize(X, vectorizer, fit=True)
        X_normalized = scaler.fit_transform(X_vec_senti)
        clf.fit(X_normalized, y)
        print(f'Training accuracy: {clf.score(X_normalized, y)}')

    def _fit_bug(self, X, y):
        """Fit the bug components."""
        self._fit(X, y, self.bug_vectorizer, self.bug_scaler, self.bug_clf)

    def _fit_feat(self, X, y):
        """Fit the feature-request components."""
        self._fit(X, y, self.feat_vectorizer,
                  self.feat_scaler, self.feat_clf)

    def _fit_rating(self, X, y):
        """Fit the rating components."""
        self._fit(X, y, self.rating_vectorizer,
                  self.rating_scaler, self.rating_clf)

    def _fit_ux(self, X, y):
        """Fit the user-experience components."""
        self._fit(X, y, self.ux_vectorizer,
                  self.ux_scaler, self.ux_clf)

    def _predict_proba(self, X, vectorizer, scaler, clf):
        """Return ``clf.predict_proba`` for ``X`` using the already-fitted vectorizer and scaler."""
        X_vec_senti = self._featurize(X, vectorizer)
        X_normalized = scaler.transform(X_vec_senti)
        return clf.predict_proba(X_normalized)

    def _predict(self, X, vectorizer, scaler, clf):
        """Return, per row, the class with the highest predicted probability.

        The argmax is a column index into ``predict_proba``; it is mapped through
        ``clf.classes_`` so the result is an actual class label. For 0/1 targets
        the two coincide.
        """
        predicted_proba = self._predict_proba(X, vectorizer, scaler, clf)
        return clf.classes_[np.argmax(predicted_proba, axis=1)]

    def _predict_bug(self, X):
        """Predict with the bug components."""
        return self._predict(X, self.bug_vectorizer, self.bug_scaler, self.bug_clf)

    def _predict_feat(self, X):
        """Predict with the feature-request components."""
        return self._predict(X, self.feat_vectorizer, self.feat_scaler, self.feat_clf)

    def _predict_rating(self, X):
        """Predict with the rating components."""
        return self._predict(X, self.rating_vectorizer, self.rating_scaler, self.rating_clf)

    def _predict_ux(self, X):
        """Predict with the user-experience components."""
        return self._predict(X, self.ux_vectorizer, self.ux_scaler, self.ux_clf)

    def fit(self, X_bug, y_bug, X_feat, y_feat, X_rating, y_rating, X_ux, y_ux):
        """Fit the four category models, each on its own training data.

        Training accuracy is printed once per model, in the order bug, feature,
        rating, user experience.
        """
        self._fit_bug(X_bug, y_bug)
        self._fit_feat(X_feat, y_feat)
        self._fit_rating(X_rating, y_rating)
        self._fit_ux(X_ux, y_ux)


In [121]:
# Train all four category models, then score only the Bug model on its held-out split.
ova_clf = OVRClassifier()
ova_clf.fit(
    X_train_bug, y_train_bug,
    X_train_feat, y_train_feat,
    X_train_rating, y_train_rating,
    X_train_ux, y_train_ux,
)
predicted_bug_test = ova_clf._predict_bug(X_test_bug)

bug_accuracy = accuracy_score(y_test_bug, predicted_bug_test)
bug_f1 = f1_score(y_test_bug, predicted_bug_test)
bug_precision = precision_score(y_test_bug, predicted_bug_test)
bug_recall = recall_score(y_test_bug, predicted_bug_test)
print(f'''Testing
Accuracy: {bug_accuracy}
F1 score: {bug_f1}
Precision: {bug_precision}
Recall: {bug_recall}
''')

Training accuracy: 0.9697933227344993
Training accuracy: 0.9770554493307839
Training accuracy: 0.9745627980922098
Training accuracy: 0.9475357710651828
Testing
Accuracy: 0.7387387387387387
F1 score: 0.7563025210084034
Precision: 0.7142857142857143
Recall: 0.8035714285714286



In [122]:
# Inspect the held-out Bug inputs (text and sentiment-score columns).
X_test_bug

Unnamed: 0,stopwords_removal_nltk,sentiScore
120,"love app, cant access recipes saved online sav...",3
416,probably great app 4 up ive used several 2 yea...,3
334,brilliant app crashes ipad iphone,2
350,"hotel reviews, especially poor hotels, fake li...",-2
412,cant even download purchased game wont even do...,-1
...,...,...
582,thanks thought & effort went creating app hand...,3
641,bad,-3
548,somehow lost contacts app got everything back,-1
148,its awesome however recently lapse receive mes...,3


In [123]:
# Inspect the held-out Bug targets that the metrics above were computed against.
y_test_bug

120    1
416    0
334    1
350    1
412    0
      ..
582    0
641    0
548    0
148    1
324    1
Name: label, Length: 111, dtype: int64