In [1]:
# imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import VotingClassifier



In [2]:
# App identifiers grouped by developer company (three apps per developer).
# giveMeBestDev tallies reviews against these three lists.
aad1 = [
    'B004NWLM8K',
    'B004Q1NH4U',
    'B004LPBTAA',
]
aad2 = [
    'B004S6NAOU',
    'B004R6HTWU',
    'B004N8KDNY',
]
aad3 = [
    'B004KA0RBS',
    'B004NPELDA',
    'B004L26XXQ',
]

In [3]:
### this cell is only for declaring functions

def readData(filePath):
    """Read a tab-separated review file into three parallel lists.

    Every non-blank line must contain at least three tab-separated fields:
    an integer rating, an identifier and the review text.

    Args:
        filePath: path of the file to read.

    Returns:
        A tuple ``(ratings, identifiers, texts)`` of equally long lists.
        ``ratings`` holds ints; ``identifiers`` and ``texts`` hold the second
        and third field of each line as strings. The third field is returned
        exactly as read, so it keeps the line's trailing newline when it is
        the last field on the line.

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
        IndexError: if a non-blank line has fewer than three fields.
    """
    ratings = []
    identifiers = []
    texts = []
    # The context manager guarantees the handle is closed, even when a
    # malformed line raises part-way through the file.
    with open(filePath) as file:
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a stray newline at the end of the file)
                # carry no review and would otherwise crash int('').
                continue
            fields = line.split('\t')  # split once instead of once per field
            ratings.append(int(fields[0]))
            identifiers.append(fields[1])
            texts.append(fields[2])
    return ratings, identifiers, texts


def giveMeBag(trainData):
    """Fit a bag-of-words model on the training texts.

    A CountVectorizer capped at 30000 features, with English stop words
    removed, is fitted on ``trainData``.

    Returns:
        A pair ``(bag, words)``: the document-term matrix produced by
        ``fit_transform`` and the vectorizer's feature names, which can be
        passed as the vocabulary of another CountVectorizer.
    """
    vectorizer = CountVectorizer(stop_words='english', max_features=30000)
    bow_matrix = vectorizer.fit_transform(trainData)
    vocabulary = vectorizer.get_feature_names_out()
    return bow_matrix, vocabulary

def giveMeTreeTrained(training_data,training_labels):
    """Return a DecisionTreeClassifier (default settings) fitted on the given data."""
    # fit() hands back the estimator itself, so it can be returned directly.
    return DecisionTreeClassifier().fit(training_data, training_labels)

def giveMeKnn(training_data,training_labels,k):
    """Return a KNeighborsClassifier using ``k`` neighbours, fitted on the given data."""
    # fit() hands back the estimator itself, so it can be returned directly.
    return KNeighborsClassifier(n_neighbors=k).fit(training_data, training_labels)

def giveMeForest(training_data,training_labels,n):
    """Return a RandomForestClassifier with ``n`` estimators, fitted on the given data."""
    # fit() hands back the estimator itself, so it can be returned directly.
    return RandomForestClassifier(n_estimators=n).fit(training_data, training_labels)

def giveMeLSVM(training_data,training_label):
    """Return a LinearSVC (default settings) fitted on the given data."""
    # fit() hands back the estimator itself, so it can be returned directly.
    return LinearSVC().fit(training_data, training_label)

def giveMeNLSVM(training_data,training_label):
    """Return an SVC with an RBF kernel, fitted on the given data."""
    # fit() hands back the estimator itself, so it can be returned directly.
    return SVC(kernel='rbf').fit(training_data, training_label)

def giveMeGoodApps(predictedLab,appList):
    """Select the identifiers whose predicted label is 3 (positive sentiment).

    ``predictedLab`` and ``appList`` are parallel sequences: the label at
    position ``i`` belongs to the identifier at position ``i``. The result
    keeps the original order and repeats an identifier once per matching
    prediction.
    """
    return [
        appList[position]
        for position, label in enumerate(predictedLab)
        if label == 3
    ]

def giveMeBadApps(predictedLab,appList):
    """Select the identifiers whose predicted label is 1 (counterpart of giveMeGoodApps).

    ``predictedLab`` and ``appList`` are parallel sequences: the label at
    position ``i`` belongs to the identifier at position ``i``. The result
    keeps the original order and repeats an identifier once per matching
    prediction.
    """
    return [
        appList[position]
        for position, label in enumerate(predictedLab)
        if label == 1
    ]

def giveMeBestDev(appList):
    """Count how many entries of ``appList`` belong to each observed developer.

    Every identifier found in ``aad1``, ``aad2`` or ``aad3`` adds one to that
    developer's tally; identifiers in none of the three lists are ignored.

    Returns:
        A 3-tuple ``(aad1 count, aad2 count, aad3 count)``.
    """
    tallies = [0, 0, 0]
    for app in appList:
        # Checked in aad1 -> aad2 -> aad3 order; only the first match counts.
        for slot, catalogue in enumerate((aad1, aad2, aad3)):
            if app in catalogue:
                tallies[slot] += 1
                break
    return tuple(tallies)




In [4]:
# Load the data sets and build the bag-of-words matrices used by every model below
trainingRatings,trainingIdentifiers,trainingReviews = readData('reviews_Apps_for_Android_5.training.txt') # training ratings, app identifiers and review texts
trainingWordsBag,trainingWords = giveMeBag(trainingReviews) # training bag of words plus the vocabulary to reuse for the test set

testRatings,testIdentifiers,testReviews = readData('reviews_Apps_for_Android_5.test.txt') # test ratings, app identifiers and review texts

# The test vectorizer is given the training vocabulary, so the test matrix has
# the same columns as the matrix the classifiers are trained on.
testCV = CountVectorizer(stop_words='english', vocabulary=trainingWords) # test count vectorizer restricted to the training vocabulary
testWordsBag = testCV.fit_transform(testReviews) # bag of words of the test reviews, used as input for prediction

In [5]:
# DECISION TREE: train and evaluate
trainedTree = giveMeTreeTrained(trainingWordsBag,trainingRatings) # decision tree fitted on the training data
DTtestPredLabels = trainedTree.predict(testWordsBag) # predict the test labels with the trained decision tree

# print precision, recall and f1-score for each rating label (decision tree)
print('DECISION TREE RESULTS\n-------------------------------------------------------------------------------')
print(classification_report(testRatings,DTtestPredLabels))

DECISION TREE RESULTS
-------------------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.45      0.41      0.43      3469
           2       0.21      0.18      0.19      2087
           3       0.81      0.84      0.83     14443

    accuracy                           0.70     19999
   macro avg       0.49      0.48      0.48     19999
weighted avg       0.69      0.70      0.69     19999



In [6]:
# K-NN (K=1): train and evaluate
trainedK = giveMeKnn(trainingWordsBag,trainingRatings,1) # 1-nearest-neighbour classifier fitted on the training data
K1testPredLabels = trainedK.predict(testWordsBag) # predict the test labels with the K=1 classifier
# print precision, recall and f1-score for each rating label (K-NN, K=1)
print('K-NN (K=1) RESULTS\n-------------------------------------------------------------------------------')
print(classification_report(testRatings,K1testPredLabels))

K-NN (K=1) RESULTS
-------------------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.30      0.34      0.32      3469
           2       0.14      0.15      0.15      2087
           3       0.78      0.74      0.76     14443

    accuracy                           0.61     19999
   macro avg       0.41      0.41      0.41     19999
weighted avg       0.63      0.61      0.62     19999



In [7]:
# K-NN (K=3): train and evaluate
trainedK3 = giveMeKnn(trainingWordsBag,trainingRatings,3) # 3-nearest-neighbour classifier fitted on the training data
K3testPredLabels = trainedK3.predict(testWordsBag) # predict the test labels with the K=3 classifier

# print precision, recall and f1-score for each rating label (K-NN, K=3)
print('K-NN (K=3) RESULTS\n-------------------------------------------------------------------------------')
print(classification_report(testRatings,K3testPredLabels))

K-NN (K=3) RESULTS
-------------------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.30      0.41      0.35      3469
           2       0.16      0.05      0.08      2087
           3       0.79      0.79      0.79     14443

    accuracy                           0.65     19999
   macro avg       0.41      0.42      0.40     19999
weighted avg       0.64      0.65      0.64     19999



In [8]:
# K-NN (K=15): train and evaluate
trainedK15 = giveMeKnn(trainingWordsBag,trainingRatings,15) # 15-nearest-neighbour classifier fitted on the training data
K15testPredLabels = trainedK15.predict(testWordsBag) # predict the test labels with the K=15 classifier

# print precision, recall and f1-score for each rating label (K-NN, K=15)
print('K-NN (K=15) RESULTS\n-------------------------------------------------------------------------------')
print(classification_report(testRatings,K15testPredLabels))

K-NN (K=15) RESULTS
-------------------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.42      0.14      0.21      3469
           2       0.23      0.03      0.05      2087
           3       0.74      0.96      0.84     14443

    accuracy                           0.72     19999
   macro avg       0.46      0.37      0.37     19999
weighted avg       0.63      0.72      0.65     19999



In [9]:
# K-NN (K=20): train and evaluate
trainedK20 = giveMeKnn(trainingWordsBag,trainingRatings,20) # 20-nearest-neighbour classifier fitted on the training data
K20testPredLabels = trainedK20.predict(testWordsBag) # predict the test labels with the K=20 classifier

# print precision, recall and f1-score for each rating label (K-NN, K=20)
print('K-NN (K=20) RESULTS\n-------------------------------------------------------------------------------')
print(classification_report(testRatings,K20testPredLabels))

K-NN (K=20) RESULTS
-------------------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.46      0.09      0.16      3469
           2       0.27      0.02      0.03      2087
           3       0.74      0.98      0.84     14443

    accuracy                           0.72     19999
   macro avg       0.49      0.36      0.34     19999
weighted avg       0.64      0.72      0.64     19999



In [10]:
## Linear SVM: train and evaluate
lsvm = giveMeLSVM(trainingWordsBag, trainingRatings) # linear SVM fitted on the training data
LSVMtestPredLabels = lsvm.predict(testWordsBag) # predict the test labels with the trained linear SVM

# print precision, recall and f1-score for each rating label (linear SVM)
print('SVM RESULTS\n-------------------------------------------------------------------------------')
print(classification_report(testRatings,LSVMtestPredLabels))

SVM RESULTS
-------------------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.59      0.54      0.56      3469
           2       0.24      0.20      0.22      2087
           3       0.85      0.88      0.86     14443

    accuracy                           0.75     19999
   macro avg       0.56      0.54      0.55     19999
weighted avg       0.74      0.75      0.74     19999





In [11]:
## Non-linear SVM (RBF kernel): train and evaluate
svm2 = giveMeNLSVM(trainingWordsBag, trainingRatings) # RBF-kernel SVM fitted on the training data
NLSVMtestPredLabels = svm2.predict(testWordsBag) # predict the test labels with the trained non-linear SVM

# print precision, recall and f1-score for each rating label (non-linear SVM)
print('Non-Linear SVM RESULTS\n-------------------------------------------------------------------------------')
print(classification_report(testRatings,NLSVMtestPredLabels))

Non-Linear SVM RESULTS
-------------------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.72      0.42      0.53      3469
           2       0.57      0.09      0.16      2087
           3       0.80      0.98      0.88     14443

    accuracy                           0.79     19999
   macro avg       0.70      0.50      0.52     19999
weighted avg       0.76      0.79      0.74     19999



In [12]:
# Hard-voting ensemble built from four of the classifiers trained above
# (decision tree, K-NN with K=15 and K=20, linear SVM)
# Create the voting classifier
voting_classifier = VotingClassifier(estimators=[
    ('tree', trainedTree),
    ('knn15', trainedK15),
    ('knn20', trainedK20),
    ('lsvm', lsvm)], voting='hard')

# Train the voting classifier on the training data
# NOTE(review): VotingClassifier.fit is documented to fit clones of the supplied
# estimators, so the four models are presumably retrained here rather than
# reused from the earlier cells - confirm against the scikit-learn docs.
voting_classifier.fit(trainingWordsBag, trainingRatings)

# Make predictions on the test data
VtestPredLabels = voting_classifier.predict(testWordsBag)

# print precision, recall and f1-score for each rating label (voting ensemble)
print('Voting RESULTS\n-------------------------------------------------------------------------------')
print(classification_report(testRatings,VtestPredLabels))



Voting RESULTS
-------------------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.61      0.35      0.45      3469
           2       0.33      0.07      0.12      2087
           3       0.78      0.95      0.86     14443

    accuracy                           0.76     19999
   macro avg       0.57      0.46      0.48     19999
weighted avg       0.71      0.76      0.71     19999



In [13]:
# RANDOM FOREST: train and evaluate
trainedForest = giveMeForest(trainingWordsBag,trainingRatings,20) # random forest with 20 estimators fitted on the training data
RFtestPredLabels = trainedForest.predict(testWordsBag) # predict the test labels with the trained random forest

# print precision, recall and f1-score for each rating label (random forest)
print('RANDOM FOREST RESULTS\n-------------------------------------------------------------------------------')
print(classification_report(testRatings,RFtestPredLabels))

RANDOM FOREST RESULTS
-------------------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.67      0.37      0.48      3469
           2       0.44      0.07      0.13      2087
           3       0.79      0.97      0.87     14443

    accuracy                           0.77     19999
   macro avg       0.63      0.47      0.49     19999
weighted avg       0.73      0.77      0.73     19999



In [14]:
## Aggregating the results to find out which apps have best performance and which company has those apps

def printDevResults(predictedLabels, appFilter=None):
    """Print, per developer, how many test reviews a filter selected.

    The predicted labels are paired with ``testIdentifiers``, reduced to the
    identifiers chosen by ``appFilter`` and tallied per developer with
    ``giveMeBestDev``.

    Args:
        predictedLabels: predicted label of every test review, in the same
            order as ``testIdentifiers``.
        appFilter: callable ``(labels, identifiers) -> identifiers`` deciding
            which reviews are counted. Defaults to ``giveMeGoodApps``
            (predicted label 3). Pass ``giveMeBadApps`` to count the reviews
            predicted as label 1 instead; this replaces the commented-out
            duplicate of this function that differed only in the filter.
    """
    # Resolved at call time so the definition does not depend on cell order.
    if appFilter is None:
        appFilter = giveMeGoodApps
    selectedApps = appFilter(predictedLabels,testIdentifiers) # identifiers of the reviews kept by the filter
    a1,a2,a3 = giveMeBestDev(selectedApps) # number of selected reviews per developer
    print('AAD 1:', a1, '\nAAD 2:', a2, '\nAAD 3:', a3, '\n_______________________________\n')

# Report the per-developer counts for every model, in the order the models
# were trained above (heading first, then the counts).
for modelHeading, modelPredictions in (
    ('DECISION TREE', DTtestPredLabels),
    ('K-NN (K=15)', K15testPredLabels),
    ('K-NN (K=20)', K20testPredLabels),
    ('Linear SVM', LSVMtestPredLabels),
    ('Non-Linear SVM', NLSVMtestPredLabels),
    ('Voting Ensemble', VtestPredLabels),
    ('Random Forest', RFtestPredLabels),
):
    print(modelHeading)
    printDevResults(modelPredictions)


DECISION TREE
AAD 1: 89 
AAD 2: 186 
AAD 3: 88 
_______________________________

K-NN (K=15)
AAD 1: 102 
AAD 2: 263 
AAD 3: 139 
_______________________________

K-NN (K=20)
AAD 1: 104 
AAD 2: 271 
AAD 3: 152 
_______________________________

Linear SVM
AAD 1: 97 
AAD 2: 155 
AAD 3: 75 
_______________________________

Non-Linear SVM
AAD 1: 106 
AAD 2: 223 
AAD 3: 115 
_______________________________

Voting Ensemble
AAD 1: 102 
AAD 2: 222 
AAD 3: 114 
_______________________________

Random Forest
AAD 1: 105 
AAD 2: 219 
AAD 3: 112 
_______________________________

