In [34]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
import numpy as np
import csv
import sqlite3
import pandas as pd

# Module-level state shared between the notebook cells below.
# Each name starts as an empty list; compute_dataset() fills matrix, tml,
# namelist and m.

# TF-IDF term-document matrix fitted on the full corpus (one row per hotel)
matrix = []
# TF-IDF term-document matrix for test data (not assigned anywhere in the
# cells shown here)
test_matrix = []
# term (feature) names, aligned with the columns of the TF-IDF matrix
tml = []
# hotel names, in the same order as the rows of the matrix
namelist = []
# one text string per hotel (the raw documents fed to the vectorizer)
m = []


def compute_dataset():
    """Load the hotel corpus from ``pr_result.csv`` and build its TF-IDF matrix.

    The first CSV row is skipped as a header. For every following row,
    column 0 is taken as the hotel name and column 1 as that hotel's text.

    Side effects -- the following module-level globals are *rebound* (not
    appended to), so re-running the cell does not duplicate the corpus:
        m        -- list of per-hotel text strings
        namelist -- list of hotel names, aligned with ``m``
        matrix   -- sparse TF-IDF matrix fitted on ``m``
        tml      -- term names, aligned with the columns of ``matrix``
    """
    global matrix
    global namelist
    global tml
    global m
    # English stop words plus the domain word "hotel", which appears in
    # nearly every document and carries no signal.
    my_stop_words = text.ENGLISH_STOP_WORDS.union(["hotel"])

    # Raise the field-size limit BEFORE reading anything, so that it also
    # applies to the very first row consumed from the reader.
    csv.field_size_limit(100000000)

    documents = []
    hotels = []
    # `with` guarantees the file is closed even if parsing raises;
    # newline='' is what the csv module expects for correct quoting/newlines.
    with open('pr_result.csv', newline='') as csv_file:
        csv_reader_lines = csv.reader(csv_file)
        next(csv_reader_lines, None)  # skip header; tolerate an empty file
        for one_line in csv_reader_lines:
            if not one_line:
                # csv.reader yields [] for blank lines; nothing to index.
                continue
            hotels.append(one_line[0])
            documents.append(one_line[1])

    m = documents
    namelist = hotels

    # stop_words is passed as a list: newer scikit-learn releases validate
    # the parameter type and reject a frozenset.
    vectorizer = TfidfVectorizer(use_idf=True, stop_words=list(my_stop_words))
    matrix = vectorizer.fit_transform(m)

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn; support both so the notebook runs on either.
    if hasattr(vectorizer, "get_feature_names_out"):
        tml = list(vectorizer.get_feature_names_out())
    else:
        tml = vectorizer.get_feature_names()

#Top n tfidf value in row and return them with their corresponding term names in matrix
def top_tfidf_features(row, features, top_n=25):
    """Return the ``top_n`` largest values of ``row`` with their term names.

    Parameters:
        row      -- 1-D numeric array of TF-IDF weights.
        features -- sequence of term names, indexable by the positions of
                    ``row``.
        top_n    -- maximum number of entries to return.

    Returns:
        A DataFrame with columns ``feature`` and ``tfidf``, ordered by
        descending weight. If ``row`` is empty or ``top_n`` is 0 the result
        is an empty DataFrame that still has both columns.
    """
    # argsort is ascending; reverse it and keep the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor (rather than assigning
    # df.columns afterwards) also works when top_feats is empty, where a
    # later assignment would fail with a length-mismatch ValueError.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

# Highest-weighted tf-idf terms for a single hotel, i.e. one row of the matrix
def top_features_in_hotel(matrix, features, row_id, top_n=10):
    """Return the ``top_n`` strongest terms for the hotel at ``row_id``.

    The selected row of ``matrix`` is converted to a dense array via
    ``toarray()``, squeezed, and handed to ``top_tfidf_features`` together
    with ``features``.
    """
    dense_row = matrix[row_id].toarray()
    weights = np.squeeze(dense_row)
    return top_tfidf_features(weights, features, top_n)

# Terms that carry the highest average tf-idf weight over every hotel
def top_mean_features(matrix, features, min_tfidf=0.1, top_n=10):
    """Return the ``top_n`` terms ranked by mean tf-idf across all rows.

    Weights below ``min_tfidf`` are set to zero before averaging. The
    masking is applied to the dense array returned by ``matrix.toarray()``.
    """
    dense = matrix.toarray()
    too_small = dense < min_tfidf
    dense[too_small] = 0
    column_means = np.mean(dense, axis=0)
    return top_tfidf_features(column_means, features, top_n)

# Build the corpus-wide TF-IDF matrix (fills the globals matrix, tml,
# namelist and m), then show the top terms for the hotel in row 0 as the
# cell's output (top_n defaults to 10).
compute_dataset()
top_features_in_hotel(matrix, tml, 0)


Unnamed: 0,feature,tfidf
0,staff,0.348243
1,room,0.324585
2,nice,0.203826
3,park,0.202027
4,location,0.197383
5,great,0.195974
6,good,0.193384
7,building,0.183415
8,bed,0.182993
9,tram,0.178679


In [35]:
# Show the terms with the highest mean TF-IDF across all hotels, using the
# function's defaults (min_tfidf=0.1, top_n=10).
top_mean_features(matrix, tml)

Unnamed: 0,feature,tfidf
0,staff,0.322903
1,location,0.309481
2,room,0.216418
3,good,0.169889
4,great,0.16107
5,friendly,0.126987
6,helpful,0.111692
7,breakfast,0.105881
8,nice,0.094509
9,clean,0.081924


In [39]:
def compute_performance(n_negative=800, n_splits=10):
    """Cross-validate LinearSVC and MultinomialNB on the global corpus ``m``.

    Documents at positions ``[0, n_negative)`` of ``m`` are given label 0 and
    all remaining documents label 1. A stratified k-fold split is run; in
    every fold a fresh TF-IDF vectorizer is fitted on the training documents
    only and applied to the held-out documents, then both classifiers are
    trained and evaluated. Confusion matrices and correct-prediction counts
    are summed over the folds and printed.

    Parameters:
        n_negative -- number of leading documents in ``m`` labelled 0
                      (defaults to the previously hard-coded 800).
        n_splits   -- number of stratified folds (defaults to 10).

    Raises:
        ValueError -- if ``n_negative`` does not leave at least one document
                      in each class.
    """
    # Derive the corpus size from the data instead of hard-coding it, so the
    # labels always line up with ``m``.
    total = len(m)
    if not 0 < n_negative < total:
        raise ValueError(
            "n_negative must be between 1 and len(m) - 1; "
            "got n_negative=%d with len(m)=%d" % (n_negative, total)
        )

    labels = np.zeros(total)
    labels[n_negative:] = 1

    kf = StratifiedKFold(n_splits=n_splits)

    # Fixing the label set keeps every per-fold confusion matrix 2x2, so the
    # running sums below never hit a shape mismatch.
    class_ids = [0, 1]
    SvmCM = np.zeros((2, 2))  # Confusion matrix in SVM
    MNBCM = np.zeros((2, 2))  # Confusion matrix in MulNB
    totalSvm = 0
    totalMNB = 0

    for train_index, test_index in kf.split(m, labels):
        x_train2 = [m[i] for i in train_index]
        x_test2 = [m[i] for i in test_index]
        y_train2, y_test2 = labels[train_index], labels[test_index]

        # Fit the vectorizer on the training fold only, so no vocabulary or
        # IDF statistics leak in from the held-out documents.
        vectorizer2 = TfidfVectorizer(use_idf=True, stop_words='english')
        matrix2 = vectorizer2.fit_transform(x_train2)
        test_matrix2 = vectorizer2.transform(x_test2)

        model1 = LinearSVC()
        model2 = MultinomialNB()
        model1.fit(matrix2, y_train2)
        model2.fit(matrix2, y_train2)
        result1 = model1.predict(test_matrix2)
        result2 = model2.predict(test_matrix2)

        SvmCM = SvmCM + confusion_matrix(y_test2, result1, labels=class_ids)
        MNBCM = MNBCM + confusion_matrix(y_test2, result2, labels=class_ids)
        totalSvm = totalSvm + int(np.sum(y_test2 == result1))
        totalMNB = totalMNB + int(np.sum(y_test2 == result2))

    # Rows are the true class ids (0, 1); columns are the predicted class.
    df1 = pd.DataFrame(SvmCM, columns=['Negative', 'Positive'])
    df2 = pd.DataFrame(MNBCM, columns=['Negative', 'Positive'])

    print("SVM Confusion Matrix:")
    print(df1)
    print("MulNB Confusion Matrix:")
    print(df2)
    # Every document is held out exactly once, so accuracy is the number of
    # correct predictions over the corpus size.
    score1 = totalSvm / total
    score2 = totalMNB / total
    print("SVM Accuracy:")
    print(score1)
    print("MulNB Accuracy:")
    print(score2)

# Run the cross-validated SVM vs. Multinomial NB comparison; it reads the
# global corpus m and prints confusion matrices and accuracies.
compute_performance()

SVM Confusion Matrix:
   Negative  Positive
0     775.0      25.0
1      74.0     620.0
MulNB Confusion Matrix:
   Negative  Positive
0     787.0      13.0
1     137.0     557.0
SVM Accuracy:
0.933734939759
MulNB Accuracy:
0.899598393574
