<h2>Define class</h2>

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [3]:
class Review:
    """One review together with its numeric score and sentiment label.

    The constructor stores its four arguments unchanged as the attributes
    ``index``, ``text``, ``score`` and ``sentiment``; the ``show*`` methods
    are simple accessors for three of them.
    """

    def __init__(self, index, text, score, sentiment):
        self.index, self.text = index, text
        self.score, self.sentiment = score, sentiment

    def showText(self):
        """Return the stored review text."""
        return self.text

    def showScore(self):
        """Return the stored score."""
        return self.score

    def showSentiment(self):
        """Return the stored sentiment label."""
        return self.sentiment

<h3>Generate file list</h3>

In [4]:
import pandas as pd
import os
import glob
import random
    
def mySort(n):
    """Sort key: the integer written before the first underscore of ``n``.

    If ``n`` contains no underscore the whole string is converted.
    Raises ValueError when that leading part is not an integer literal.
    """
    leading_field, _, _ = n.partition('_')
    return int(leading_field)

def load_review(path, start, end, sentiment):
    """Load a contiguous run of review files from ``path`` as Review objects.

    Every ``*.txt`` file directly inside ``path`` is expected to be named
    ``<index>_<score>.txt`` with integer fields. The files are ordered by
    their integer index and ``end`` of them, beginning at position ``start``
    of that ordering, are read.

    Args:
        path: Directory that contains the review text files.
        start: Zero-based position (in index order) of the first file to load.
        end: Number of files to load. Despite its name this is a count, not
            an end position: the slice taken is ``[start:start + end]``.
        sentiment: Label stored on every Review that is created.

    Returns:
        A list of Review objects in ascending index order (empty when the
        slice selects no files).

    Raises:
        ValueError: A file name field is not an integer.
        IndexError: A selected file name has no ``_<score>`` part.
    """
    # Search inside `path` directly rather than os.chdir(path): changing the
    # working directory is a process-wide side effect, and it made relative
    # `path` values fail when the files were reopened below.
    # glob.escape keeps characters such as '[' in `path` from being treated
    # as wildcards.
    pattern = os.path.join(glob.escape(path), '*.txt')
    stems = [
        os.path.splitext(os.path.basename(txt_path))[0]
        for txt_path in glob.glob(pattern)
    ]
    stems.sort(key=mySort)

    reviews = []
    for stem in stems[start:start + end]:
        fields = stem.split('_')
        index = int(fields[0])
        score = int(fields[1])
        # `with` closes the file even if reading fails. The explicit encoding
        # removes the dependence on the platform locale.
        # NOTE(review): assumes the review files are UTF-8 - confirm.
        with open(os.path.join(path, stem + '.txt'), 'r', encoding='utf-8') as f:
            text = f.read()
        reviews.append(Review(index, text, score, sentiment))
    return reviews


In [5]:
# Root folder that holds the train/ and test/ review directories. Set the
# REVIEW_DATA_DIR environment variable to run the notebook against a copy of
# the data stored elsewhere; when it is unset the original location is used.
DATA_DIR = os.environ.get(
    'REVIEW_DATA_DIR',
    '/Users/huanghui/Documents/北科相關/Big Data Analysis/Homework/Homework/Final project/Code',
)

# Each split has one sub-folder per sentiment class.
train_pos = os.path.join(DATA_DIR, 'train', 'pos')
train_neg = os.path.join(DATA_DIR, 'train', 'neg')
test_pos = os.path.join(DATA_DIR, 'test', 'pos')
test_neg = os.path.join(DATA_DIR, 'test', 'neg')

def gen_train_test(dataNum):
    """Build shuffled training and test sets with ``dataNum`` reviews per class.

    For each split the first ``dataNum`` positive and the first ``dataNum``
    negative reviews are loaded, concatenated (positives first) and then
    shuffled in place with the module-level ``random`` generator, so the
    resulting order depends on the global random state.

    Returns:
        A ``(train_reviews, test_reviews)`` pair of lists.
    """
    train_reviews = (
        load_review(train_pos, 0, dataNum, 'Positive')
        + load_review(train_neg, 0, dataNum, 'Negative')
    )
    test_reviews = (
        load_review(test_pos, 0, dataNum, 'Positive')
        + load_review(test_neg, 0, dataNum, 'Negative')
    )

    # Mix the two classes; training set first, then test set.
    for review_set in (train_reviews, test_reviews):
        random.shuffle(review_set)

    return train_reviews, test_reviews

---

<h1>Prepare data</h1>

In [6]:

def get_vector(Num):
    """Load ``Num`` reviews per class and turn their text into TF-IDF vectors.

    The vectorizer is fitted on the training text only and then reused to
    transform the test text.

    Returns:
        ``(train_text_vector, train_sentiment, test_text_vector,
        test_sentiment)`` where the sentiment entries are lists of labels
        aligned with the rows of the corresponding vector result.
    """
    def split_fields(reviews):
        # Separate the review objects into parallel text / label lists.
        texts, labels = [], []
        for review in reviews:
            texts.append(review.text)
            labels.append(review.sentiment)
        return texts, labels

    train_reviews, test_reviews = gen_train_test(Num)
    train_text, train_sentiment = split_fields(train_reviews)
    test_text, test_sentiment = split_fields(test_reviews)

    tfidf = TfidfVectorizer()
    train_text_vector = tfidf.fit_transform(train_text)
    test_text_vector = tfidf.transform(test_text)
    return train_text_vector, train_sentiment, test_text_vector, test_sentiment


In [7]:
def modeling(train_text_vector, train_sentiment):
    """Fit the three baseline classifiers on the training data.

    Args:
        train_text_vector: Feature matrix of the training reviews.
        train_sentiment: Labels aligned with the rows of ``train_text_vector``.

    Returns:
        ``(clf_svm, clf_dec, clf_log)``: the fitted linear-kernel SVC,
        decision tree (``random_state=0``) and logistic regression, in the
        same order that ``get_f1`` takes its classifier arguments.
    """
    clf_svm = svm.SVC(kernel='linear')
    clf_svm.fit(train_text_vector, train_sentiment)

    clf_dec = DecisionTreeClassifier(random_state=0)
    clf_dec.fit(train_text_vector, train_sentiment)

    clf_log = LogisticRegression()
    clf_log.fit(train_text_vector, train_sentiment)

    # Previously the fitted models were dropped when the function returned,
    # which made the training work unusable by any caller.
    return clf_svm, clf_dec, clf_log

---

<h1>Evaluation</h1>

<h3>F1 score</h3>

In [8]:
def get_f1(test_sentiment, test_text_vector, clf_svm, clf_dec, clf_log):
    """Score three fitted classifiers on the test set with per-class F1.

    Each classifier predicts on ``test_text_vector`` and is scored against
    ``test_sentiment`` with ``average=None`` and the label order
    ``['Positive', 'Negative']``.

    Returns:
        ``(f1_svm, f1_dec, f1_log)``, one ``f1_score`` result per classifier
        in argument order.
    """
    def per_class_f1(clf):
        predicted = clf.predict(test_text_vector)
        return f1_score(test_sentiment, predicted, average=None, labels=['Positive', 'Negative'])

    return tuple(per_class_f1(clf) for clf in (clf_svm, clf_dec, clf_log))

In [9]:
def run_module(Num):
    """Train the three classifiers on ``Num`` reviews per class and score them.

    The data comes from ``get_vector(Num)``. The models are a polynomial-kernel
    SVC (``coef0=3.0``, ``degree=7``), a decision tree (``random_state=0``)
    and an L2-penalised logistic regression using the ``saga`` solver.

    Returns:
        ``(f1_svm, f1_dec, f1_log)`` as produced by ``get_f1``.
    """
    X_train, y_train, X_test, y_test = get_vector(Num)

    classifiers = (
        svm.SVC(kernel='poly', coef0=3.0, degree=7),
        DecisionTreeClassifier(random_state=0),
        LogisticRegression(solver='saga', penalty='l2'),
    )
    # Fit in the order SVM, decision tree, logistic regression.
    for classifier in classifiers:
        classifier.fit(X_train, y_train)

    svm_f1, tree_f1, logreg_f1 = get_f1(y_test, X_test, *classifiers)
    return svm_f1, tree_f1, logreg_f1

def main():
    """Ask for a sample count, run the experiment once and print the SVM F1.

    The number typed at the prompt is passed to ``run_module`` as the number
    of reviews per class. Only the SVM scores (first two entries of its F1
    result) are printed; the decision-tree and logistic-regression scores are
    computed but not shown.
    """
    sample_count = int(input('Please input interval: '))

    # A disabled earlier variant swept the sizes N, 2N, ... up to 4000 and
    # collected the scores of all three models; the current version performs
    # a single run of size N.
    svm_scores, _tree_scores, _logreg_scores = run_module(sample_count)

    svm_result = [[svm_scores[0], svm_scores[1]]]
    print('SVM: ',svm_result)
    
# Run the experiment: prompts on stdin for the number of reviews per class,
# then trains the models and prints the SVM F1 scores.
main()

Please input interval: 10000
SVM:  [[0.879375472173256, 0.8811119384462646]]


---

In [31]:
from sklearn.model_selection import GridSearchCV

# Number of reviews per class used for the hyper-parameter search. The grid
# below means 3 kernels x 5 C values x 5 folds = 75 SVC fits (plus the final
# refit), so this is kept small by default; raise it for a more thorough search.
GRID_SEARCH_SAMPLES = 1000

# Build the training data explicitly. No earlier cell defines
# train_text_vector / train_sentiment at notebook scope (they are locals of
# get_vector / run_module), so this cell previously only worked with
# variables left over in the kernel.
train_text_vector, train_sentiment, test_text_vector, test_sentiment = get_vector(GRID_SEARCH_SAMPLES)

parameters = {'kernel':('linear', 'poly', 'rbf'), 'C':(1,2,4,8,16)}

svc = svm.SVC()
# n_jobs=-1 runs the independent fits on all available CPU cores.
clf = GridSearchCV(svc, parameters, cv=5, n_jobs=-1)
clf.fit(train_text_vector,train_sentiment)

KeyboardInterrupt: 