In [16]:
import operator
import os
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import sklearn
import sklearn.svm
from matplotlib.pyplot import plot
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LinearRegression
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# Fetch the NLTK resources used by this notebook. Packages that are already
# installed are reported as up to date (see the cell output).
nltk.download('stopwords') # stop-word lists read via nltk.corpus.stopwords
nltk.download('punkt') # tokenizer data used by nltk.tokenize
nltk.download('wordnet') # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4') # presumably required alongside wordnet by newer NLTK releases - confirm

[nltk_data] Downloading package stopwords to C:\Users\Nathan
[nltk_data]     Hua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Nathan
[nltk_data]     Hua\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nathan
[nltk_data]     Hua\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Nathan
[nltk_data]     Hua\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Data Preprocessing

In [17]:
# Label mapping declaration: label key (as a string) -> sentiment name.
SENTIMENTAL_MAP = dict(
    zip(
        ("0", "1", "2"),
        ("negative", "neutral", "positive"),
    )
)

In [18]:
# Show the label keys, one per line (iterating a dict yields its keys).
for label_key in SENTIMENTAL_MAP:
    print(label_key)

0
1
2


In [19]:
# read data from txt
def read_data(set_name, data_dir="data"):
    """Load the texts and labels of one dataset split.

    Reads ``<data_dir>/<set_name>_text.txt`` and
    ``<data_dir>/<set_name>_labels.txt`` as UTF-8, one record per line.

    Args:
        set_name: Prefix of the split's file names, e.g. ``"train"``.
        data_dir: Directory that contains the files. Defaults to ``"data"``.

    Returns:
        A tuple ``(texts, labels)`` of DataFrames. ``texts`` has a single
        ``"text"`` column holding every line exactly as read (trailing
        newline included). ``labels`` has a single ``"label"`` column holding
        the first character of every label line as a string.

    Raises:
        OSError: If either file cannot be opened.
    """
    text_path = os.path.join(data_dir, set_name + "_text.txt")
    label_path = os.path.join(data_dir, set_name + "_labels.txt")

    # Context managers close both handles even if reading fails; the
    # previous version left them open.
    with open(text_path, "r", encoding="utf8") as text_file:
        texts = text_file.readlines()
    with open(label_path, "r", encoding="utf8") as label_file:
        # Only the first character of each line is the label; this also
        # drops the trailing newline.
        labels = [line[0] for line in label_file]

    return pd.DataFrame(texts, columns=["text"]), pd.DataFrame(labels, columns=["label"])

In [20]:
# Load the train, validation and test splits (in that order); each call
# returns a (texts, labels) pair of DataFrames.
(train_set_x, train_set_y), (val_set_x, val_set_y), (test_set_x, test_set_y) = map(
    read_data, ("train", "val", "test")
)

In [21]:
# Print the (rows, columns) shape of every text/label frame, split by split.
for frame in (
    train_set_x, train_set_y,
    val_set_x, val_set_y,
    test_set_x, test_set_y,
):
    print(frame.shape)

(45615, 1)
(45615, 1)
(2000, 1)
(2000, 1)
(12284, 1)
(12284, 1)


In [22]:
# Inspect one raw validation text (row label 7).
print(val_set_x.at[7, "text"])

Omg this show is so predictable even for the 3rd ep. Rui En\u2019s ex boyfriend was framed for murder probably\u002c by the rich guy. 



In [23]:
# TODO: remove meaningless tokens, such as "@user" and "#sometagname"
# TODO: remove http links, such as "https://t.co/4fPkSVlSDl"
# TODO: convert unicode escape sequences to text, such as "\u2019" => "'"
# Quick check: the escape \u002c in a Python string literal is a comma.
test = "\u002c"
print(test)

,


## Initial feature: training, validation, and test process
### TODO: Optimize

In [24]:
def get_vector(vocab, text):
    """Encode ``text`` as a vector of term counts over ``vocab``.

    The text is split into sentences and then into tokens with NLTK. Every
    token is passed through the notebook-level ``lemmatizer`` and then
    lower-cased before it is counted.

    Note:
        ``lemmatizer`` is a global that must be defined before this function
        is first called.

    Args:
        vocab: Sequence of vocabulary words; position ``i`` of the result
            corresponds to ``vocab[i]``.
        text: Raw document string.

    Returns:
        1-D float numpy array of length ``len(vocab)`` whose entry ``i`` is
        the number of times ``vocab[i]`` occurs among the normalised tokens.
    """
    # Count all normalised tokens in a single pass instead of re-scanning the
    # token list once per vocabulary word.
    counts = Counter(
        lemmatizer.lemmatize(token).lower()
        for sentence in nltk.tokenize.sent_tokenize(text)
        for token in nltk.tokenize.word_tokenize(sentence)
    )
    vector = np.zeros(len(vocab))
    for i, word in enumerate(vocab):
        vector[i] = counts[word]  # a Counter yields 0 for absent words
    return vector

In [25]:
# Initial feature: the n most frequent (non-stopword) words of each label
# class, merged into one vocabulary without duplicates.

# Define stopwords: NLTK's English list plus punctuation / tokenizer artefacts.
additional_stopwords = [".", ",", "'s", "``", "''", "'", "n't", "%", "-", "$", "(", ")", ":", ";", "@", "&", "'m", "user", "#", "!", "?", "..."]
stopwords = set(nltk.corpus.stopwords.words("english")).union(additional_stopwords)

# Initialize lemmatizer (also used as a global by get_vector)
lemmatizer = nltk.stem.WordNetLemmatizer()

# Get Vocabulary
vocabulary = []
n = 100  # number of top words taken from each label class
for label in SENTIMENTAL_MAP.keys():
    # Select all training texts of this class with one boolean mask instead
    # of scanning the frame row by row.
    class_texts = train_set_x.loc[train_set_y["label"] == label, "text"]

    # Frequency of every normalised, non-stopword token in this class.
    # NOTE(review): tokens are lemmatised before being lower-cased, so
    # capitalised tokens may not be reduced to their lemma - confirm whether
    # that is intended; get_vector must use the same normalisation.
    word_counts = Counter()
    for text in class_texts:
        for sentence in nltk.tokenize.sent_tokenize(text):
            for token in nltk.tokenize.word_tokenize(sentence):
                word = lemmatizer.lemmatize(token).lower()
                if word not in stopwords:
                    word_counts[word] += 1

    # most_common(n) returns at most n (word, count) pairs by descending
    # count; equal counts keep first-seen order, like the stable sort it replaces.
    for word, _count in word_counts.most_common(n):
        if word not in vocabulary:
            vocabulary.append(word)

In [26]:
# Build the training matrix: one count vector per training text.
x = [get_vector(vocabulary, text) for text in train_set_x["text"]]
y = train_set_y["label"].tolist()

# Create the linear-kernel SVM and fit it on the training vectors.
svm_clf_category = sklearn.svm.SVC(kernel="linear", gamma='auto')
svm_clf_category.fit(np.asarray(x), np.asarray(y))

# Vectorise the validation texts the same way and predict their labels.
x = [get_vector(vocabulary, text) for text in val_set_x["text"]]
predictions = svm_clf_category.predict(x)
y = np.asarray(val_set_y["label"].tolist())

In [27]:
# Macro-averaged precision, recall and F1, then plain accuracy, of
# `predictions` against the true labels `y`.
for macro_metric in (precision_score, recall_score, f1_score):
    print(macro_metric(y, predictions, average='macro'))
print(accuracy_score(y, predictions))

0.5940554093980991
0.4559994623285762
0.44029333218226435
0.547


# TF-IDF

In [28]:
# Create the TF-IDF vectorizer and learn vocabulary + idf from the training texts
tfidf_vector = TfidfVectorizer().fit(train_set_x["text"])
# Turn the training and validation texts into document-term matrices
tfidf_train_x = tfidf_vector.transform(train_set_x["text"])
tfidf_val_x = tfidf_vector.transform(val_set_x["text"])

# Re-fit the existing classifier object on the TF-IDF features
svm_clf_category.fit(tfidf_train_x, train_set_y["label"].to_numpy())
# Evaluate on the validation set
tfidf_val_y = val_set_y.to_numpy()
predictions = svm_clf_category.predict(tfidf_val_x)

# Macro precision / recall / F1, then accuracy
for macro_metric in (precision_score, recall_score, f1_score):
    print(macro_metric(tfidf_val_y, predictions, average='macro'))
print(accuracy_score(tfidf_val_y, predictions))


0.6687085346069246
0.6222323738146524
0.6365109146172074
0.674


In [29]:
# Evaluate the TF-IDF classifier on the test split.
tfidf_test_y = test_set_y.to_numpy()
tfidf_test_x = tfidf_vector.transform(test_set_x["text"])
predictions = svm_clf_category.predict(tfidf_test_x)

# Macro precision / recall / F1, then accuracy
for macro_metric in (precision_score, recall_score, f1_score):
    print(macro_metric(tfidf_test_y, predictions, average='macro'))
print(accuracy_score(tfidf_test_y, predictions))

0.598364428981503
0.5654939929988184
0.5680575073372092
0.5944317811787692
