In [1]:
from sklearn import svm
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import spacy
from nltk.stem import PorterStemmer
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
import sklearn_crfsuite
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np

nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/inespissarra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/inespissarra/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
##################################################################################
#                             Read the input database
##################################################################################

train = pd.read_csv('../train.txt', sep='\t', header=None)
train.columns = ['Class', 'Text']

In [3]:
##################################################################################
#                                Preprocessing
##################################################################################

stop = stopwords.words('english')
including = ['no', 'nor', 'not', 'but', 'against', 'only']
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    #tokenize
    words = word_tokenize(text)
    i = 0
    text = ""
    # transforming <word>n't in <word> not from words
    while i < len(words):
        # remove punctuation from words
        words[i] = ''.join([char for char in words[i] if char not in string.punctuation])
        # remove stopwords from words
        if words[i] in stop and words[i] not in including:
            words[i] = ""
        #else:
            # lemmatizing and Stemming from words
            words[i] = stemmer.stem(lemmatizer.lemmatize(words[i]))
        if text != "":
            text = text + " "
        if words[i]!="":
            text = text + words[i]
        i = i+1
    return text

train['Text'] = train['Text'].apply(preprocess)

print("after preprocessing")

after preprocessing


In [10]:
##################################################################################
#         Extracts features (and convert them to sklearn-crfsuite format)
##################################################################################
negation = ["not", "no", "never", "neither", "nor", "none", "nobody", "nowhere", \
            "nothing", "hardly", "scarcely", "barely", "doesn't", "isn't", "wasn't", \
                "shouldn't", "wouldn't", "couldn't", "won't", "can't", "don't", "didn't", \
                    "aren't", "ain't", "without"]
sentiment = SentimentIntensityAnalyzer()
def review2features(review):
    tokens = nltk.word_tokenize(review)
    pos = 0
    neg = 0
    for i in range(len(tokens)):
        pol = sentiment.polarity_scores(tokens[i])
        if ((i-1) >= 0 and tokens[i-1] in negation) and \
            (((i-2)>=0 and tokens[i-2]!="if") or (i-2)<0):                                  # ver melhor, neg e pos
            pol['compound'] = 0 - pol['compound']
        if pol['compound'] > 0:
            pos = pos + pol['compound']
        else:
            neg = neg - pol['compound']
    pos = pos / len(tokens)
    neg = neg / len(tokens)
    polarity = pos - neg
    features = [polarity]
    return features

In [11]:
##################################################################################
#               Creates different vectors (features, tags and tokens)
##################################################################################
X = [review2features(review) for review in train['Text']]

y = train['Class']

print("after features")

after features


In [12]:
##################################################################################
#                                      TF-IDF
##################################################################################

tfidf = TfidfVectorizer(use_idf=True, ngram_range=(1, 3), sublinear_tf=True, max_features=20000)
tfidf_matrix = tfidf.fit_transform(train['Text']).toarray()

print("after tfidf")


after tfidf


In [13]:
print(tfidf_matrix)
print(np.array(X))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0.05153182 0.02363182 0.0279     0.11363636]
 [0.03891786 0.         0.03891786 0.03571429]
 [0.03290685 0.02538904 0.00751781 0.04109589]
 ...
 [0.02353889 0.01869222 0.00484667 0.06666667]
 [0.04083423 0.02494955 0.01588468 0.08108108]
 [0.09916429 0.         0.09916429 0.10714286]]


In [14]:
##################################################################################
#                                     Combine
##################################################################################

combined_features = np.hstack((tfidf_matrix, np.array(X)))

print("after combine")

after combine


In [21]:
##################################################################################
#                                     Split
##################################################################################

X_train, \
    X_test, \
        y_train, \
            y_test \
                = train_test_split(combined_features, y, test_size=0.15, random_state=40)

print("after split")

after split


In [22]:
##################################################################################
#                                     SVM
##################################################################################
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(y_pred)

# ##################################################################################
# #                                   Results
# ##################################################################################

# print("Classification Report:")

# print(classification_report(flat_y_test, flat_y_pred))

# accuracy = accuracy_score(flat_y_test, flat_y_pred)

# print("Accuracy: ", accuracy)

['TRUTHFULNEGATIVE' 'DECEPTIVENEGATIVE' 'TRUTHFULPOSITIVE'
 'TRUTHFULNEGATIVE' 'DECEPTIVENEGATIVE' 'DECEPTIVEPOSITIVE'
 'DECEPTIVEPOSITIVE' 'DECEPTIVEPOSITIVE' 'TRUTHFULPOSITIVE'
 'DECEPTIVENEGATIVE' 'TRUTHFULPOSITIVE' 'TRUTHFULNEGATIVE'
 'TRUTHFULPOSITIVE' 'TRUTHFULNEGATIVE' 'TRUTHFULPOSITIVE'
 'DECEPTIVENEGATIVE' 'DECEPTIVENEGATIVE' 'DECEPTIVEPOSITIVE'
 'TRUTHFULNEGATIVE' 'DECEPTIVEPOSITIVE' 'TRUTHFULPOSITIVE'
 'TRUTHFULNEGATIVE' 'DECEPTIVENEGATIVE' 'TRUTHFULPOSITIVE'
 'TRUTHFULNEGATIVE' 'DECEPTIVENEGATIVE' 'DECEPTIVENEGATIVE'
 'DECEPTIVENEGATIVE' 'DECEPTIVENEGATIVE' 'DECEPTIVENEGATIVE'
 'TRUTHFULPOSITIVE' 'TRUTHFULNEGATIVE' 'DECEPTIVENEGATIVE'
 'DECEPTIVENEGATIVE' 'TRUTHFULPOSITIVE' 'TRUTHFULNEGATIVE'
 'DECEPTIVENEGATIVE' 'DECEPTIVENEGATIVE' 'DECEPTIVENEGATIVE'
 'TRUTHFULNEGATIVE' 'DECEPTIVEPOSITIVE' 'DECEPTIVEPOSITIVE'
 'TRUTHFULNEGATIVE' 'TRUTHFULPOSITIVE' 'DECEPTIVEPOSITIVE'
 'DECEPTIVEPOSITIVE' 'DECEPTIVEPOSITIVE' 'DECEPTIVEPOSITIVE'
 'DECEPTIVEPOSITIVE' 'TRUTHFULPOSITIVE' 'DECE

In [23]:
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: ", accuracy)

# Accuracy: 0.8523809523809524 ++?

Accuracy:  0.8523809523809524
