In [73]:
import pandas as pd
import numpy as np
import sklearn
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import string

In [74]:
# nltk.download('stopwords')

### Data processing

In [75]:
df = pd.read_csv("data.csv", encoding='Latin1')

In [76]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [77]:
print(df.shape)

(5842, 2)


In [78]:
df.dropna(inplace=True) #removing all empty spaces

In [79]:
# Convert all sentences to lowercase
df['Sentence'] = [entry.lower() for entry in df['Sentence']]

mp = {'positive': 2, 'negative': 0, 'neutral': 1}
df['Sentiment'] = df['Sentiment'].map(mp)


In [80]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,the geosolutions technology will leverage bene...,2
1,"$esi on lows, down $1.50 to $2.50 bk a real po...",0
2,"for the last quarter of 2010 , componenta 's n...",2
3,according to the finnish-russian chamber of co...,1
4,the swedish buyout firm has sold its remaining...,1


In [81]:
sentences = [sen.split() for sen in df["Sentence"]]
print(sentences[0]) # like word_tokenize

['the', 'geosolutions', 'technology', 'will', 'leverage', 'benefon', "'s", 'gps', 'solutions', 'by', 'providing', 'location', 'based', 'search', 'technology', ',', 'a', 'communities', 'platform', ',', 'location', 'relevant', 'multimedia', 'content', 'and', 'a', 'new', 'and', 'powerful', 'commercial', 'model', '.']


### no_punctuation, stopWords, stemming 

In [82]:
# punc
for i in range(len(sentences)):
    sentences[i] = [word for word in sentences[i] if word.isalnum()]
print(sentences[0])

['the', 'geosolutions', 'technology', 'will', 'leverage', 'benefon', 'gps', 'solutions', 'by', 'providing', 'location', 'based', 'search', 'technology', 'a', 'communities', 'platform', 'location', 'relevant', 'multimedia', 'content', 'and', 'a', 'new', 'and', 'powerful', 'commercial', 'model']


In [83]:
# stopWords
stop_words = set(stopwords.words('english'))

for i in range(len(sentences)):
    sentences[i] = [word for word in sentences[i] if word not in stop_words]
print(sentences[0])
# Output: don't have stop words in sentence[i] any more

['geosolutions', 'technology', 'leverage', 'benefon', 'gps', 'solutions', 'providing', 'location', 'based', 'search', 'technology', 'communities', 'platform', 'location', 'relevant', 'multimedia', 'content', 'new', 'powerful', 'commercial', 'model']


In [84]:
stemmer = PorterStemmer()

for i in range(len(sentences)):
    sentences[i] = [stemmer.stem(word) for word in sentences[i]]

print(sentences[0])
# Output: running to run, location to locat

['geosolut', 'technolog', 'leverag', 'benefon', 'gp', 'solut', 'provid', 'locat', 'base', 'search', 'technolog', 'commun', 'platform', 'locat', 'relev', 'multimedia', 'content', 'new', 'power', 'commerci', 'model']


In [85]:
processed_sentences = [' '.join(sentence) for sentence in sentences]

# Target variable
y = df['Sentiment'].values  

X_train, X_test, y_train, y_test = train_test_split(processed_sentences, y, test_size=0.2, random_state=42)

# Vectorize the text data using TfidfVectorizer
tfidf = TfidfVectorizer(max_features=15000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.toarray())


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [116]:
# Train the SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 71.34%

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.15      0.22       175
           1       0.71      0.89      0.79       622
           2       0.79      0.68      0.73       372

    accuracy                           0.71      1169
   macro avg       0.64      0.57      0.58      1169
weighted avg       0.69      0.71      0.68      1169



In [115]:
y_pred = svm_model.predict(X_train_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_train, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_train, y_pred))

Accuracy: 84.25%

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.39      0.54       685
           1       0.80      0.97      0.88      2508
           2       0.93      0.84      0.88      1480

    accuracy                           0.84      4673
   macro avg       0.87      0.73      0.77      4673
weighted avg       0.85      0.84      0.83      4673



In [87]:
from sklearn.linear_model import LogisticRegression

# Train a Logistic Regression model
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = log_reg_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 70.40%

Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.14      0.22       175
           1       0.70      0.89      0.78       622
           2       0.76      0.66      0.70       372

    accuracy                           0.70      1169
   macro avg       0.64      0.56      0.57      1169
weighted avg       0.68      0.70      0.67      1169



In [88]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = rf_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Random Forest Accuracy: 66.04%

Classification Report:
              precision    recall  f1-score   support

           0       0.24      0.16      0.19       175
           1       0.70      0.77      0.74       622
           2       0.71      0.71      0.71       372

    accuracy                           0.66      1169
   macro avg       0.55      0.55      0.55      1169
weighted avg       0.64      0.66      0.65      1169



In [89]:
from sklearn.naive_bayes import MultinomialNB

# Train a Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = nb_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Naive Bayes Accuracy: 66.81%

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.05      0.09       175
           1       0.65      0.96      0.77       622
           2       0.75      0.47      0.57       372

    accuracy                           0.67      1169
   macro avg       0.73      0.49      0.48      1169
weighted avg       0.70      0.67      0.61      1169



In [90]:
from sklearn.neighbors import KNeighborsClassifier

# Train a K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=70)
knn_model.fit(X_train_tfidf, y_train)

# Make y_pred
y_pred = knn_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


KNN Accuracy: 65.53%

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.03      0.07       175
           1       0.65      0.94      0.77       622
           2       0.68      0.48      0.56       372

    accuracy                           0.66      1169
   macro avg       0.69      0.48      0.46      1169
weighted avg       0.67      0.66      0.60      1169



In [125]:
def convert_to_dense_if_sparse(X):
    if hasattr(X, 'toarray'):
        return X.toarray()
    return X

class SVM_classifier():
    def __init__(self, learning_rate=0.01, lambda_=10, iterations=1000):
        self.learning_rate = learning_rate
        self.lambda_ = lambda_
        self.iterations = iterations
        self.W = None
        self.B = None
    
    def compute_hingle_loss(self, W, B, X_batch, y_batch):
        """ calculate hinge loss """
        N = X_batch.shape[0]
        distance = []
        for idx, x in enumerate(X_batch):
            distance.append(max(0, 1 - y_batch[idx] * (np.dot(x, W) + B)))

        distances = np.array(distance) # let distance into numpy array 
    
        hinge_loss = self.lambda_ * (np.sum(distances) / N) # find hinge loss
        
        # calculate cost
        cost = 1 / 2 * np.dot(W, W) + hinge_loss
        return cost

    def gradientDescent(self, W, B, X_batch, Y_batch):
        distance = []
        for idx, x in enumerate(X_batch):
            distance.append(1 - Y_batch[idx] * (np.dot(x, W) + B))
    
        dw = np.zeros(len(W))
        dB = 0
        for idx, d in enumerate(distance):
            if max(0, d) == 0:
                dw += W
                dB += 0
            else:
                dw += W - (self.lambda_ * Y_batch[idx] * X_batch[idx])
                dB += 0 - (self.lambda_ * Y_batch[idx])
        
        dw = dw / len(Y_batch)  # average
        dB = dB / len(Y_batch)  # avg
        return dw, dB
        
    def fit(self, features, outputs) -> bool:
        # print(features)
        features = convert_to_dense_if_sparse(features)

        # print(features.shape)
        max_epochs = self.iterations
        weights = np.zeros(features.shape[1])
        bias = 0
        nth = 0

        prev_cost = float("inf")
        cost_threshold = 0.01  # in percent
        
        for epoch in range(1, max_epochs):
            gradW, gradB = self.gradientDescent(weights, bias, features, outputs)

            # convergence check on 2^nth epoch
            if epoch == 2 ** nth or epoch == max_epochs - 1:
                cost = self.compute_hingle_loss(weights, bias, features, outputs)
                print("Epoch is:{} and Cost is: {}".format(epoch, cost))
                # stoppage criterion
                if abs(prev_cost - cost) < cost_threshold * prev_cost:
                    self.W = weights
                    self.B = bias
                    return True
                prev_cost = cost
                nth += 1
            
            # update grad
            weights = weights - (self.learning_rate * gradW)
            bias = bias - (self.learning_rate * gradB)
            
        self.W = weights
        self.B = bias
        return True
    
    def decisionFunc(self, X):
        X = convert_to_dense_if_sparse(X)
        ans = []
        for x in X:
            ans.append(np.dot(x, self.W) + self.B)
        return np.array(ans)
    
    def predict(self, X):
        X = convert_to_dense_if_sparse(X)
        # print(X)
        prediction = []
        for x in X:
            prediction.append(np.dot(x, self.W) + self.B) # w.x + b
        
        # print(np.sign(prediction))
        return np.sign(prediction)

    # Evaluate the model
    def evaluate(self, X_test, y_test):
        predictions = self.predict(X_test)
        correct = 0
        cnt_pos = 0
        cnt_neg = 0
        for i in range(predictions.shape[0]):
            if predictions[i] == y_test[i]:
                correct += 1
            
            if y_test[i] == 1:
                cnt_pos += 1
            else:
                cnt_neg += 1
        accuracy = correct / y_test.shape[0]
        print(f'Accuracy: {accuracy * 100:.2f}%')
        print(f"Pos_rate: {cnt_pos / y_test.shape[0] * 100:.2}%")
        return accuracy
    

class OvRClassifier:
    def __init__(self, n_classes, learning_rate=0.001, lambda_=10000, iterations=1000):
        self.n_classes = n_classes
        self.models = [SVM_classifier(learning_rate, lambda_, iterations) for _ in range(n_classes)]

    def fit(self, X, y):
        for i in range(self.n_classes):
            y_binary = np.where(y == i, 1, -1) # in y and val == i then in y_binary = 1, others = -1 
            self.models[i].fit(X, y_binary)

    def predict(self, X):
        decision_values = np.array([model.decisionFunc(X) for model in self.models])
        return np.argmax(decision_values, axis=0)
    
    def evaluate(self, X_test, y_test):
        predictions = self.predict(X_test)
        correct = 0
        cnt_pos = 0
        cnt_neg = 0
        cnt_net = 0
        for i in range(predictions.shape[0]):
            if predictions[i] == y_test[i]:
                correct += 1
            
            if y_test[i] == 2:
                cnt_pos += 1
            elif y_test[i] == 0:
                cnt_neg += 1
            else:
                cnt_net += 1
                
        accuracy = correct / y_test.shape[0]
        print(f'Accuracy: {accuracy * 100:.2f}%')
        print(f"Number: {cnt_pos}, {cnt_neg}, {cnt_net}")
        return accuracy

In [126]:
svm = OvRClassifier(n_classes=3)


In [127]:
svm.fit(X_train_tfidf, y_train)

Epoch is:1 and Cost is: 10000.0
Epoch is:2 and Cost is: 11942.179702986883
Epoch is:4 and Cost is: 7543.963381272171
Epoch is:8 and Cost is: 11877.288286360781
Epoch is:16 and Cost is: 7745.318347159508
Epoch is:32 and Cost is: 12359.092777590484
Epoch is:64 and Cost is: 3867.706701647922
Epoch is:128 and Cost is: 2984.945406229369
Epoch is:256 and Cost is: 2919.3158522412864
Epoch is:512 and Cost is: 2541.088397144313
Epoch is:999 and Cost is: 3030.508436172383
Epoch is:1 and Cost is: 10000.0
Epoch is:2 and Cost is: 9349.060194380203
Epoch is:4 and Cost is: 22501.723882930546
Epoch is:8 and Cost is: 14498.441942188265
Epoch is:16 and Cost is: 9630.768945856384
Epoch is:32 and Cost is: 20201.362792690663
Epoch is:64 and Cost is: 15784.08380024346
Epoch is:128 and Cost is: 16765.80851846506
Epoch is:256 and Cost is: 14298.77589198786
Epoch is:512 and Cost is: 10999.593209766615
Epoch is:999 and Cost is: 8024.036871774681
Epoch is:1 and Cost is: 10000.0
Epoch is:2 and Cost is: 14878.8367

In [129]:
svm.evaluate(X_test_tfidf, y_test)

Accuracy: 69.89%
Number: 372, 175, 622


0.6988879384088965