## Sentiment Analysis from Tweets

This will involve implementing functions for a text classifier, which I will train to identify the **sentiment expressed in a text** in a dataset of approx. 27,000 entries, which will be split into a 80%/20% training/test split.

In [1]:
%pip install unicodecsv
%pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: C:\Users\hnkha\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip






[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: C:\Users\hnkha\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import csv
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support 
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
from sklearn import metrics
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hnkha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
raw_data = []
def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding="utf8") as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label) in raw_data[:num_training_samples]:
        train_data.append((to_feature_vector(pre_process(text)),label))
    for (text, label) in raw_data[num_training_samples:]:
        test_data.append((to_feature_vector(pre_process(text)),label))

## Input and Basic preprocessing

In [4]:
def parse_data_line(data_line):
    return (data_line[1], data_line[2])

In [5]:
# I want to test if my function works
load_data("sentiment-dataset.tsv")
x = raw_data[:5]
for y in x:
    print(y)

("Literally so excited I'm going to a Sam Smith concert in October", 'positive')
('@AngryRaiderFan I know. This, TPP, expanded wars and drone strikes, mass surveillance, on and on...', 'negative')
("@rinashah I have been using Moto G 2nd Gen for over a month now and it's an absolute delight. Stock Android. Good design. Best.", 'positive')
("Juan  Just heard Green Day's 'Time of our life' for the 1st time since leaving florida and i burst into tears. I miss everyone...  Kellogg", 'negative')
("So Fidel Castro has died.  Don't worry, George Soros is willing to fill his shoes as Most Wicked Man In The World #wicked #publicenemy1", 'negative')


In [6]:
# from one of my statements above I can see that there is some extra spaces after certain words
# I want to clean this up by removing the extra spaces + a few other processing techniques

def pre_process(text):
    print("original:", text)
    # text = remove_spaces(text)
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text) 
    text = re.sub(r"([.,;:!?'\"“\(\)])(\w)", r"\1 \2", text)  
    tokens = re.split(r"\s+", text)
    text = text.lower()
    stop_words = set(stopwords.words('english')) 
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    # print("tokenising:", tokens)
    return tokens

In [7]:
# I want to test if my function works
pre_process("Literally so excited I'm going to a Sam Smith concert in October")

original: Literally so excited I'm going to a Sam Smith concert in October


['literally', 'excited', "'", 'going', 'sam', 'smith', 'concert', 'october']

## Basic Feature Extraction

In [8]:
global_feature_dict = {} # A global dictionary of features
i = 0

def to_feature_vector(tokens, training=False):

    local_feature_dict = {}
    
    for x in tokens:
        
        if x in global_feature_dict:
            i = global_feature_dict[x]
        else:
                i = len(global_feature_dict) + 1
                global_feature_dict[x] = i
        
        if i in local_feature_dict:
            local_feature_dict[i] += 1/len(tokens)
        else:
            local_feature_dict[i] = 1/len(tokens)
            
    return local_feature_dict


In [9]:
text = "So Fidel Castro has died.  Don't worry, George Soros is willing to fill his shoes as Most Wicked Man In The World #wicked #publicenemy1"
print(to_feature_vector(pre_process(text), training=True))
print(global_feature_dict)

original: So Fidel Castro has died.  Don't worry, George Soros is willing to fill his shoes as Most Wicked Man In The World #wicked #publicenemy1
{1: 0.058823529411764705, 2: 0.058823529411764705, 3: 0.058823529411764705, 4: 0.058823529411764705, 5: 0.058823529411764705, 6: 0.058823529411764705, 7: 0.058823529411764705, 8: 0.058823529411764705, 9: 0.058823529411764705, 10: 0.058823529411764705, 11: 0.058823529411764705, 12: 0.058823529411764705, 13: 0.058823529411764705, 14: 0.058823529411764705, 15: 0.058823529411764705, 16: 0.058823529411764705, 17: 0.058823529411764705}
{'fidel': 1, 'castro': 2, 'died': 3, '.': 4, "'": 5, 'worry': 6, ',': 7, 'george': 8, 'soros': 9, 'willing': 10, 'fill': 11, 'shoes': 12, 'wicked': 13, 'man': 14, 'world': 15, '#wicked': 16, '#publicenemy1': 17}


In [10]:
# Training and Validating our Classifier

def train_classifier(data):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(data)

## Cross-validation

In [11]:
def cross_validate(dataset, folds):
    results = []
    fold_size = int(len(dataset)/folds) + 1
    
    for i in range(0,len(dataset),int(fold_size)):
        
        train_fold = dataset[:i] + dataset[i + fold_size:]
        validation_fold = dataset[i : i + fold_size]

        classifier = train_classifier(train_fold)
        
        validation_statements = [x[0] for x in validation_fold]
        correct_outputs = [x[1] for x in validation_fold]

        predicted_outputs = predict_labels(validation_statements, classifier)
        
        classifications = ["positive", "negative"]
        report = classification_report(correct_outputs, predicted_outputs, target_names=classifications)
        print("Fold start on items %d - %d" % (i, i + fold_size))
        print(report)
        
        (precision, recall, f1, _) = precision_recall_fscore_support(correct_outputs, predicted_outputs, average="macro")
        accuracy = accuracy_score(correct_outputs, predicted_outputs)
        
        results.append(( precision, recall, f1, accuracy))
        
    cv_results = []
    for x in range(4):
        sum = 0
        for y in results:
            sum += y[x]
        cv_results.append(sum/len(results))

    return cv_results

In [12]:
# Predicting Labels Given a Classifier

def predict_labels(samples, classifier):
    """Assuming preprocessed samples, return their predicted labels from the classifier model."""
    return classifier.classify_many(samples)

def predict_label_from_raw(sample, classifier):
    """Assuming raw text, return its predicted label from the classifier model."""
    return classifier.classify(to_feature_vector(preProcess(reviewSample)))

In [13]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset


# references to the data files
data_file_path = 'sentiment-dataset.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

split_and_preprocess_data(0.8)

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 33540 rawData, 0 trainData, 0 testData
Preparing training and test data...
original: Literally so excited I'm going to a Sam Smith concert in October
original: @AngryRaiderFan I know. This, TPP, expanded wars and drone strikes, mass surveillance, on and on...
original: @rinashah I have been using Moto G 2nd Gen for over a month now and it's an absolute delight. Stock Android. Good design. Best.
original: Juan  Just heard Green Day's 'Time of our life' for the 1st time since leaving florida and i burst into tears. I miss everyone...  Kellogg
original: So Fidel Castro has died.  Don't worry, George Soros is willing to fill his shoes as Most Wicked Man In The World #wicked #publicenemy1
original: cried for every episode of Dream High 2 starting from episode 13!!! T-T tomorrow i shall watch the last and final episode!
original: Happy birthday gay, see you Monday. IMMA FUCKING JUMP ON YOU OKAY! Have a good day my love, love

In [14]:
cross_validate(train_data, 10)  # will work and output overall performance of p, r, f-score when cv implemented

Training Classifier...




Fold start on items 0 - 2684
              precision    recall  f1-score   support

    positive       0.81      0.55      0.65       818
    negative       0.83      0.94      0.88      1866

    accuracy                           0.82      2684
   macro avg       0.82      0.75      0.77      2684
weighted avg       0.82      0.82      0.81      2684

Training Classifier...




Fold start on items 2684 - 5368
              precision    recall  f1-score   support

    positive       0.74      0.43      0.54       593
    negative       0.86      0.96      0.90      2091

    accuracy                           0.84      2684
   macro avg       0.80      0.69      0.72      2684
weighted avg       0.83      0.84      0.82      2684

Training Classifier...




Fold start on items 5368 - 8052
              precision    recall  f1-score   support

    positive       0.80      0.71      0.75      1045
    negative       0.83      0.88      0.85      1639

    accuracy                           0.82      2684
   macro avg       0.81      0.80      0.80      2684
weighted avg       0.81      0.82      0.81      2684

Training Classifier...




Fold start on items 8052 - 10736
              precision    recall  f1-score   support

    positive       0.87      0.79      0.83      1330
    negative       0.81      0.88      0.84      1354

    accuracy                           0.84      2684
   macro avg       0.84      0.84      0.84      2684
weighted avg       0.84      0.84      0.84      2684

Training Classifier...




Fold start on items 10736 - 13420
              precision    recall  f1-score   support

    positive       0.82      0.65      0.72       921
    negative       0.83      0.92      0.88      1763

    accuracy                           0.83      2684
   macro avg       0.82      0.78      0.80      2684
weighted avg       0.83      0.83      0.82      2684

Training Classifier...




Fold start on items 13420 - 16104
              precision    recall  f1-score   support

    positive       0.88      0.62      0.72       931
    negative       0.82      0.95      0.88      1753

    accuracy                           0.84      2684
   macro avg       0.85      0.79      0.80      2684
weighted avg       0.84      0.84      0.83      2684

Training Classifier...




Fold start on items 16104 - 18788
              precision    recall  f1-score   support

    positive       0.84      0.64      0.73       898
    negative       0.84      0.94      0.89      1786

    accuracy                           0.84      2684
   macro avg       0.84      0.79      0.81      2684
weighted avg       0.84      0.84      0.83      2684

Training Classifier...




Fold start on items 18788 - 21472
              precision    recall  f1-score   support

    positive       0.83      0.65      0.73       908
    negative       0.84      0.93      0.88      1776

    accuracy                           0.84      2684
   macro avg       0.84      0.79      0.81      2684
weighted avg       0.84      0.84      0.83      2684

Training Classifier...




Fold start on items 21472 - 24156
              precision    recall  f1-score   support

    positive       0.82      0.65      0.73       910
    negative       0.84      0.93      0.88      1774

    accuracy                           0.83      2684
   macro avg       0.83      0.79      0.80      2684
weighted avg       0.83      0.83      0.83      2684

Training Classifier...




Fold start on items 24156 - 26840
              precision    recall  f1-score   support

    positive       0.85      0.64      0.73       929
    negative       0.83      0.94      0.88      1747

    accuracy                           0.84      2676
   macro avg       0.84      0.79      0.81      2676
weighted avg       0.84      0.84      0.83      2676



[0.8280960937668814,
 0.7801422040169822,
 0.7952421776701875,
 0.8324770716798211]

## Error Analysis

In [15]:
# a function to make the confusion matrix readable and pretty
def confusion_matrix_heatmap(y_test, preds, labels):
    """Function to plot a confusion matrix"""
    # pass labels to the confusion matrix function to ensure right order
    # cm = metrics.confusion_matrix(y_test, preds, labels)
    cm = metrics.confusion_matrix(y_test, preds, labels=labels)
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title('Confusion matrix of the classifier')
    fig.colorbar(cax)
    ax.set_xticks(np.arange(len(labels)))
    ax.set_yticks(np.arange(len(labels)))
    ax.set_xticklabels( labels, rotation=45)
    ax.set_yticklabels( labels)

    for i in range(len(cm)):
        for j in range(len(cm)):
            text = ax.text(j, i, cm[i, j],
                           ha="center", va="center", color="w")

    plt.xlabel('Predicted')
    plt.ylabel('True')
    
    # fix for mpl bug that cuts off top/bottom of seaborn viz:
    b, t = plt.ylim() # discover the values for bottom and top
    b += 0.5 # Add 0.5 to the bottom
    t -= 0.5 # Subtract 0.5 from the top
    plt.ylim(b, t) # update the ylim(bottom, top) values
    plt.show() # ta-da!
    plt.show()

## Optimising pre-processing and feature extraction

In [16]:
# Finally, check the accuracy of your classifier by training on all the traning data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = False  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = train_classifier(train_data)  # train the classifier
    test_true = [t[1] for t in test_data]   # get the ground-truth labels from the data
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # classify the test data to get predicted labels
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])