##### Definition of the Cleaner class

In [195]:
import re
import unicodedata


def remove_punctuation(string_data: str):
    pattern_punctuation = re.compile(r"[\-\"`$%^&*(|)/~\[\]{}:;+,._='?!]+")
    return pattern_punctuation.sub('', string_data)


def remove_small_tweets(xs, ys):
    # ys = [ys[i] for i in range(0, len(ys)) if len(xs[i]) > 3]
    # xs = [x for x in xs if len(x) > 3]
    return xs, ys


class Cleaner:
    def __init__(self):
        self.__regex_dict = {
            'URL': r"""(?xi)\b(?:(?:https?|ftp|file):\/\/|www\.|ftp\.|pic\.|twitter\.|facebook\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:;,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:;,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])""",
            'EMOJI': u'([\U0001F1E0-\U0001F1FF])|([\U0001F300-\U0001F5FF])|([\U0001F600-\U0001F64F])|([\U0001F680-\U0001F6FF])|([\U0001F700-\U0001F77F])|([\U0001F800-\U0001F8FF])|([\U0001F900-\U0001F9FF])|([\U0001FA00-\U0001FA6F])|([\U0001FA70-\U0001FAFF])|([\U00002702-\U000027B0])|([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])',
            'HASHTAG': r"\#\b[\w\-\_]+\b",
            'EMAIL': r"(?:^|(?<=[^\w@.)]))(?:[\w+-](?:\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(?:\.(?:[a-z]{2,})){1,3}(?:$|(?=\b))",
            'MENTION': r"@[A-Za-z0-9]+",
            'CASHTAG': r"(?:[$\u20ac\u00a3\u00a2]\d+(?:[\\.,']\d+)?(?:[MmKkBb](?:n|(?:il(?:lion)?))?)?)|(?:\d+(?:[\\.,']\\d+)?[$\u20ac\u00a3\u00a2])",
            'DATE': r"(?:(?:(?:(?:(?<!:)\b\'?\d{1,4},? ?)?\b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|May|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b(?:(?:,? ?\'?)?\d{1,4}(?:st|nd|rd|n?th)?\b(?:[,\\/]? ?\'?\d{2,4}[a-zA-Z]*)?(?: ?- ?\d{2,4}[a-zA-Z]*)?(?!:\d{1,4})\b))|(?:(?:(?<!:)\b\\'?\d{1,4},? ?)\b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|May|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b(?:(?:,? ?\'?)?\d{1,4}(?:st|nd|rd|n?th)?\b(?:[,\\/]? ?\'?\d{2,4}[a-zA-Z]*)?(?: ?- ?\d{2,4}[a-zA-Z]*)?(?!:\d{1,4})\b)?))|(?:\b(?<!\d\\.)(?:(?:(?:[0123]?[0-9][\\.\\-\\/])?[0123]?[0-9][\\.\\-\\/][12][0-9]{3})|(?:[0123]?[0-9][\\.\\-\\/][0123]?[0-9][\\.\\-\\/][12]?[0-9]{2,3}))(?!\.\d)\b))",
            'TIME': r'(?:(?:\d+)?\\.?\d+(?:AM|PM|am|pm|a\\.m\\.|p\\.m\\.))|(?:(?:[0-2]?[0-9]|[2][0-3]):(?:[0-5][0-9])(?::(?:[0-5][0-9]))?(?: ?(?:AM|PM|am|pm|a\\.m\\.|p\\.m\\.))?)',
            'EMPHASIS': r"(?:\*\b\w+\b\*)",
            'ELONG': r"\b[A-Za-z]*([a-zA-Z])\1\1[A-Za-z]*\b"
        }

        self.__regexes = {k: re.compile(self.__regex_dict[k]) for k, v in self.__regex_dict.items()}

        self.__contraction_mapping = {"’": "'", "RT ": " ", "ain't": "is not", "aren't": "are not", "can't": "can not",
                                      "'cause": "because", "could've": "could have",
                                      "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
                                      "don't": "do not", "hadn't": "had not",
                                      "hasn't": "has not", "haven't": "have not", "he'd": "he would",
                                      "he'll": "he will",
                                      "he's": "he is",
                                      "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                                      "how's": "how is", "I'd": "I would",
                                      "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
                                      "I'm": "I am",
                                      "I've": "I have",
                                      "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
                                      "i'll've": "i will have", "i'm": "i am",
                                      "i've": "i have", "isn't": "is not", "it'd": "it would",
                                      "it'd've": "it would have",
                                      "it'll": "it will",
                                      "it'll've": "it will have", "it's": "it is", "it’s": "it is", "let's": "let us",
                                      "ma'am": "madam", "mayn't": "may not",
                                      "might've": "might have", "mightn't": "might not",
                                      "mightn't've": "might not have",
                                      "must've": "must have",
                                      "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                                      "needn't've": "need not have",
                                      "o'clock": "of the clock", "oughtn't": "ought not",
                                      "oughtn't've": "ought not have",
                                      "shan't": "shall not",
                                      "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
                                      "she'd've": "she would have",
                                      "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                                      "should've": "should have",
                                      "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                                      "so's": "so as",
                                      "this's": "this is", "that'd": "that would", "that'd've": "that would have",
                                      "that's": "that is",
                                      "there'd": "there would", "there'd've": "there would have", "there's": "there is",
                                      "here's": "here is",
                                      "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                                      "they'll've": "they will have",
                                      "they're": "they are", "they've": "they have", "to've": "to have",
                                      "wasn't": "was not", "we'd": "we would",
                                      "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                                      "we're": "we are", "we've": "we have",
                                      "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
                                      "what're": "what are",
                                      "what's": "what is", "what've": "what have", "when's": "when is",
                                      "when've": "when have", "where'd": "where did",
                                      "where's": "where is", "where've": "where have", "who'll": "who will",
                                      "who'll've": "who will have",
                                      "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
                                      "will've": "will have",
                                      "won't": "will not", "won't've": "will not have", "would've": "would have",
                                      "wouldn't": "would not",
                                      "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                                      "y'all'd've": "you all would have",
                                      "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
                                      "you'd've": "you would have",
                                      "you'll": "you will", "you'll've": "you will have", "you're": "you are",
                                      "you've": "you have", "It's": "It is", "You'd": "You would",
                                      ' u ': " you ", 'yrs': 'years', 'FYI': 'For your information', ' im ': ' I am ',
                                      'lol': 'LOL', 'You\'re': 'You are'
            , 'can’t': 'can not', '…': '. ', '...': '. ', '\'\'': '\'', '≠': '', 'ain’t': 'am not', 'I’m': 'I am',
                                      'RT\'s': ''}
        self.__emoticons = {
            ':*': '<kiss>',
            ':-*': '<kiss>',
            ':x': '<kiss>',
            ':-)': '<happy>',
            ':-))': '<happy>',
            ':-)))': '<happy>',
            ':-))))': '<happy>',
            ':-)))))': '<happy>',
            ':-))))))': '<happy>',
            ':)': '<happy>',
            ':))': '<happy>',
            ':)))': '<happy>',
            ':))))': '<happy>',
            ':)))))': '<happy>',
            ':))))))': '<happy>',
            ':)))))))': '<happy>',
            ':o)': '<happy>',
            ':]': '<happy>',
            ':3': '<happy>',
            ':c)': '<happy>',
            ':>': '<happy>',
            '=]': '<happy>',
            '8)': '<happy>',
            '=)': '<happy>',
            ':}': '<happy>',
            ':^)': '<happy>',
            '|;-)': '<happy>',
            ":'-)": '<happy>',
            ":')": '<happy>',
            '\o/': '<happy>',
            '*\\0/*': '<happy>',
            ':-D': '<laugh>',
            ':D': '<laugh>',
            '8-D': '<laugh>',
            '8D': '<laugh>',
            'x-D': '<laugh>',
            'xD': '<laugh>',
            'X-D': '<laugh>',
            'XD': '<laugh>',
            '=-D': '<laugh>',
            '=D': '<laugh>',
            '=-3': '<laugh>',
            '=3': '<laugh>',
            'B^D': '<laugh>',
            '>:[': '<sad>',
            ':-(': '<sad>',
            ':-((': '<sad>',
            ':-(((': '<sad>',
            ':-((((': '<sad>',
            ':-(((((': '<sad>',
            ':-((((((': '<sad>',
            ':-(((((((': '<sad>',
            ':(': '<sad>',
            ':((': '<sad>',
            ':(((': '<sad>',
            ':((((': '<sad>',
            ':(((((': '<sad>',
            ':((((((': '<sad>',
            ':(((((((': '<sad>',
            ':((((((((': '<sad>',
            ':-c': '<sad>',
            ':c': '<sad>',
            ':-<': '<sad>',
            ':<': '<sad>',
            ':-[': '<sad>',
            ':[': '<sad>',
            ':{': '<sad>',
            ':-||': '<sad>',
            ':@': '<sad>',
            ":'-(": '<sad>',
            ":'(": '<sad>',
            'D:<': '<sad>',
            'D:': '<sad>',
            'D8': '<sad>',
            'D;': '<sad>',
            'D=': '<sad>',
            'DX': '<sad>',
            'v.v': '<sad>',
            "D-':": '<sad>',
            '(>_<)': '<sad>',
            ':|': '<sad>',
            '>:O': '<surprise>',
            ':-O': '<surprise>',
            ':-o': '<surprise>',
            ':O': '<surprise>',
            '°o°': '<surprise>',
            'o_O': '<surprise>',
            'o_0': '<surprise>',
            'o.O': '<surprise>',
            'o-o': '<surprise>',
            '8-0': '<surprise>',
            '|-O': '<surprise>',
            ';-)': '<wink>',
            ';)': '<wink>',
            '*-)': '<wink>',
            '*)': '<wink>',
            ';-]': '<wink>',
            ';]': '<wink>',
            ';D': '<wink>',
            ';^)': '<wink>',
            ':-,': '<wink>',
            '>:P': '<tong>',
            ':-P': '<tong>',
            ':P': '<tong>',
            'X-P': '<tong>',
            'x-p': '<tong>',
            ':-p': '<tong>',
            ':p': '<tong>',
            '=p': '<tong>',
            ':-Þ': '<tong>',
            ':Þ': '<tong>',
            ':-b': '<tong>',
            ':b': '<tong>',
            ':-&': '<tong>',
            '>:\\': '<annoyed>',
            '>:/': '<annoyed>',
            ':-/': '<annoyed>',
            ':-.': '<annoyed>',
            ':/': '<annoyed>',
            ':\\': '<annoyed>',
            '=/': '<annoyed>',
            '=\\': '<annoyed>',
            ':L': '<annoyed>',
            '=L': '<annoyed>',
            ':S': '<annoyed>',
            '>.<': '<annoyed>',
            ':-|': '<annoyed>',
            '<:-|': '<annoyed>',
            ':-X': '<seallips>',
            ':X': '<seallips>',
            ':-#': '<seallips>',
            ':#': '<seallips>',
            'O:-)': '<angel>',
            '0:-3': '<angel>',
            '0:3': '<angel>',
            '0:-)': '<angel>',
            '0:)': '<angel>',
            '0;^)': '<angel>',
            '>:)': '<devil>',
            '>:D': '<devil>',
            '>:-D': '<devil>',
            '>;)': '<devil>',
            '>:-)': '<devil>',
            '}:-)': '<devil>',
            '}:)': '<devil>',
            '3:-)': '<devil>',
            '3:)': '<devil>',
            'o/\o': '<highfive>',
            '^5': '<highfive>',
            '>_>^': '<highfive>',
            '^<_<': '<highfive>',
            '<3': '<heart>',
            '^3^': '<smile>',
            "(':": '<smile>',
            " > < ": '<smile>',
            "UvU": '<smile>',
            "uwu": '<smile>',
            'UwU': '<smile>'
        }

    def __get_compiled_regexes(self):
        regexes = {k: re.compile(self.__regex_dict[k]) for k, v in self.__regex_dict.items()}
        return regexes

    def map_contractions(self, string_data: str) -> str:
        for c in self.__contraction_mapping:
            string_data = string_data.replace(c, self.__contraction_mapping[c])
        return string_data

    def map_emoticons(self, string_data: str) -> str:
        for emoticon in self.__emoticons:
            string_data = string_data.replace(emoticon, ' ' + self.__emoticons[emoticon][1:-1] + ' ')
        return string_data

    def delete_emoticons(self, string_data: str) -> str:
        for em in self.__emoticons:
            string_data = string_data.replace(em, '')
        return string_data

    def delete_url(self, string_data: str) -> str:
        pattern_url = self.__regexes['URL']
        return pattern_url.sub('', string_data)

    def default_transform(self, string_data: str) -> str:
        string_data = unicodedata.normalize('NFKD', string_data).encode('ascii', errors='ignore').decode('utf8',
                                                                                                         errors='ignore')
        string_data = string_data.lower()
        for word in self.__contraction_mapping.keys():
            string_data = string_data.replace(word, self.__contraction_mapping[word])
        string_data = remove_punctuation(string_data)
        string_data = re.sub(r'\b([b-hB-Hj-zJ-Z] )', ' ', string_data)
        string_data = re.sub(r'( [b-hB-Hj-zJ-Z])\b', ' ', string_data)
        string_data = re.sub(r'(istj|istp|isfj|isfp|infj|infp|intj|intp|estp|estj|esfp|esfj|enfp|enfj|entp|entj)',
                             '', string_data, flags=re.IGNORECASE)
        string_data = ' '.join(string_data.split())
        return string_data

    def remove_items(self, tags: [str], string_data: str) -> str:
        for tag in tags:
            if tag not in self.__regexes:
                print('wrong tag: ', tag)
            else:
                string_data = self.__regexes[tag].sub('', string_data)
        return string_data

    def map_items(self, tags: [str], string_data: str) -> str:
        for tag in tags:
            if tag not in self.__regexes:
                print('wrong tag: ', tag)
            else:
                string_data = self.__regexes[tag].sub(' <' + tag + '> ', string_data)
        return string_data

    def remove_all_items(self, string_data: str) -> str:
        for _, reg in self.__regexes.items():
            string_data = reg.sub('', string_data)
        return string_data

In [196]:
cleaner = Cleaner()

##### Load data from .csv files

In [197]:
import pandas as pd
from os import path

data_dir = "data"
train_file_name = "Train.csv"
test_file_name = "Test.csv"

train_full_path = path.join(data_dir, train_file_name)
test_full_path = path.join(data_dir, test_file_name)


def get_data(path_to_csv: str, ys_index: int):
    input_data = pd.read_csv(path_to_csv)
    xs = input_data[input_data.columns[-1]]
    ys = input_data[input_data.columns[ys_index]]
    return xs, ys


def get_sentiment_data(path_to_csv: str):
    xs, ys = get_data(path_to_csv, 1)
    xs = [xs[i] for i in range(0, len(xs)) if ys[i] != "irrelevant"]
    ys = [y for y in ys if y != "irrelevant"]
    return xs, ys


def get_topic_data(path_to_csv: str):
    return get_data(path_to_csv, 0)


sentiment_xs_train, sentiment_ys_train = get_sentiment_data(train_full_path)
sentiment_xs_test, sentiment_ys_test = get_sentiment_data(test_full_path)
assert len(sentiment_xs_train) == len(sentiment_ys_train)
assert len(sentiment_xs_test) == len(sentiment_ys_test)
print("Sentiment data loaded")

topic_xs_train, topic_ys_train = get_topic_data(train_full_path)
topic_xs_test, topic_ys_test = get_topic_data(test_full_path)
assert len(topic_xs_train) == len(topic_ys_train)
assert len(topic_xs_test) == len(topic_ys_test)
print("Topic data loaded")

Sentiment data loaded
Topic data loaded


##### Prepare different initial data for research

In [198]:
topic_xs_train_0 = [
    cleaner.default_transform(x)
    for x in topic_xs_train
]

topic_xs_train_1 = [
    cleaner.remove_items(['EMOJI', 'EMAIL', 'CASHTAG', 'DATE', 'EMPHASIS'], x)
    for x in topic_xs_train_0
]

topic_xs_train_1, topic_ys_train_1 = remove_small_tweets(topic_xs_train_1, topic_ys_train)

topic_xs_train_2 = [
    cleaner.remove_all_items(x)
    for x in topic_xs_train
]

topic_xs_test_0 = [
    cleaner.default_transform(x)
    for x in topic_xs_test
]

topic_xs_test_1 = [
    cleaner.remove_items(['EMOJI', 'EMAIL', 'CASHTAG', 'DATE', 'EMPHASIS'], x)
    for x in topic_xs_test_0
]

topic_xs_test_1, topic_ys_test_1 = remove_small_tweets(topic_xs_test_1, topic_ys_test)

topic_xs_test_2 = [
    cleaner.remove_all_items(x)
    for x in topic_xs_test_0
]

sentiment_xs_train_0 = [
    cleaner.default_transform(x)
    for x in sentiment_xs_train
]

sentiment_xs_train_1 = [
    cleaner.map_emoticons(
        cleaner.map_items(['URL', 'EMOJI', 'HASHTAG', 'EMAIL', 'MENTION', 'CASHTAG', 'DATE', 'TIME', 'EMPHASIS'],
                          cleaner.map_contractions(x)))
    for x in sentiment_xs_train_0
]

sentiment_xs_train_1, sentiment_ys_train_1 = remove_small_tweets(sentiment_xs_train_1, sentiment_ys_train)

sentiment_xs_test_0 = [
    cleaner.default_transform(x)
    for x in sentiment_xs_test
]

sentiment_xs_test_1 = [
    cleaner.map_emoticons(
        cleaner.remove_items(['URL', 'EMOJI', 'HASHTAG', 'EMAIL', 'MENTION', 'CASHTAG', 'DATE', 'TIME', 'EMPHASIS'],
                             cleaner.map_contractions(x)))
    for x in sentiment_xs_test_0
]

sentiment_xs_test_1, sentiment_ys_test_1 = remove_small_tweets(sentiment_xs_test_1, sentiment_ys_test)

##### Train Linear Support Vector Classificator and prepare pipelines for sentiment and topic prediction

In [199]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

sentiment_classifier = LinearSVC()
pipeline_sentiment = Pipeline([('tfidf', TfidfVectorizer()), ('clf', sentiment_classifier)], verbose=True)
pipeline_sentiment.fit(sentiment_xs_train, sentiment_ys_train)

sentiment_classifier_0 = LinearSVC()
pipeline_sentiment_0 = Pipeline([('tfidf', TfidfVectorizer()), ('clf', sentiment_classifier_0)], verbose=True)
pipeline_sentiment_0.fit(sentiment_xs_train_0, sentiment_ys_train)

sentiment_classifier_1 = LinearSVC()
pipeline_sentiment_1 = Pipeline([('tfidf', TfidfVectorizer()), ('clf', sentiment_classifier_1)], verbose=True)
pipeline_sentiment_1.fit(sentiment_xs_train_1, sentiment_ys_train_1)

topic_classifier_2 = LinearSVC()
pipeline_topic_2 = Pipeline([('tfidf', TfidfVectorizer()), ('clf', topic_classifier_2)], verbose=True)
pipeline_topic_2.fit(topic_xs_train_2, topic_ys_train)

topic_classifier = LinearSVC()
pipeline_topic = Pipeline([('tfidf', TfidfVectorizer()), ('clf', topic_classifier)], verbose=True)
pipeline_topic.fit(topic_xs_train, topic_ys_train)

topic_classifier_0 = LinearSVC()
pipeline_topic_0 = Pipeline([('tfidf', TfidfVectorizer()), ('clf', topic_classifier_0)], verbose=True)
pipeline_topic_0.fit(topic_xs_train_0, topic_ys_train)

topic_classifier_1 = LinearSVC()
pipeline_topic_1 = Pipeline([('tfidf', TfidfVectorizer()), ('clf', topic_classifier_1)], verbose=True)
pipeline_topic_1.fit(topic_xs_train_1, topic_ys_train_1)
print("\nTopic and sentiment pipelines are ready")

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s

Topic

### Make predictions and print classification report

#### Sentiment

In [200]:
from sklearn.metrics import classification_report

sentiment_predictions = pipeline_sentiment.predict(sentiment_xs_test)
print("Sentiment classification w/o preprocessing:")
print(classification_report(sentiment_ys_test, sentiment_predictions))

Sentiment classification w/o preprocessing:
              precision    recall  f1-score   support

    negative       0.67      0.59      0.63        49
     neutral       0.81      0.88      0.84       156
    positive       0.65      0.47      0.55        32

    accuracy                           0.77       237
   macro avg       0.71      0.65      0.67       237
weighted avg       0.76      0.77      0.76       237



In [201]:
from sklearn.metrics import classification_report

sentiment_predictions = pipeline_sentiment_0.predict(sentiment_xs_test_0)
print("Sentiment classification results with default preprocessing::")
print(classification_report(sentiment_ys_test, sentiment_predictions))

Sentiment classification results with default preprocessing::
              precision    recall  f1-score   support

    negative       0.70      0.57      0.63        49
     neutral       0.80      0.90      0.85       156
    positive       0.65      0.47      0.55        32

    accuracy                           0.77       237
   macro avg       0.72      0.65      0.67       237
weighted avg       0.76      0.77      0.76       237



In [202]:
from sklearn.metrics import classification_report

sentiment_predictions = pipeline_sentiment_1.predict(sentiment_xs_test_1)
print("Sentiment classification results with hard preprocessing:")
print(classification_report(sentiment_ys_test_1, sentiment_predictions))

Sentiment classification results with hard preprocessing:
              precision    recall  f1-score   support

    negative       0.70      0.53      0.60        49
     neutral       0.80      0.91      0.85       156
    positive       0.70      0.50      0.58        32

    accuracy                           0.78       237
   macro avg       0.73      0.65      0.68       237
weighted avg       0.77      0.78      0.76       237



#### Topic

In [203]:
topic_predictions = pipeline_topic_2.predict(topic_xs_test_2)
print("Topic classification results with removed all items:")
print(classification_report(topic_ys_test, topic_predictions))

Topic classification results with removed all items:
              precision    recall  f1-score   support

       apple       0.77      0.77      0.77        98
      google       0.80      0.65      0.71        79
   microsoft       0.70      0.64      0.67        78
     twitter       0.59      0.75      0.66        87

    accuracy                           0.70       342
   macro avg       0.72      0.70      0.70       342
weighted avg       0.72      0.70      0.71       342



In [204]:
topic_predictions = pipeline_topic.predict(topic_xs_test)
print("Topic classification results w/o preprocessing:")
print(classification_report(topic_ys_test, topic_predictions))

Topic classification results w/o preprocessing:
              precision    recall  f1-score   support

       apple       0.91      0.95      0.93        98
      google       0.88      0.80      0.83        79
   microsoft       0.85      0.74      0.79        78
     twitter       0.77      0.89      0.82        87

    accuracy                           0.85       342
   macro avg       0.85      0.84      0.85       342
weighted avg       0.85      0.85      0.85       342



In [205]:
topic_predictions = pipeline_topic_0.predict(topic_xs_test_0)
print("Topic classification results with default preprocessing:")
print(classification_report(topic_ys_test, topic_predictions))

Topic classification results with default preprocessing:
              precision    recall  f1-score   support

       apple       0.90      0.95      0.93        98
      google       0.85      0.78      0.82        79
   microsoft       0.87      0.74      0.80        78
     twitter       0.75      0.85      0.80        87

    accuracy                           0.84       342
   macro avg       0.84      0.83      0.83       342
weighted avg       0.84      0.84      0.84       342



In [206]:
topic_predictions = pipeline_topic_1.predict(topic_xs_test_1)
print("Topic classification results with deleted 'EMOJI', 'EMAIL', 'CASHTAG', 'DATE', 'EMPHASIS:")
print(classification_report(topic_ys_test_1, topic_predictions))

Topic classification results with deleted 'EMOJI', 'EMAIL', 'CASHTAG', 'DATE', 'EMPHASIS:
              precision    recall  f1-score   support

       apple       0.90      0.95      0.93        98
      google       0.85      0.78      0.82        79
   microsoft       0.87      0.74      0.80        78
     twitter       0.75      0.85      0.80        87

    accuracy                           0.84       342
   macro avg       0.84      0.83      0.83       342
weighted avg       0.84      0.84      0.84       342



##### Interactive classifier

In [220]:
def print_tweet_result(tweet: str):
    print('Tweet:',
          tweet,
          "\nSentiment: ",
          *pipeline_sentiment_1.predict([tweet]),
          '\nOrganisation: ',
          *pipeline_topic.predict([tweet])
          )

In [221]:
tweet = 'I already hate sitting at the computer'
print_tweet_result(tweet)

Tweet: I already hate sitting at the computer 
Sentiment:  negative 
Organisation:  microsoft
