In [1]:
import re
import pickle
import joblib
import pandas as pd

from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import nltk
from tqdm.notebook import tqdm

# Download the NLTK "omw-1.4" and "wordnet" resources.
# NOTE(review): presumably these back the WordNetLemmatizer used later — confirm.
nltk.download("omw-1.4")
nltk.download("wordnet")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\siddharth_black_pred\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\siddharth_black_pred\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
DATASET_PATH = "training.1600000.processed.noemoticon.csv"
# Only these columns are kept; the others are never read by the notebook.
DATASET_USED_COLUMNS = ["sentiment", "text"]

# The file has no header row, so column names are supplied explicitly.
# `usecols` skips the unused columns while parsing instead of loading all
# six and dropping four afterwards.
dataset = pd.read_csv(
    DATASET_PATH,
    encoding=DATASET_ENCODING,
    names=DATASET_COLUMNS,
    usecols=DATASET_USED_COLUMNS,
)

# `usecols` ignores element order, so select explicitly to pin column order.
dataset = dataset[DATASET_USED_COLUMNS]

In [3]:
# Relabel sentiment value 4 as 1 (every other value is left untouched).
dataset["sentiment"] = dataset["sentiment"].replace(to_replace=4, value=1)

# Pull the two columns out as plain Python lists.
text = list(dataset["text"])
sentiment = list(dataset["sentiment"])

In [4]:
# Emoticon -> meaning lookup. Entries are inserted in the order listed here,
# which is also the order in which the dict is iterated.
emojis = dict(
    (
        (":)", "smile"),
        (":-)", "smile"),
        (";d", "wink"),
        (":-E", "vampire"),
        (":(", "sad"),
        (":-(", "sad"),
        (":-<", "sad"),
        (":P", "raspberry"),
        (":O", "surprised"),
        (":-@", "shocked"),
        (":@", "shocked"),
        (":-$", "confused"),
        (":\\", "annoyed"),
        (":#", "mute"),
        (":X", "mute"),
        (":^)", "smile"),
        (":-&", "confused"),
        ("$_$", "greedy"),
        ("@@", "eyeroll"),
        (":-!", "confused"),
        (":-D", "smile"),
        (":-0", "yell"),
        ("O.o", "confused"),
        ("<(-_-)>", "robot"),
        ("d[-_-]b", "dj"),
        (":'-)", "sadsmile"),
        (";)", "wink"),
        (";-)", "wink"),
        ("O:-)", "angel"),
        ("O*-)", "angel"),
        ("(:-D", "gossip"),
        ("=^.^=", "cat"),
    )
)

# English stopwords, kept as an ordered list of lowercase words
# (contractions appear without their apostrophes, e.g. "youre").
stopwords = """
a about above after again ain all am an and any are as at
be because been before being below between both by
can d did do does doing down during each
few for from further
had has have having he her here hers herself him himself his how
i if in into is it its itself just
ll m ma me more most my myself
now o of on once only or other our ours ourselves out own
re s same she shes should shouldve so some such
t than that thatll the their theirs them themselves then there these they
this those through to too
under until up ve very
was we were what when where which while who whom why will with won
y you youd youll youre youve your yours yourself yourselves
""".split()

In [5]:
# Single shared lemmatizer instance, reused by preprocess() for every word.
lemmatizer = WordNetLemmatizer()


def preprocess(textdata):
    """Normalize raw tweets into space-separated, lemmatized tokens.

    For every tweet, in order: lowercase it; replace URLs with ``URL``,
    known emoticons with ``EMOJI<meaning>`` and @-mentions with ``USER``;
    turn every non-alphanumeric character into a space; squeeze runs of three
    or more identical characters down to two; then drop stopwords and
    single-character tokens and lemmatize what remains.

    Args:
        textdata: Iterable of tweet strings. A bare ``str`` is treated as a
            single tweet instead of being iterated character by character.

    Returns:
        list[str]: One cleaned string per input tweet, in input order.
    """
    if isinstance(textdata, str):
        textdata = [textdata]

    url_pattern = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)")
    user_pattern = re.compile(r"@[^\s]+")
    non_alnum_pattern = re.compile("[^a-zA-Z0-9]")
    repeat_pattern = re.compile(r"(.)\1\1+")
    repeat_replacement = r"\1\1"

    # Tweets are lowercased before emoticon matching, so the keys have to be
    # lowercased as well; otherwise entries such as ":P" or ":-D" can never
    # match. Longest keys are tried first so that e.g. "o:-)" is not consumed
    # by the shorter ":-)". The token is space-padded so it cannot fuse with
    # neighbouring words (the URL/USER placeholders are space-delimited too).
    emoji_tokens = sorted(
        (
            (emoticon.lower(), " EMOJI" + meaning + " ")
            for emoticon, meaning in emojis.items()
        ),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    # Set membership is O(1); the module-level stopword list would be scanned
    # linearly for every single word.
    stopword_set = frozenset(stopwords)

    preprocessed_texts = []
    for tweet in tqdm(textdata):
        tweet = tweet.lower()
        # URLs first, so that "://" is not mistaken for an emoticon.
        tweet = url_pattern.sub(" URL", tweet)
        for emoticon, token in emoji_tokens:
            tweet = tweet.replace(emoticon, token)
        tweet = user_pattern.sub(" USER", tweet)
        tweet = non_alnum_pattern.sub(" ", tweet)
        tweet = repeat_pattern.sub(repeat_replacement, tweet)

        kept_words = [
            lemmatizer.lemmatize(word)
            for word in tweet.split()
            if len(word) > 1 and word not in stopword_set
        ]
        preprocessed_texts.append(" ".join(kept_words))

    return preprocessed_texts

In [6]:
# Clean every tweet in `text`; preprocess() shows a tqdm progress bar while it runs.
processed_text = preprocess(text)


  0%|          | 0/1600000 [00:00<?, ?it/s]

In [7]:
# Hold out 5% of the preprocessed tweets for testing; the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    processed_text,
    sentiment,
    random_state=0,
    test_size=0.05,
)

In [8]:
# TF-IDF over unigrams and bigrams, capped at 500,000 features; the
# vocabulary is fitted on the training texts only.
vectorizer = TfidfVectorizer(max_features=500000, ngram_range=(1, 2))
vectorizer.fit(X_train)

In [9]:
# Swap both text splits for their TF-IDF matrices.
# NOTE: this rebinds X_train / X_test, so re-running this cell on its own
# would pass the matrices (not text) back into transform().
X_train, X_test = (vectorizer.transform(split) for split in (X_train, X_test))

In [10]:
def evaluate_model(model, X=None, y=None):
    """Print a classification report for ``model``.

    Args:
        model: Fitted estimator (or pipeline) exposing ``predict``.
        X: Inputs to predict on. Defaults to the notebook-level ``X_test``.
        y: True labels for ``X``. Defaults to the notebook-level ``y_test``.

    Returns:
        None. The report is printed.
    """
    # The globals are looked up at call time, so one-argument calls keep
    # using whatever X_test / y_test currently hold, exactly as before.
    features = X_test if X is None else X
    labels = y_test if y is None else y

    y_pred = model.predict(features)
    print(classification_report(labels, y_pred))

In [11]:
# Bernoulli naive Bayes classifier with smoothing alpha=2, trained on the
# TF-IDF features and scored on the held-out split.
bb_model = BernoulliNB(alpha=2).fit(X_train, y_train)
evaluate_model(bb_model)

              precision    recall  f1-score   support

           0       0.80      0.79      0.80     39989
           1       0.79      0.81      0.80     40011

    accuracy                           0.80     80000
   macro avg       0.80      0.80      0.80     80000
weighted avg       0.80      0.80      0.80     80000



In [12]:
# Linear SVM on the same TF-IDF features. The seed is pinned (same value as
# the train/test split) so that any data shuffling done by the solver is
# repeatable from run to run.
svc_model = LinearSVC(random_state=0)
svc_model.fit(X_train, y_train)
evaluate_model(svc_model)

              precision    recall  f1-score   support

           0       0.81      0.79      0.80     39989
           1       0.80      0.81      0.81     40011

    accuracy                           0.80     80000
   macro avg       0.80      0.80      0.80     80000
weighted avg       0.80      0.80      0.80     80000



In [13]:
# lr_model = LogisticRegression(C = 2, max_iter=1000, n_jobs=-1)
# lr_model.fit(X_train,y_train)
# evaluate_model(lr_model)


In [14]:
# Redo the split (same seed, same inputs) so X_train / X_test hold
# preprocessed text again rather than TF-IDF matrices: the pipeline
# vectorizes internally, and evaluate_model() reads the notebook-level X_test.
X_train, X_test, y_train, y_test = train_test_split(
    processed_text,
    sentiment,
    random_state=0,
    test_size=0.05,
)

# The pipeline wraps the existing vectorizer and BernoulliNB objects; fitting
# it refits those same objects in place.
pipe = Pipeline(steps=[("vectorizer", vectorizer), ("bnb", bb_model)])
pipe.fit(X_train, y_train)

evaluate_model(pipe)

              precision    recall  f1-score   support

           0       0.80      0.79      0.80     39989
           1       0.79      0.81      0.80     40011

    accuracy                           0.80     80000
   macro avg       0.80      0.80      0.80     80000
weighted avg       0.80      0.80      0.80     80000



In [15]:
# Persist the fitted pipeline to disk ...
with open("pipeline.pickle", "wb") as pickle_out:
    pickle.dump(pipe, pickle_out)

# ... then read it straight back and score the reloaded copy.
# (Only unpickle files you produced yourself; pickle can execute code.)
with open("pipeline.pickle", "rb") as pickle_in:
    loaded_pipe = pickle.load(pickle_in)

evaluate_model(loaded_pipe)

              precision    recall  f1-score   support

           0       0.80      0.79      0.80     39989
           1       0.79      0.81      0.80     40011

    accuracy                           0.80     80000
   macro avg       0.80      0.80      0.80     80000
weighted avg       0.80      0.80      0.80     80000



In [17]:


def predict(model, text):
    """Classify raw texts with a fitted sentiment model.

    Args:
        model: Fitted estimator or pipeline whose ``predict`` accepts the
            strings returned by ``preprocess``.
        text: Iterable of raw strings. A bare ``str`` is treated as a single
            text rather than a sequence of characters.

    Returns:
        list[list]: One ``[original_text, predicted_class, label]`` entry per
        input, where ``label`` is ``"negative"`` for class 0 and
        ``"positive"`` for class 1. Empty input gives an empty list.
    """
    pred_to_label = {0: "negative", 1: "positive"}

    # Materialize the input: it is walked twice (preprocessing and pairing
    # with predictions), which a one-shot iterator would not survive, and a
    # bare string would otherwise be zipped character by character.
    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        # Nothing to classify; skip the model call entirely.
        return []

    predictions = model.predict(preprocess(texts))

    return [[raw, pred, pred_to_label[pred]] for raw, pred in zip(texts, predictions)]

In [18]:
if __name__ == "__main__":
    # Use a dedicated name for the demo inputs: rebinding `text` here would
    # replace the full tweet list loaded earlier, so re-running the
    # preprocessing cell afterwards would silently process only three tweets.
    sample_texts = [
        "I hate twitter",
        "May the Force be with you.",
        "Mr. Stark, I don't feel so good",
    ]

    predictions = predict(loaded_pipe, sample_texts)
    print(predictions)


  0%|          | 0/3 [00:00<?, ?it/s]

[['I hate twitter', 0, 'negative'], ['May the Force be with you.', 1, 'positive'], ["Mr. Stark, I don't feel so good", 0, 'negative']]
