In [None]:
import pandas as pd
import numpy as np
import re
import string


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [None]:
import nltk
# Fetch the NLTK resources used further down in this notebook:
#   'punkt'     - tokenizer models (presumably what word_tokenize relies on; verify
#                 against the installed NLTK version)
#   'stopwords' - corpus read via stopwords.words('english')
#   'wordnet'   - lexical database behind WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
def load_dataset(filename, cols):
    """Read a header-less, comma-separated file into a DataFrame.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``.
        cols: Column labels to assign, in the order the fields appear.

    Returns:
        The parsed ``pandas.DataFrame`` whose columns are named ``cols``.
    """
    read_options = {
        'sep': ',',
        'header': None,      # no header row: the first line is already data
        'names': cols,
        'index_col': False,  # do not promote the first column to the index
    }
    return pd.read_csv(filename, **read_options)

In [None]:
# Remove the columns we don't care about
def remove_unwanted_cols(dataset, cols):
    """Delete the given columns from ``dataset`` in place.

    Args:
        dataset: ``pandas.DataFrame`` to modify. It is mutated, not copied.
        cols: Iterable of column labels to drop.

    Returns:
        The same ``dataset`` object, for call chaining.

    A label that is not present is reported on stdout and skipped, so the
    function can be re-run on an already-trimmed frame.
    """
    for col in cols:
        try:
            del dataset[col]
        except KeyError:
            # Only a missing column is expected here; anything else
            # (KeyboardInterrupt, programming errors) must propagate.
            print(f'Column {col} already removed')
    return dataset

In [None]:
def preprocess_tweet_text(tweet, useStemmer=False, useLemmatizer=False):
    """Clean a raw tweet and return its remaining tokens joined by spaces.

    Processing order: lowercase, strip URLs, strip ``@user`` mentions and
    ``#`` characters, strip punctuation, tokenize, drop English stopwords,
    then optionally stem and/or lemmatize.

    Args:
        tweet: The tweet text (``str``).
        useStemmer: When truthy, apply ``PorterStemmer`` to each token.
        useLemmatizer: When truthy, apply ``WordNetLemmatizer`` to each
            token (after stemming, if both are enabled).

    Returns:
        A single string of the surviving tokens separated by one space.
    """
    # str.lower() returns a new string; it must be reassigned, otherwise
    # capitalised stopwords ("The", "IS") slip past the lowercase stopword list.
    tweet = tweet.lower()

    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)

    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)

    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # Remove stopwords. The stopword list is built once per call as a set,
    # instead of being re-read and scanned linearly for every token.
    tweet_tokens = word_tokenize(tweet)
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in tweet_tokens if w not in stop_words]

    if useStemmer:
        ps = PorterStemmer()
        filtered_words = [ps.stem(w) for w in filtered_words]

    if useLemmatizer:
        lemmatizer = WordNetLemmatizer()
        # NOTE(review): pos='a' lemmatizes every token as an adjective;
        # confirm this is intended rather than the default noun POS.
        filtered_words = [lemmatizer.lemmatize(w, pos='a') for w in filtered_words]

    return " ".join(filtered_words)


In [None]:
def get_feature_vector(train_fit):
    """Build a TF-IDF vectorizer fitted on ``train_fit``.

    Args:
        train_fit: Iterable of text documents used to fit the vectorizer.

    Returns:
        The fitted ``TfidfVectorizer`` (created with ``sublinear_tf=True``).
    """
    # fit() hands back the estimator itself, so construction and fitting chain.
    return TfidfVectorizer(sublinear_tf=True).fit(train_fit)

In [None]:
def int_to_string(sentiment):
    """Translate a numeric sentiment code into its label.

    ``0`` maps to "Negative" and ``2`` to "Neutral"; every other value
    falls through to "Positive".
    """
    if sentiment == 0:
        return "Negative"
    if sentiment == 2:
        return "Neutral"
    return "Positive"
    

In [None]:
def TestDataSet(filename, useStem=False, useLema=False):
    """Train and score Naive Bayes and Logistic Regression on a tweet CSV.

    The file is loaded, trimmed to the ``score`` and ``text`` columns, the
    text is cleaned with ``preprocess_tweet_text``, and an 80/20 train/test
    split (``random_state=30``) is evaluated. Accuracies are printed.

    Args:
        filename: Path of the header-less CSV with columns
            score, id, created_at, query, user, text.
        useStem: Forwarded to ``preprocess_tweet_text`` as ``useStemmer``.
        useLema: Forwarded to ``preprocess_tweet_text`` as ``useLemmatizer``.

    Returns:
        None. Results are reported on stdout only.
    """
    # Load dataset
    dataset = load_dataset(filename, ['score', 'id', 'created_at', 'query', 'user', 'text'])
    # Remove unwanted columns from dataset
    dataset = remove_unwanted_cols(dataset, ['id', 'created_at', 'query', 'user'])
    # Clean up the text using different options
    dataset['text'] = dataset['text'].apply(
        preprocess_tweet_text, useStemmer=useStem, useLemmatizer=useLema)

    # Select by column name rather than position so the code does not depend
    # on which columns happened to be removed above.
    texts = np.asarray(dataset['text']).ravel()
    y = np.asarray(dataset['score']).ravel()

    # Split the raw text first. With the same sample count, test_size and
    # random_state, the rows land in the same partitions as before.
    text_train, text_test, y_train, y_test = train_test_split(
        texts, y, test_size=0.2, random_state=30)

    # Fit TF-IDF on the training portion only: fitting on the full dataset
    # leaks the test set's vocabulary and document frequencies into the model.
    tf_vector = get_feature_vector(text_train)
    X_train = tf_vector.transform(text_train)
    X_test = tf_vector.transform(text_test)

    # Training Naive Bayes model
    NB_model = MultinomialNB()
    NB_model.fit(X_train, y_train)
    y_predict_nb = NB_model.predict(X_test)
    print(f'Naive Bayes Score: \t\t{accuracy_score(y_test, y_predict_nb)}')

    # Training Logistics Regression model
    LR_model = LogisticRegression(solver='lbfgs')
    LR_model.fit(X_train, y_train)
    y_predict_lr = LR_model.predict(X_test)
    print(f'Logistic Regression Score: \t{accuracy_score(y_test, y_predict_lr)}')
    print(' ')


In [None]:
# Run the same evaluation for every stemmer/lemmatizer combination.
# Each entry is (banner printed before the run, useStem, useLema).
test_csv = 'data/testdata.manual.2009.06.14.csv'
experiment_grid = (
    ('Stemming OFF\tLematizer OFF', False, False),
    ('Stemming ON \tLematizer OFF', True, False),
    ('Stemming OFF \tLematizer ON', False, True),
    ('Stemming ON \tLematizer ON', True, True),
)

for banner, stem_flag, lemma_flag in experiment_grid:
    print(banner)
    TestDataSet(test_csv, stem_flag, lemma_flag)