In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer as porterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import classification_report

In [2]:
data_obama = pd.read_excel('training-Obama-Romney-tweets.xlsx', 'Obama', header=1)
data_obama.rename(columns = {'Unnamed: 1':'date', 'Unnamed: 2':'time', 
                            '1: positive, -1: negative, 0: neutral, 2: mixed':'Anootated tweet'}, inplace = True)

In [3]:
data_obama = data_obama.drop(['Unnamed: 0','date', 'time', 'Your class', 'Unnamed: 6'], axis=1)

In [4]:
data_obama['Class'] = data_obama['Class'].apply(str)
# print(data_obama[data_obama['Class']=='irrevelant'].count())
# print(data_obama[data_obama['Class']=='irrelevant'].count())
# print(data_obama[data_obama['Class']=='2'].count())
# print(data_obama[data_obama['Class']=='1'].count())
# print(data_obama[data_obama['Class']=='0'].count())
# print(data_obama[data_obama['Class']=='-1'].count())

data_obama = data_obama[(data_obama['Class'] == '1') | (data_obama['Class'] == '0') | (data_obama['Class'] == '-1')]



In [5]:
data_obama['Class'] = data_obama['Class'].apply(int)
data_obama = data_obama.dropna()

In [6]:
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

In [7]:
def removeStopWords(tweet):
    
    filtered_tweet = [];

    #stemming
    porter = porterStemmer()
    stemmedTweet = [porter.stem(word) for word in tweet.split(" ")]
    stemmedTweet = " ".join(stemmedTweet)
    tweet = str(stemmedTweet);
    
    tweet = tweet.replace("'", "");
    
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union(['RT'])
    
    word_tokens = word_tokenize(tweet)
    
    for w in word_tokens:
        if w not in stop_words:
            filtered_tweet.append(w)
    
    eachTweet = " ".join(filtered_tweet)  
    
    return eachTweet

In [8]:
def lemmatization(tweet):
    
    tweet_list = tweet.split()
    lem = WordNetLemmatizer()
    normalized_tweet = []
    for word in tweet_list:
        normalized_text = lem.lemmatize(word,'v')
        normalized_tweet.append(normalized_text)
        
    eachTweet = " ".join(normalized_tweet) 
    
    return eachTweet

In [9]:
def preprocess_tweet(row):
    
    text = row['Anootated tweet']
    
    # HTML Decoding
    soup = BeautifulSoup(unescape(text), 'lxml')
    text = soup.text
    
    # Remove emojis
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    text = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', text)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    text = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', text)
    # Love -- <3, :*
    text = re.sub(r'(<3|:\*)', ' EMO_POS ', text)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    text = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', text)
    # Sad -- :-(, : (, :(, ):, )-:
    text = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', text)
    # Cry -- :,(, :'(, :"(
    text = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', text)
    
    # Removing @
    text = re.sub(r'@[A-Za-z0-9]+','',text)
    
    # Remove URL links
    text = re.sub('https?://[A-Za-z0-9./]+','',text)
    text = re.sub(r'www.[^ ]+', '', text)
    
    # Lower Case
    text = text.lower()
    
    #Remove words with repetition greater than 2
    word = re.sub(r'(.)\1+', r'\1\1', text)
    
    # Remove negative patterns
    text = neg_pattern.sub(lambda x: negations_dic[x.group()], text)
    
    # Remove Hashtags & Numbers
    text = re.sub("[^a-zA-Z]", " ", text)
    
    # remove extra white spaces
    text = re.sub(r'\s+', r' ', text)
    
    text = removeStopWords(text)
    
    text = lemmatization(text)
    
    return text

In [10]:
data_obama['Anootated tweet'] = data_obama.apply(preprocess_tweet, axis=1)

In [11]:
#Split the data into training, validation and test dataset

x = data_obama['Anootated tweet']
y = data_obama['Class']
SEED = 2000

# Split data
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.1, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [12]:
print(len(x_train))
print(len(x_validation))
print(len(x_test))

5061
281
282


In [13]:
#Set TF-IDF params

n_features=100000
ngram_range=(1,3)
stop_words = set(stopwords.words('english'))
stop_words = stop_words.union(['rt'])


tvec = TfidfVectorizer()
tvec.set_params(stop_words=None, max_features=n_features, ngram_range=ngram_range)

TfidfVectorizer(max_features=100000, ngram_range=(1, 3))

In [14]:
target_class = ['-1', '0', '1']

## Multinomial NB

In [15]:
multinomialNBClassifier = MultinomialNB();
checker_pipeline = Pipeline([
            ('vectorizer', tvec),
            ('classifier', multinomialNBClassifier)
        ])
sentiment_fit = checker_pipeline.fit(x_train, y_train)


In [16]:
y_pred = sentiment_fit.predict(x_validation)

In [17]:
print(accuracy_score(y_validation, y_pred))
print(classification_report(y_validation, y_pred, target_names=target_class))

0.6263345195729537
              precision    recall  f1-score   support

          -1       0.58      0.73      0.65        92
           0       0.62      0.52      0.57       102
           1       0.69      0.64      0.67        87

    accuracy                           0.63       281
   macro avg       0.63      0.63      0.63       281
weighted avg       0.63      0.63      0.62       281



In [18]:
y_pred_test = sentiment_fit.predict(x_test)
print(accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test, target_names=target_class))

0.6418439716312057
              precision    recall  f1-score   support

          -1       0.63      0.76      0.69       102
           0       0.64      0.52      0.57        97
           1       0.65      0.64      0.65        83

    accuracy                           0.64       282
   macro avg       0.64      0.64      0.64       282
weighted avg       0.64      0.64      0.64       282



## Simple Linear SVC

In [19]:

clf = LinearSVC()

checker_pipeline = Pipeline([
            ('vectorizer', tvec),
            ('classifier', clf)
        ])
sentiment_fit = checker_pipeline.fit(x_train, y_train)


In [20]:
y_pred = sentiment_fit.predict(x_validation)
print(accuracy_score(y_validation, y_pred))
print(classification_report(y_validation, y_pred, target_names=target_class))

0.6227758007117438
              precision    recall  f1-score   support

          -1       0.65      0.68      0.67        92
           0       0.62      0.47      0.53       102
           1       0.60      0.74      0.66        87

    accuracy                           0.62       281
   macro avg       0.62      0.63      0.62       281
weighted avg       0.62      0.62      0.62       281



In [21]:
y_pred_test = sentiment_fit.predict(x_test)
print(accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test, target_names=target_class))

0.6418439716312057
              precision    recall  f1-score   support

          -1       0.68      0.74      0.71       102
           0       0.66      0.47      0.55        97
           1       0.59      0.72      0.65        83

    accuracy                           0.64       282
   macro avg       0.64      0.64      0.64       282
weighted avg       0.65      0.64      0.64       282



## Ridge Classifier

In [22]:
from sklearn.linear_model import RidgeClassifier

clf = RidgeClassifier()

checker_pipeline = Pipeline([
            ('vectorizer', tvec),
            ('classifier', clf)
        ])

sentiment_fit = checker_pipeline.fit(x_train, y_train)

In [23]:
y_pred = sentiment_fit.predict(x_validation)
print(accuracy_score(y_validation, y_pred))
print(classification_report(y_validation, y_pred, target_names=target_class))

0.6298932384341637
              precision    recall  f1-score   support

          -1       0.63      0.71      0.67        92
           0       0.65      0.47      0.55       102
           1       0.62      0.74      0.67        87

    accuracy                           0.63       281
   macro avg       0.63      0.64      0.63       281
weighted avg       0.63      0.63      0.62       281



In [24]:
y_pred_test = sentiment_fit.predict(x_test)
print(accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test, target_names=target_class))

0.648936170212766
              precision    recall  f1-score   support

          -1       0.68      0.75      0.71       102
           0       0.68      0.48      0.57        97
           1       0.59      0.72      0.65        83

    accuracy                           0.65       282
   macro avg       0.65      0.65      0.64       282
weighted avg       0.65      0.65      0.64       282



In [25]:
data_obama_test = pd.read_excel('test.xlsx', 'Obama')

In [26]:
data_obama_test.head()

Unnamed: 0,Anootated tweet,Class
0,I'd like to punch <e>Obama</e> in the face. So...,-1
1,<e>Obama</e> wants to tax foreign earnings. Th...,-1
2,<e>Obama</e> has to maintain his professionali...,1
3,I hate <e>Obama</e> with a BURNING PASSION #de...,-1
4,I don't like<e>Obama</e> because his stupid <a...,-1


In [27]:
data_obama_test['Class'] = data_obama_test['Class'].apply(str)
# print(data_obama[data_obama['Class']=='irrevelant'].count())
# print(data_obama[data_obama['Class']=='irrelevant'].count())
# print(data_obama[data_obama['Class']=='2'].count())
# print(data_obama[data_obama['Class']=='1'].count())
# print(data_obama[data_obama['Class']=='0'].count())
# print(data_obama[data_obama['Class']=='-1'].count())

data_obama_test = data_obama_test[(data_obama_test['Class'] == '1') | (data_obama_test['Class'] == '0') | (data_obama_test['Class'] == '-1')]



In [28]:
data_obama_test.head()

Unnamed: 0,Anootated tweet,Class
0,I'd like to punch <e>Obama</e> in the face. So...,-1
1,<e>Obama</e> wants to tax foreign earnings. Th...,-1
2,<e>Obama</e> has to maintain his professionali...,1
3,I hate <e>Obama</e> with a BURNING PASSION #de...,-1
4,I don't like<e>Obama</e> because his stupid <a...,-1


In [29]:
data_obama_test.count()

Anootated tweet    1951
Class              1951
dtype: int64

In [30]:
data_obama_test['Class'] = data_obama_test['Class'].apply(int)
data_obama_test = data_obama_test.dropna()

In [31]:
data_obama_test.count()

Anootated tweet    1951
Class              1951
dtype: int64

In [32]:
data_obama_test['Anootated tweet'] = data_obama_test.apply(preprocess_tweet, axis=1)

In [33]:
y_pred_test = sentiment_fit.predict(data_obama_test['Anootated tweet'])
print(accuracy_score(data_obama_test['Class'], y_pred_test))
print(classification_report(data_obama_test['Class'], y_pred_test, target_names=target_class))

0.566888774987186
              precision    recall  f1-score   support

          -1       0.57      0.65      0.61       688
           0       0.56      0.48      0.52       681
           1       0.57      0.57      0.57       582

    accuracy                           0.57      1951
   macro avg       0.57      0.57      0.56      1951
weighted avg       0.57      0.57      0.56      1951

