In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer as porterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import classification_report

In [2]:
data_romney = pd.read_excel('training-Obama-Romney-tweets.xlsx', 'Romney', header=1)
data_romney.rename(columns = {'Unnamed: 1':'date', 'Unnamed: 2':'time', 
                            '1: positive, -1: negative, 0: neutral, 2: mixed':'Anootated tweet'}, inplace = True)

In [3]:
data_romney = data_romney.drop(['Unnamed: 0','date', 'time', 'Your class label', 'Unnamed: 6'], axis=1)

In [4]:
data_romney['Class'] = data_romney['Class'].apply(str)
# print(data_obama[data_obama['Class']=='irrevelant'].count())
# print(data_obama[data_obama['Class']=='irrelevant'].count())
# print(data_obama[data_obama['Class']=='2'].count())
# print(data_obama[data_obama['Class']=='1'].count())
# print(data_obama[data_obama['Class']=='0'].count())
# print(data_obama[data_obama['Class']=='-1'].count())

data_romney = data_romney[(data_romney['Class'] == '1') | (data_romney['Class'] == '0') | (data_romney['Class'] == '-1')]

In [5]:
data_romney['Class'] = data_romney['Class'].apply(int)
data_romney = data_romney.dropna()

In [6]:
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

In [7]:
def removeStopWords(tweet):
    
    filtered_tweet = [];

    #stemming
    porter = porterStemmer()
    stemmedTweet = [porter.stem(word) for word in tweet.split(" ")]
    stemmedTweet = " ".join(stemmedTweet)
    tweet = str(stemmedTweet);
    
    tweet = tweet.replace("'", "");
    
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union(['RT'])
    
    word_tokens = word_tokenize(tweet)
    
    for w in word_tokens:
        if w not in stop_words:
            filtered_tweet.append(w)
    
    eachTweet = " ".join(filtered_tweet)  
    
    return eachTweet

In [8]:
def lemmatization(tweet):
    
    tweet_list = tweet.split()
    lem = WordNetLemmatizer()
    normalized_tweet = []
    for word in tweet_list:
        normalized_text = lem.lemmatize(word,'v')
        normalized_tweet.append(normalized_text)
        
    eachTweet = " ".join(normalized_tweet) 

    
    return eachTweet

In [9]:
def preprocess_tweet(row):
    
    text = row['Anootated tweet']
    
    # HTML Decoding
    soup = BeautifulSoup(unescape(text), 'lxml')
    text = soup.text
    
    # Remove emojis
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    text = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', text)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    text = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', text)
    # Love -- <3, :*
    text = re.sub(r'(<3|:\*)', ' EMO_POS ', text)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    text = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', text)
    # Sad -- :-(, : (, :(, ):, )-:
    text = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', text)
    # Cry -- :,(, :'(, :"(
    text = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', text)
    
    # Removing @
    text = re.sub(r'@[A-Za-z0-9]+','',text)
    
    # Remove URL links
    text = re.sub('https?://[A-Za-z0-9./]+','',text)
    text = re.sub(r'www.[^ ]+', '', text)
    
    # Lower Case
    text = text.lower()
    
    #Remove words with repetition greater than 2
    word = re.sub(r'(.)\1+', r'\1\1', text)
    
    # Remove negative patterns
    text = neg_pattern.sub(lambda x: negations_dic[x.group()], text)
    
    # Remove Hashtags & Numbers
    text = re.sub("[^a-zA-Z]", " ", text)
    
    # remove extra white spaces
    text = re.sub(r'\s+', r' ', text)
    
    #text = removeStopWords(text)
    
    #text = lemmatization(text)
    
    return text

In [10]:
data_romney['Anootated tweet'] = data_romney.apply(preprocess_tweet, axis=1)

In [11]:
#Split the data into training, validation and test dataset

x = data_romney['Anootated tweet']
y = data_romney['Class']
SEED = 2000

# Split data
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.1, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [12]:
print(len(x_train))
print(len(x_validation))
print(len(x_test))

5083
282
283


In [13]:
#Set TF-IDF params

n_features=100000
ngram_range=(1,3)
stop_words = set(stopwords.words('english'))
stop_words = stop_words.union(['rt'])


tvec = TfidfVectorizer()
tvec.set_params(stop_words=None, max_features=n_features, ngram_range=ngram_range)

TfidfVectorizer(max_features=100000, ngram_range=(1, 3))

In [14]:
target_class = ['-1', '0', '1']

## Multinomial NB

In [15]:
multinomialNBClassifier = MultinomialNB();

checker_pipeline = Pipeline([
            ('vectorizer', tvec),
            ('classifier', multinomialNBClassifier)
        ])

sentiment_fit = checker_pipeline.fit(x_train, y_train)


In [16]:
y_pred = sentiment_fit.predict(x_validation)
print(accuracy_score(y_validation, y_pred))
print(classification_report(y_validation, y_pred, target_names=target_class))

0.6028368794326241
              precision    recall  f1-score   support

          -1       0.58      1.00      0.73       154
           0       1.00      0.12      0.22        72
           1       1.00      0.12      0.22        56

    accuracy                           0.60       282
   macro avg       0.86      0.42      0.39       282
weighted avg       0.77      0.60      0.50       282



In [17]:
y_pred_test = sentiment_fit.predict(x_test)
print(accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test, target_names=target_class))

0.4734982332155477
              precision    recall  f1-score   support

          -1       0.45      0.99      0.62       121
           0       0.91      0.10      0.18       101
           1       0.80      0.07      0.12        61

    accuracy                           0.47       283
   macro avg       0.72      0.39      0.31       283
weighted avg       0.69      0.47      0.35       283



## Linear SVC feature selection with model

In [18]:
from sklearn.feature_selection import SelectFromModel

clf = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
  ('classification', LinearSVC(penalty="l2"))])

checker_pipeline = Pipeline([
            ('vectorizer', tvec),
            ('classifier', clf)
        ])

sentiment_fit = checker_pipeline.fit(x_train, y_train)

In [19]:
y_pred = sentiment_fit.predict(x_validation)
print(accuracy_score(y_validation, y_pred))
print(classification_report(y_validation, y_pred, target_names=target_class))

0.6170212765957447
              precision    recall  f1-score   support

          -1       0.69      0.75      0.72       154
           0       0.47      0.43      0.45        72
           1       0.55      0.48      0.51        56

    accuracy                           0.62       282
   macro avg       0.57      0.56      0.56       282
weighted avg       0.61      0.62      0.61       282



In [20]:
y_pred_test = sentiment_fit.predict(x_test)
print(accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test, target_names=target_class))

0.6325088339222615
              precision    recall  f1-score   support

          -1       0.64      0.81      0.72       121
           0       0.62      0.45      0.52       101
           1       0.61      0.59      0.60        61

    accuracy                           0.63       283
   macro avg       0.63      0.62      0.61       283
weighted avg       0.63      0.63      0.62       283



In [21]:
data_romney_test = pd.read_excel('test.xlsx', 'Romney')

In [22]:
data_romney_test['Class'] = data_romney_test['Class'].apply(str)
# print(data_obama[data_obama['Class']=='irrevelant'].count())
# print(data_obama[data_obama['Class']=='irrelevant'].count())
# print(data_obama[data_obama['Class']=='2'].count())
# print(data_obama[data_obama['Class']=='1'].count())
# print(data_obama[data_obama['Class']=='0'].count())
# print(data_obama[data_obama['Class']=='-1'].count())

data_romney_test = data_romney_test[(data_romney_test['Class'] == '1') | (data_romney_test['Class'] == '0') | (data_romney_test['Class'] == '-1')]



In [23]:
data_romney_test.count()

Anootated tweet    1900
Class              1900
dtype: int64

In [24]:
data_romney_test['Class'] = data_romney_test['Class'].apply(int)
data_romney_test = data_romney_test.dropna()

In [25]:
data_romney_test.count()

Anootated tweet    1900
Class              1900
dtype: int64

In [26]:
data_romney_test['Anootated tweet'] = data_romney_test.apply(preprocess_tweet, axis=1)

In [27]:
y_pred_test = sentiment_fit.predict(data_romney_test['Anootated tweet'])
print(accuracy_score(data_romney_test['Class'], y_pred_test))
print(classification_report(data_romney_test['Class'], y_pred_test, target_names=target_class))

0.6157894736842106
              precision    recall  f1-score   support

          -1       0.65      0.79      0.72       960
           0       0.52      0.39      0.45       555
           1       0.61      0.50      0.55       385

    accuracy                           0.62      1900
   macro avg       0.59      0.56      0.57      1900
weighted avg       0.60      0.62      0.60      1900



## Simple LinearSVC

In [28]:

clf = LinearSVC()

checker_pipeline = Pipeline([
            ('vectorizer', tvec),
            ('classifier', clf)
        ])
sentiment_fit = checker_pipeline.fit(x_train, y_train)


In [29]:
y_pred = sentiment_fit.predict(x_validation)
print(accuracy_score(y_validation, y_pred))
print(classification_report(y_validation, y_pred, target_names=target_class))

0.6276595744680851
              precision    recall  f1-score   support

          -1       0.69      0.79      0.74       154
           0       0.47      0.39      0.43        72
           1       0.59      0.48      0.53        56

    accuracy                           0.63       282
   macro avg       0.58      0.55      0.56       282
weighted avg       0.61      0.63      0.62       282



In [30]:
y_pred_test = sentiment_fit.predict(x_test)
print(accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test, target_names=target_class))

0.6219081272084805
              precision    recall  f1-score   support

          -1       0.62      0.83      0.71       121
           0       0.65      0.45      0.53       101
           1       0.60      0.51      0.55        61

    accuracy                           0.62       283
   macro avg       0.62      0.59      0.59       283
weighted avg       0.63      0.62      0.61       283



## Ridge Classifier

In [31]:
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier()

checker_pipeline = Pipeline([
            ('vectorizer', tvec),
            ('classifier', clf)
        ])

sentiment_fit = checker_pipeline.fit(x_train, y_train)

In [32]:
y_pred = sentiment_fit.predict(x_validation)
print(accuracy_score(y_validation, y_pred))
print(classification_report(y_validation, y_pred, target_names=target_class))

0.624113475177305
              precision    recall  f1-score   support

          -1       0.66      0.82      0.74       154
           0       0.46      0.35      0.40        72
           1       0.65      0.43      0.52        56

    accuracy                           0.62       282
   macro avg       0.59      0.53      0.55       282
weighted avg       0.61      0.62      0.61       282



In [33]:
y_pred_test = sentiment_fit.predict(x_test)
print(accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test, target_names=target_class))

0.6148409893992933
              precision    recall  f1-score   support

          -1       0.60      0.84      0.70       121
           0       0.64      0.41      0.50       101
           1       0.65      0.51      0.57        61

    accuracy                           0.61       283
   macro avg       0.63      0.59      0.59       283
weighted avg       0.62      0.61      0.60       283

