# Sentiment classification for social media - He Tianyou

In [1]:
# Import packages
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import re

Classifier part one

In [2]:
# Read training data
train_data = pd.read_csv('semeval-tweets/twitter-training-data.txt', sep='\t', names=['tweet_id','sentiment','tweet_text'])
# Read dev data
dev_data = pd.read_csv('semeval-tweets/twitter-dev-data.txt', sep='\t', names=['tweet_id','sentiment','tweet_text'])

Text preprocessing

In [3]:
# Lowercase
train_data['tweet_text'] = train_data['tweet_text'].str.lower()
dev_data['tweet_text'] = dev_data['tweet_text'].str.lower()
# Remove URLs. Note that URLs may appear in different forms
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r'(((https?:\/\/)|(w{3}.))[\S]*)|([\w\d\/\.]*\.(com|cn|co|net|org|edu|uk|int|js|html))', '')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r'(((https?:\/\/)|(w{3}.))[\S]*)|([\w\d\/\.]*\.(com|cn|co|net|org|edu|uk|int|js|html))', '')
# remove twitter handles
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r'\@[\S]*', '')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r'\@[\S]*', '')
# Remove numbers that are fully made of digits
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r'\b\d+\b','')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r'\b\d+\b','')
# Remove words with only 1 character. 
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r'\b\w\b','')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r'\b\w\b','')
train_data.head()

Unnamed: 0,tweet_id,sentiment,tweet_text
0,335104872099066692,positive,felt privileged to play foo fighters songs on ...
1,796528524030124618,positive,"pakistan may be an islamic country, but der a..."
2,760964834217238632,positive,happy birthday to the coolest golfer in bali! ...
3,147713180324524046,negative,tmills is going to tucson! but the 29th and i...
4,732302280474120023,negative,hmmmmm where are the #blacklivesmatter when ma...


In [4]:
# TFIDF Vectorizer
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [5]:
# sentiment becomes dependent variable
train_y = train_data.sentiment
dev_y = dev_data.sentiment

In [6]:
train_X = vectorizer.fit_transform(train_data.tweet_text)
dev_X = vectorizer.transform(dev_data.tweet_text)

In [7]:
print(train_y.shape)
print(train_X.shape)
print(dev_y.shape)
print(dev_X.shape)

(45026,)
(45026, 41885)
(2000,)
(2000, 41885)


In [8]:
# train MaxEnt classifier (Logisitic regression)
clf = linear_model.LogisticRegression()
clf.fit(train_X, train_y)

# test model accuracy
pred_y_mxe = clf.predict(dev_X)
acc_score_mxe = accuracy_score(dev_y, pred_y_mxe)
conf_mat_mxe = confusion_matrix(dev_y, pred_y_mxe, labels = ['positive', 'negative', 'neutral'])

print(acc_score_mxe)
print(conf_mat_mxe)



0.655
[[432  12 259]
 [ 38 152 188]
 [138  55 726]]


In [9]:
# train naive bayes classifier
clf2 = naive_bayes.MultinomialNB()
clf2.fit(train_X, train_y)

# test model accuracy
pred_y_nb = clf2.predict(dev_X)
acc_score_nb = accuracy_score(dev_y, pred_y_nb)
conf_mat_nb = confusion_matrix(dev_y, pred_y_nb, labels = ['positive', 'negative', 'neutral'])

print(acc_score_nb)
print(conf_mat_nb)

0.5975
[[390   1 312]
 [ 43  41 294]
 [145  10 764]]


In [None]:
# train svm
clf3 = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)
clf3.fit(train_X, train_y)

# test model accuracy
pred_y_svm = clf3.predict(dev_X)
acc_score_svm = accuracy_score(dev_y, pred_y_svm)
conf_mat_svm = confusion_matrix(dev_y, pred_y_svm, labels = ['positive', 'negative', 'neutral'])

print(acc_score_svm)
print(conf_mat_svm)