##### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2022 Semester 1

## Assignment 2: Sentiment Classification of Tweets

This is a sample code to assist you with vectorising the 'Train' dataset for your assignment 2.

First we read the CSV datafiles (Train and Test).

In [2]:
# import libraries
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import scipy
import time

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report


In [3]:
# read data from csv file
train_data = pd.read_csv("Train.csv", sep=',')
test_data = pd.read_csv("Test.csv", sep=',')

x = train_data.text
y = train_data.sentiment
print(y.value_counts())


neutral     12659
positive     5428
negative     3715
Name: sentiment, dtype: int64


### Text Preprocessing

In [4]:
#separating instance and label for Train
X_train_raw = [x[0] for x in train_data[['text']].values]
Y_train = [x[0] for x in train_data[['sentiment']].values]

print(X_train_raw[3])

print(len(Y_train))
print(len(X_train_raw))

# data cleaning
#X_train_raw = train_data['text'].apply(lambda x: re.sub(r'http\S+', '', x))
#X_train_raw = X_train_raw.apply(lambda x: re.sub(r'@[A-Za-z0-9]+', '', x))
#X_train_raw = X_train_raw.apply(lambda x: re.sub(r'RT[\s]]+', '', x))

#X_train_raw = X_train_raw.apply(lambda x: " ".join([re.sub('[^A-Za-z]+','', x) for x in nltk.word_tokenize(x)]))


#check the result
print("Train length:",len(X_train_raw))
print(X_train_raw[3])

#separating instance and label for Test
X_test_raw = [x[0] for x in test_data[['text']]]

#check the result
print("Test length:",len(X_test_raw))

 "prince george reservist who died saturday just wanted to help people, his father tells @cbcnews http://t.co/riauzrjgre"	
21802
21802
Train length: 21802
 "prince george reservist who died saturday just wanted to help people, his father tells @cbcnews http://t.co/riauzrjgre"	
Test length: 1


In [5]:
#more info on training dataset

print(Y_train[:30])
def convert_class(raw):
    if raw == 'negative': return 0
    elif raw == 'neutral': return 1
    elif raw == 'positive': return 2


Y_train_num = []
for y in Y_train:
    Y_train_num.append(convert_class(y))


['neutral', 'positive', 'neutral', 'neutral', 'positive', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'negative', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'positive']


### Training test splitting

In [6]:
# hold out
X_train, X_test, y_train, y_test = train_test_split(X_train_raw, Y_train, test_size=0.3, random_state=30)

In [7]:
# cross fold validation to be added

In [8]:
# Tfidf Vectorizer
vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

model_lr = LogisticRegression(random_state=0, multi_class='ovr', solver = 'liblinear')
model_lr.fit(tf_x_train,y_train)
print(model_lr.score(tf_x_test, y_test))

mnb = MultinomialNB().fit(tf_x_train,y_train)
print(mnb.score(tf_x_test, y_test))


#Build the feature set (vocabulary) and vectorise the Tarin dataset using TFIDF
#X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_raw)

#Use the feature set (vocabulary) from Train to vectorise the Test dataset 
#X_test_tfidf = tfidf_vectorizer.transform(X_test_raw)

0.6697752637211436
0.5966977526372115


In [9]:
# BoW + Kbest
BoW_vectorizer = CountVectorizer()

X_train_BoW = BoW_vectorizer.fit_transform(X_train)
X_test_BoW = BoW_vectorizer.transform(X_test)


kbest = SelectKBest(chi2, k=1000).fit(X_train_BoW, y_train)
kbest_train_X = kbest.transform(X_train_BoW)
kbest_test_X = kbest.transform(X_test_BoW)

#Build the feature set (vocabulary) and vectorise the Tarin dataset using BoW
#X_train_BoW = BoW_vectorizer.fit_transform(X_train_raw)

#Use the feature set (vocabulary) from Train to vectorise the Test dataset 
#X_test_BoW = BoW_vectorizer.transform(X_test_raw)


clf = LinearSVC(random_state=0)
clf.fit(kbest_train_X,y_train)

print(clf.score(kbest_test_X, y_test))

model_lr.fit(kbest_train_X,y_train)
print(model_lr.score(kbest_test_X, y_test))



0.6705396728329002
0.6777251184834123


We can save the created vocabulary for the given dataset in a separate file.

In [10]:
vocab_dict = BoW_vectorizer.vocabulary_
output_pd = pd.DataFrame(list(vocab_dict.items()),columns = ['word','count'])
print(output_pd.shape)

vocab_dict
output_pd.T.to_csv('BoW-vocab.csv',index=False)

(34424, 2)


### Feature selecting

In [11]:
# adapted from workshop 9

from sklearn.svm import SVC
from sklearn.feature_selection import mutual_info_classif
mi = SelectKBest(score_func=mutual_info_classif, k=10)
X_train_mi = mi.fit_transform(X_train_BoW,y_train)
X_test_mi = mi.transform(X_test_BoW)

models = [#GaussianNB(),
          MultinomialNB(),
          DecisionTreeClassifier(max_depth=1),
          KNeighborsClassifier(n_neighbors=5),
          DecisionTreeClassifier(max_depth=None),
          LogisticRegression(max_iter = 1000),
          SVC(kernel='rbf', gamma=0.7),
          SVC(kernel='poly', degree=3)]
titles = [#'GNB',
          'MNB',
          'one-r',
          '5-nearest neighbour',
          'Decision Tree',
          'Logistic Regression',
          'SVM with a cubic kernel',
          'SVM with an RBF kernel']

k = 1000

for k in [100,1000,10000]: #10,
    print('\n--------------------------------------- K = ', k,'------------------------------')
    x2 = SelectKBest(chi2, k=k)
    x2.fit(X_train_BoW,y_train)
    X_train_x2 = x2.transform(X_train_BoW)
    X_test_x2 = x2.transform(X_test_BoW)


    mi = SelectKBest(score_func=mutual_info_classif, k=k)
    mi.fit(X_train_BoW,y_train)
    X_train_mi = mi.transform(X_train_BoW)
    X_test_mi = mi.transform(X_test_BoW)


    Xs = [(X_train_BoW, X_test_BoW), (X_train_x2, X_test_x2), (X_train_mi, X_test_mi)]
    X_names = ['complete', 'x2', 'mi']
    for title, model in zip(titles, models):
        print('\n=========',title, '(with k=',k,'features): ')
        
        for X_name, X in zip(X_names, Xs):
            X_train_t, X_test_t = X
            
            model.fit(X_train_t.todense(), y_train)
            y_test_predict = model.predict(X_test_t.todense())
            accuracy =  accuracy_score(y_test, y_test_predict)
            print(X_name, 'accuracy is:',  accuracy)


KeyboardInterrupt: 

--------------------------------------- K =  10 ------------------------------

========= GNB (with k= 10 features):
complete accuracy is: 0.4204250114661367
x2 accuracy is: 0.26968353462773276
mi accuracy is: 0.2761045711664883

========= MNB (with k= 10 features): 
complete accuracy is: 0.6584620088671457
x2 accuracy is: 0.592722825256077
mi accuracy is: 0.5982265708607246

========= one-r (with k= 10 features): 
complete accuracy is: 0.5965448708148601
x2 accuracy is: 0.5965448708148601
mi accuracy is: 0.5965448708148601

========= 5-nearest neighbour (with k= 10 features): 
complete accuracy is: 0.5863017887173215
x2 accuracy is: 0.5852316159608623
mi accuracy is: 0.5849258523161596

========= Decision Tree (with k= 10 features): 
complete accuracy is: 0.5685674973245681
x2 accuracy is: 0.6116801712276411
mi accuracy is: 0.6112215257605871

========= Logistic Regression (with k= 10 features): 
complete accuracy is: 0.6606023543800642
x2 accuracy is: 0.6144320440299649
mi accuracy is: 0.6141262803852622

========= SVM with a cubic kernel (with k= 10 features): 

### Model Adapting

In [None]:
svc = LinearSVC().fit(tf_x_train, y_train)

#cvscvore(svc, tf_x_train, y_train, cv=5). mean0, sve.score (tf_x_test, y_test)
print(len(y_train))
svc_pred = svc.predict(tf_x_test)
print(classification_report(y_test, svc_pred))
matrix_svc = confusion_matrix(y_test, svc_pred)

import seaborn as sns
sns.heatmap(matrix_svc, cmap = 'Blues', annot = True)
plt.xlabel ("Predicted classes")
plt.ylabel ("Actual classes")

In [None]:
clf = LinearSVC(random_state=0)
clf.fit(kbest_train_X,y_train)

print(clf.score(kbest_test_X, y_test))

model_lr.fit(kbest_train_X,y_train)
print(model_lr.score(kbest_test_X, y_test))


# store best combination as x_train, x_test, y_train, y_test

### Ensemble models

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier


bagging = BaggingClassifier().fit(kbest_train_X, y_train)
print(bagging.score(kbest_test_X, y_test))


#voting
base_classifiers = [('dt', DecisionTreeClassifier()),
                    (' 1r', LogisticRegression (C= 0.5)),
                    ('mnb' , MultinomialNB()),
                    (' svc', LinearSVC (C=1))]
voting = VotingClassifier (estimators=base_classifiers)

voting.fit(kbest_train_X ,y_train)
print(voting.score(kbest_test_X, y_test))


#random forest
rf = RandomForestClassifier(n_jobs=-1, n_estimators=100, verbose=1).fit(kbest_train_X, y_train)
print(rf.score(kbest_test_X, y_test))


#ada
ada = AdaBoostClassifier().fit(kbest_train_X,y_train)
print(ada.score(kbest_test_X, y_test))

### Prediction

In [None]:
# final prediction here