In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import *
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#assign column names to dataset
names = ['sentiment', 'id', 'date', 'flag', 'user', 'text']
print(type(names))

<class 'list'>


In [3]:
#access dataset 

#twitter_df = pd.read_csv(r"C:/Users/user/Documents/IRWA_Assignment3/twitter.csv", encoding = "latin-1")
twitter_df = pd.read_csv(r"C:/Users/user/Documents/IRWA_Assignment3/twitter_edit_1.csv", encoding = "latin-1")
twitter_df = pd.DataFrame(twitter_df.values, columns = names )
twitter_df.head()

Unnamed: 0,sentiment,id,date,flag,user,text
0,0,1468487862,Tue Apr 07 01:58:48 PDT 2009,NO_QUERY,joannladybird,I don't want to be a grown up yet...
1,0,1468487909,Tue Apr 07 01:58:50 PDT 2009,NO_QUERY,sambatia,as usual ... cooking what I hate
2,0,1468488036,Tue Apr 07 01:58:53 PDT 2009,NO_QUERY,sophiehe,"incredibly, immensely indecisive"
3,0,1468488056,Tue Apr 07 01:58:54 PDT 2009,NO_QUERY,katiecometrue,so my wish didn't come true go to hell caroli...
4,0,1468488504,Tue Apr 07 01:59:04 PDT 2009,NO_QUERY,TheMightyFoz,yay! 2am and not a bit tired.


In [4]:
#check the null values count in the twitter_df, now you can see that
twitter_df.isnull().any()

sentiment    False
id           False
date         False
flag         False
user         False
text         False
dtype: bool

In [5]:
#drop the flag column
#twitter_df.drop(['flag'], axis=1, inplace=True)
#twitter_df.head()

#drop some columns
twitter_df = twitter_df.drop(['id', 'date', 'flag', 'user'], axis = 1)
twitter_df.head()

Unnamed: 0,sentiment,text
0,0,I don't want to be a grown up yet...
1,0,as usual ... cooking what I hate
2,0,"incredibly, immensely indecisive"
3,0,so my wish didn't come true go to hell caroli...
4,0,yay! 2am and not a bit tired.


In [6]:
#observe the content of the sentiment column
pd.unique(twitter_df['sentiment'].values) 

array([0, 4], dtype=object)

In [7]:
#Extract the tweet from the dataset to process further
tweetText = twitter_df['text']

tweetText.head()

0                I don't want to be a grown up yet... 
1                    as usual ... cooking what I hate 
2                    incredibly, immensely indecisive 
3    so my wish didn't come true  go to hell caroli...
4                       yay! 2am and not a bit tired. 
Name: text, dtype: object

In [8]:



#remove links and website addresses 
tweetText = tweetText.str.replace(r'http\S+', ' ', regex=True).replace(r'www\S+', ' ', regex=True)

#remove all usernames
tweetText = tweetText.str.replace(r'@[^\s]+',' ')

#remove html tags
tweetText =tweetText.str.replace(r'<[^>]+>',' ')

#remove special characters
spec_chars = ["!",'"',"#","%","&","'","(",")",
               "*","+",",","-",".","/",":",";","<",
               "=",">","?","@","[","\\","]","^","_",
               "`","{","|","}","~","–"]
    
for char in spec_chars:
     tweetText = tweetText.str.replace(char, ' ')
        
#remove all numbers
tweetText = tweetText.str.replace(r'[0-9]',' ')

#remove single characters in the tweet
tweetText = tweetText.str.replace(r'[ ][a-z][ ]',' ')
        
#remove multiple spaces
tweetText = tweetText.str.replace(r'[ ]{2,}', ' ')

#turn the all the letters in the tweet in to lowercase
tweetText = tweetText.str.lower()


tweetText.head()



0                    i don want to be grown up yet 
1                     as usual cooking what i hate 
2                  incredibly immensely indecisive 
3    so my wish didn come true go to hell carolina 
4                         yay am and not bit tired 
Name: text, dtype: object

In [9]:
#tokenize the words


tweetText = tweetText.apply(word_tokenize)
tweetText.head()

0               [i, don, want, to, be, grown, up, yet]
1                  [as, usual, cooking, what, i, hate]
2                  [incredibly, immensely, indecisive]
3    [so, my, wish, didn, come, true, go, to, hell,...
4                      [yay, am, and, not, bit, tired]
Name: text, dtype: object

In [10]:

#remove stop words from the tweet

stop_words = set(stopwords.words('english'))
#tweetText = tweetText.apply([word for word in tweetText if not word in stopwords.words()])
#tweetText.head()
#lemmatize the words
#tweetText = tweetText.apply([])

tweetText = tweetText.apply(lambda x:[word for word in x if word not in stop_words])
tweetText.head()

0                        [want, grown, yet]
1                    [usual, cooking, hate]
2       [incredibly, immensely, indecisive]
3    [wish, come, true, go, hell, carolina]
4                         [yay, bit, tired]
Name: text, dtype: object

In [11]:
 

lemmatizer= WordNetLemmatizer()
tweetText = tweetText.apply(lambda x:[lemmatizer.lemmatize(word) for word in x])
tweetText.head()



0                        [want, grown, yet]
1                    [usual, cooking, hate]
2       [incredibly, immensely, indecisive]
3    [wish, come, true, go, hell, carolina]
4                         [yay, bit, tired]
Name: text, dtype: object

In [12]:


#stemming the words
stemmer=PorterStemmer()
tweetText = tweetText.apply(lambda x:[stemmer.stem(word) for word in x])

tweetText.head()

0                        [want, grown, yet]
1                       [usual, cook, hate]
2                 [incred, immens, indecis]
3    [wish, come, true, go, hell, carolina]
4                          [yay, bit, tire]
Name: text, dtype: object

In [13]:
#Remove lists
tweet = tweetText.str.join(' ')
tweet.head()

0                     want grown yet
1                    usual cook hate
2              incred immens indecis
3    wish come true go hell carolina
4                       yay bit tire
Name: text, dtype: object

In [14]:
# Initialize empty array 
# to append clean text  
#tweet_corpus = []  
  
#for text in tweet:  
     # append each string to create array of clean text  
#    tweet_corpus.append(text) 

In [15]:


#create the object
countvec = CountVectorizer()

#tweetVector = countvec.fit_transform(word for word in tweet)
tweetVector = countvec.fit_transform(word for word in tweet)

#print the first 5 rows including the 5th index
print(tweetVector[0:5,:])

print(" ")

#to get a relevant word
print(countvec.get_feature_names()[3102])



  (0, 3102)	1
  (0, 1217)	1
  (0, 3247)	1
  (1, 3041)	1
  (1, 633)	1
  (1, 1277)	1
  (2, 1443)	1
  (2, 1432)	1
  (2, 1445)	1
  (3, 3165)	1
  (3, 595)	1
  (3, 2961)	1
  (3, 1163)	1
  (3, 1309)	1
  (3, 474)	1
  (4, 3231)	1
  (4, 304)	1
  (4, 2898)	1
 
want


In [16]:


#create the object
tfidf_transformer = TfidfTransformer()

#input the vectorized tweets and convert into tf-idf weights
tfidf = tfidf_transformer.fit_transform(tweetVector)

#print the tf-idf values 
print(tfidf)

print(" ")

#get tf-idf values for a word
print(tfidf_transformer.idf_[countvec.vocabulary_['want']])


  (0, 3247)	0.5534387945242979
  (0, 3102)	0.43348708372026706
  (0, 1217)	0.7111922728511538
  (1, 3041)	0.6382620386584756
  (1, 1277)	0.5092736802526749
  (1, 633)	0.5772883929280259
  (2, 1445)	0.5875290351720356
  (2, 1443)	0.5564344217063087
  (2, 1432)	0.5875290351720356
  (3, 3165)	0.3538908713934089
  (3, 2961)	0.44833079178894164
  (3, 1309)	0.4718701519948293
  (3, 1163)	0.24612242351088842
  (3, 595)	0.3213343868792073
  (3, 474)	0.5359732049167961
  (4, 3231)	0.5703547363066376
  (4, 2898)	0.5294735827620132
  (4, 304)	0.6279754771722724
  (5, 2861)	0.21207519995361998
  (5, 2375)	0.34552604960264
  (5, 2259)	0.37989718492267777
  (5, 2251)	0.3344610154899445
  (5, 2157)	0.5820981780263352
  (5, 1974)	0.2408867673586433
  (5, 1958)	0.23796903957252846
  :	:
  (1557, 1690)	0.2877003669562403
  (1557, 1427)	0.38058778696267753
  (1557, 1173)	0.40909454380333765
  (1557, 1112)	0.46466959783176154
  (1557, 1023)	0.3886867604668838
  (1558, 1987)	0.7071067811865475
  (1558, 118

In [17]:
#names = tfidf.get_feature_names()
#print(names)

In [18]:
x=tfidf
print(x)

  (0, 3247)	0.5534387945242979
  (0, 3102)	0.43348708372026706
  (0, 1217)	0.7111922728511538
  (1, 3041)	0.6382620386584756
  (1, 1277)	0.5092736802526749
  (1, 633)	0.5772883929280259
  (2, 1445)	0.5875290351720356
  (2, 1443)	0.5564344217063087
  (2, 1432)	0.5875290351720356
  (3, 3165)	0.3538908713934089
  (3, 2961)	0.44833079178894164
  (3, 1309)	0.4718701519948293
  (3, 1163)	0.24612242351088842
  (3, 595)	0.3213343868792073
  (3, 474)	0.5359732049167961
  (4, 3231)	0.5703547363066376
  (4, 2898)	0.5294735827620132
  (4, 304)	0.6279754771722724
  (5, 2861)	0.21207519995361998
  (5, 2375)	0.34552604960264
  (5, 2259)	0.37989718492267777
  (5, 2251)	0.3344610154899445
  (5, 2157)	0.5820981780263352
  (5, 1974)	0.2408867673586433
  (5, 1958)	0.23796903957252846
  :	:
  (1557, 1690)	0.2877003669562403
  (1557, 1427)	0.38058778696267753
  (1557, 1173)	0.40909454380333765
  (1557, 1112)	0.46466959783176154
  (1557, 1023)	0.3886867604668838
  (1558, 1987)	0.7071067811865475
  (1558, 118

In [19]:
#twitter_df['sentiment'] =twitter_df['sentiment'].replace({0:"Negative"})
#twitter_df['sentiment'] =twitter_df['sentiment'].replace({4:"Positive"})

#y = twitter_df.iloc[:, 0].values 

In [20]:
#dealing with catergorical data
def getText(number):
    if number==0:
        return "Negative"
    else:
        return "Positive"
twitter_df["sentiment"]=twitter_df["sentiment"].apply(getText)

In [21]:
pd.unique(twitter_df['sentiment'].values)

array(['Negative', 'Positive'], dtype=object)

In [22]:

y = twitter_df.iloc[:, 0].values

print(y)

['Negative' 'Negative' 'Negative' ... 'Positive' 'Positive' 'Positive']


In [23]:


#80% to the trainig test and 20% to the testing test
#x_train, x_test, y_train, y_test = train_test_split(tfidf,twitter_df['sentiment'], test_size=0.20)

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.20)

#Exact y
expected_output = y_test

print(expected_output)

['Positive' 'Positive' 'Positive' 'Positive' 'Negative' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Negative' 'Positive' 'Positive' 'Negative' 'Negative' 'Positive'
 'Negative' 'Positive' 'Negative' 'Negative' 'Positive' 'Negative'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Negative' 'Negative' 'Positive' 'Positive' 'Negative' 'Negative'
 'Negative' 'Negative' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Negative' 'Negative' 'Positive' 'Positive'
 'Negative' 'Positive' 'Positive' 'Negative' 'Positive' 'Positive'
 'Positive' 'Negative' 'Negative' 'Negative' 'Positive' 'Negative'
 'Positive' 'Negative' 'Positive' 'Negative' 'Negative' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Negative' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Negative' 'Positive' 'Negative' 'Positive'
 'Positive' 'Negative' 'Negative' 'Positive' 'Negative' 'Negat

In [24]:
#Import Gaussian Naive Bayes model


#Create a Gaussian Classifier
#gnb = GaussianNB()

#Train the model using the training sets
#gnb.fit(x_train, y_train)

#Predict the response for test dataset
#y_pred = gnb.predict(x_test)

In [25]:
#Import scikit-learn metrics module for accuracy calculation


# Model Accuracy, how often is the classifier correct?
#print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [26]:

#Creating Naive Bayes Model
NV_model = MultinomialNB()

#Train the model using the training sets
NV_model.fit(x_train, y_train)

#Predict the response for test dataset
test_prediction = NV_model.predict(x_test)

print('Accuracy of the Naive Bayes model: ')
print(NV_model.score(x_test,y_test))


print('\nClassification Report')
print(classification_report(test_prediction, expected_output))

print('\nConfusion Matrix')
print(confusion_matrix(test_prediction, expected_output))

Accuracy of the Naive Bayes model: 
0.7092651757188498

Classification Report
              precision    recall  f1-score   support

    Negative       0.21      0.82      0.34        28
    Positive       0.98      0.70      0.81       285

    accuracy                           0.71       313
   macro avg       0.59      0.76      0.57       313
weighted avg       0.91      0.71      0.77       313


Confusion Matrix
[[ 23   5]
 [ 86 199]]


In [27]:
from sklearn.neighbors import KNeighborsClassifier
#Building K-NN model 

#Creating K-NN model and set initial neighbors to 5
#After running several times with random k value we found that the 13 is most accurate
model = KNeighborsClassifier(n_neighbors=13)

#fit the data into model train
model.fit(x_train, y_train)

knn_predicted_output = model.predict(x_test)

print('Accuracy of the model: ',model.score(x_test,y_test))
print('\nConfusion Matrix: ',confusion_matrix(knn_predicted_output,y_test))
print('\nReport: ', classification_report(knn_predicted_output,y_test))

Accuracy of the model:  0.6869009584664537

Confusion Matrix:  [[ 25  14]
 [ 84 190]]

Report:                precision    recall  f1-score   support

    Negative       0.23      0.64      0.34        39
    Positive       0.93      0.69      0.79       274

    accuracy                           0.69       313
   macro avg       0.58      0.67      0.57       313
weighted avg       0.84      0.69      0.74       313



In [28]:
#17.	Build SVM model using Train Data

#from sklearn.svm import SVC








In [29]:
#svclassifier = SVC(kernel='linear')
svmodel = SVC(kernel = 'rbf', gamma = 'scale', C = 1)

svmodel.fit(x_train, y_train)

#Predict the model using test dataset 
svm_predicted_output = svmodel.predict(x_test)



In [30]:
print('Accuracy of the model: ',svmodel.score(x_test, expected_output))
print('\nConfusion Matrix: ',confusion_matrix(svm_predicted_output, expected_output))
print('\nReport: ', classification_report(svm_predicted_output, expected_output))

Accuracy of the model:  0.7156549520766773

Confusion Matrix:  [[ 26   6]
 [ 83 198]]

Report:                precision    recall  f1-score   support

    Negative       0.24      0.81      0.37        32
    Positive       0.97      0.70      0.82       281

    accuracy                           0.72       313
   macro avg       0.60      0.76      0.59       313
weighted avg       0.90      0.72      0.77       313

