In [62]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

In [63]:
# Load the pre-cleaned tweets and give the three columns readable names.
data = pd.read_csv('cleaned_data.csv')
data.columns = ['Number', 'Sentiment', 'Tweet']

# 80/20 split: draw the training rows with a fixed seed (50) so the split is
# repeatable; every row that was not drawn becomes the test set.
train = data.sample(frac=0.8, random_state=50)
in_train = data.index.isin(train.index)
test = data[~in_train]

In [64]:
# Separate the tweet text (model input) from the sentiment labels (target)
# for both partitions.
Train_text, Train_values = train['Tweet'], train['Sentiment']
Test_text, Test_values = test['Tweet'], test['Sentiment']

# Show the training tweet whose index label is 4 (a label lookup, not the
# fifth row of the shuffled sample).
print(Train_text.loc[4])

noth special , headach .. plus , have a sore throat ca n't breath my nose .. ll fail i n't studi


In [65]:
# Binary bag-of-n-grams (unigrams through trigrams).
cv = CountVectorizer(binary=True, ngram_range=(1, 3))

# Cast to unicode once per partition so non-string cells (e.g. missing
# values) are turned into text before tokenisation.
train_docs = Train_text.values.astype('U')
test_docs = Test_text.values.astype('U')

# Learn the vocabulary from the training tweets only, then encode both sets
# with that same vocabulary.
Train_text_vect = cv.fit_transform(train_docs)
Test_text_vect = cv.transform(test_docs)

In [66]:
# Fit a linear SVM (C=0.01) on the vectorised training tweets.
svm = LinearSVC(C=0.01)
svm.fit(Train_text_vect, Train_values)

# Accuracy on the held-out tweets.
test_predictions = svm.predict(Test_text_vect)
score = accuracy_score(Test_values, test_predictions)

print(score)

0.7746875


In [67]:
# Pair every vocabulary n-gram with the weight the SVM learned for it.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old accessor on older installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Store plain str/float so the printed tuples look the same regardless of how
# the installed NumPy renders its scalar types.
feature_to_coef = {
    str(word): float(coef) for word, coef in zip(feature_names, svm.coef_[0])
}

# Five n-grams with the largest weights.
for best_positive in sorted(
        feature_to_coef.items(),
        key=lambda x: x[1],
        reverse=True)[:5]:
    print(best_positive)

# Five n-grams with the most negative weights.
for best_negative in sorted(
        feature_to_coef.items(),
        key=lambda x: x[1])[:5]:
    print(best_negative)

('ca wait', 0.7344797616525091)
('cant wait', 0.6007305896939183)
('thank', 0.5802208534260184)
('welcom', 0.5375725516292578)
('smile', 0.47907670988746487)
('sad', -0.9327505278332024)
('miss', -0.7610039803119514)
('poor', -0.715917073678386)
('sick', -0.6585724304962519)
('hurt', -0.6398541809019486)


In [86]:
#######################################################################
###################### USING NLTK LIBRARY #############################
#######################################################################
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import pandas as pd

def processText(text):
    """Normalise one piece of text into a list of stemmed tokens.

    Steps, in order:
      1. split on whitespace and lower-case every piece;
      2. drop pieces that ``deepClean`` flags (usernames, e-mail addresses,
         websites, hashtags);
      3. tokenise the remaining text with NLTK's ``word_tokenize``;
      4. drop English stop words;
      5. stem each remaining token with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order. Empty if nothing is left.

    Notes
    -----
    Filtering builds new lists instead of calling ``list.remove`` while
    iterating; removing during iteration skips the element that follows each
    removal, which let consecutive usernames/links and consecutive stop words
    through.
    NOTE(review): if the training data was produced with the earlier version
    of this function, regenerate it so training and inference share the same
    preprocessing.
    """
    # lower-case each whitespace-separated piece and keep only the ones that
    # are not usernames, email addresses, websites or hashtags
    lowered = (piece.lower() for piece in text.split())
    cleaned_text = ' '.join(piece for piece in lowered if not deepClean(piece))

    # tokenize by words
    words = word_tokenize(cleaned_text)

    # load the stop-word list once (as a set for fast membership tests)
    # instead of re-reading it for every token
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]

    # stem every surviving token and return the result
    stemmer = SnowballStemmer("english")
    return [stemmer.stem(w) for w in words]


def deepClean(word):
    """Report whether ``word`` looks like a username, e-mail, hashtag or website.

    The check is a case-insensitive substring search against a fixed list of
    markers, so it is a heuristic: any word that merely contains one of the
    markers (for example ".com") is flagged as carrying no useful sentiment
    information.

    Parameters
    ----------
    word : str
        A single word.

    Returns
    -------
    bool
        True if the word contains any marker, otherwise False.
    """
    # '@'                            usernames and e-mail addresses
    # '#'                            social-media hashtags
    # 'www.', 'http://', 'https://'  websites
    # '.edu', '.com'                 websites and e-mail domains
    cleaners = ("@", "#", "www.", "http://", "https://", ".edu", ".com")

    # Compare in lower case so "Www.", "HTTPS://" and similar spellings all
    # match without listing every capitalisation.
    lowered = word.lower()
    return any(marker in lowered for marker in cleaners)

In [88]:
# Run one raw input through the same preprocessing used for the tweets.
inputp = 'HELLo'
inputpp = processText(inputp)
print(inputpp)

# Re-assemble the stems into one string; every stem is followed by a space,
# so the result ends with a trailing space.
string_text = "".join(stem + " " for stem in inputpp)
print(string_text)

# Encode with the fitted vectorizer and classify with the trained SVM.
# Note: despite its name, pos_prob holds the predicted class label returned
# by predict(), not a probability.
input_text = [string_text]
input_text_vect = cv.transform(input_text)
pos_prob = svm.predict(input_text_vect)
print(pos_prob)

['hello']
hello 
[1]
