In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import os

In [3]:
#importing data

data = pd.read_csv('cleaned_data.csv')
data.columns = ['Number','Sentiment','Tweet']

train=data.sample(frac=0.8,random_state=50) #random state is a seed value
test=data.drop(train.index)

In [4]:
# assigning data to respective lists

Train_text = train['Tweet']
Train_values = train['Sentiment']
Test_text = test['Tweet']
Test_values = test['Sentiment']

print(Train_text[4])

noth special , just a headach .. plu , i have a sore throat and ca n't breath through my nose .. i 'll fail if i do n't studi


In [6]:
#CV with ngram range

cv = CountVectorizer(binary=True, ngram_range= (1,2))
cv.fit(Train_text.values.astype('U'))
Train_text_vect = cv.transform(Train_text.values.astype('U'))
Test_text_vect = cv.transform(Test_text.values.astype('U'))

# tfidf = TfidfVectorizer(ngram_range = (1,3),min_df = 2)
# tfidf.fit(Train_text.values.astype('U'))
# Train_text_vect = tfidf.transform(Train_text.values.astype('U'))
# Test_text_vect = tfidf.transform(Test_text.values.astype('U'))

In [7]:
svm = LinearSVC(C=0.01)
svm.fit(Train_text_vect, Train_values)
score = accuracy_score(Test_values, svm.predict(Test_text_vect))

print(score)

0.8068625


In [8]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), svm.coef_[0]
    )
}
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
    
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

('no problem', 0.8830362173731378)
('ca wait', 0.7997943353310095)
('no worri', 0.7252975605831302)
('cant wait', 0.6985937852584186)
('not bad', 0.6809375654726316)
('sad', -0.9193493174458705)
('poor', -0.8277723019133335)
('miss', -0.7764348137261518)
('headach', -0.7625755107776224)
('not happi', -0.7601244472698924)


In [9]:
#######################################################################
###################### USING NLTK LIBRARY #############################
#######################################################################
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import pandas as pd

def processText(text):

    # make a list of words split with spaces
    textList = text.split()

    #convert all words to lower case
    for i in range(len(textList)):
        textList[i] = textList[i].lower()

    #removes Stop words
    for t in textList:
        if (removeStopWords(t)):
            textList.remove(t)
    text = ' '.join(textList)

    #tokenize by words
    words = word_tokenize(text)

    #instance of a stemmer
    stemmer = SnowballStemmer("english")
    stemWords = []


    #iterate over all tokenized words to stem and store in list
    for t in words:
        stemWords.append(stemmer.stem(t))

        #return the list of tokenized stem words of the text passed to the method
    return stemWords


# This method takes a single string word checks it against
# a list of substrings to determine whether the word has 
# any useful information regarding the sentiment of the text.
# returns True if word is an email address, username, hashtag or 
# website else returns False.
def removeStopWords(word):

    # List of substrings to check if present in the word:
    # '@',".edu", ".com" may represent usernames and email addresses 
    # '#' represents hastags for social media 
    # 'www.', "https://", ".edu", ".com" may represent websites
    # increased the list by adding stop words from the nltk stop words function
    # The nltk stopwords had negations such as not, nor, won't which are not
    # considered stopwords in this method
    cleaners = ["@", "#", "www.", "Www.", "https://", "Https://" ,  ".edu", ".com", 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such','only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just','ll', 'm', 'o', 're', 've', 'y', 'ma']

    #check each subtring if contained in word
    for c in cleaners:
        #if word contains substring return True
        if c in word:
            return True

    #else return False
    return False

In [11]:
inputp = 'I do not feel happy'
inputpp = processText(inputp)
print(inputpp)
string_text = ""
for i in range(len(inputpp)):
    string_text = string_text + inputpp[i] + " "
print(string_text)
input_text = ['I feel very sad right now']
input_text_vect = cv.transform(input_text)
pos_prob = svm.predict(input_text_vect)
print(pos_prob)

['do', 'feel']
do feel 
[0]


In [12]:
if not os.path.exists("pickled_models"):
    os.makedirs("pickled_models")
    
save_svm = open("pickled_models/svm_model.pickle", "wb")
pickle.dump(svm, save_svm)
save_svm.close()