In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import joblib
import pickle


In [2]:
def read_data(file, encoding='utf-8'):
    """Parse a labelled text file into ``[label, text]`` pairs.

    Every non-blank line is expected to have the shape
    ``[<label tokens>] <text>``: the characters between the first character
    of the line and the first ``]`` form the label (runs of whitespace are
    collapsed to single spaces), and everything after that ``]`` is the text.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's default locale.

    Returns:
        A list of ``[label, text]`` lists, in file order. Blank lines are
        skipped.

    Raises:
        ValueError: If a non-blank line contains no ``]``.
    """
    data = []
    # The context manager closes the file; no explicit close() is needed.
    with open(file, 'r', encoding=encoding) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # A blank line would otherwise become an empty ["", ""] row.
                continue
            close = line.find("]")
            if close == -1:
                # str.find returns -1 here, which would silently mis-slice
                # the line into a bogus label and text.
                raise ValueError(
                    "line %d of %r has no closing ']' after the label"
                    % (line_no, file)
                )
            label = ' '.join(line[1:close].split())
            text = line[close + 1:].strip()
            data.append([label, text])
    return data

In [3]:
def convert_label(item, name):
    """Turn a whitespace-separated indicator string into a label string.

    Args:
        item: String of numeric tokens separated by whitespace.
        name: Sequence of names, indexed by token position.

    Returns:
        The names whose token equals 1, joined by single spaces in token
        order, with surrounding whitespace stripped. Returns an empty string
        when no token equals 1.
    """
    # Convert every token up front so a malformed token fails before any
    # name lookup happens.
    flags = [float(token) for token in item.split()]
    selected = [name[pos] for pos, flag in enumerate(flags) if flag == 1]
    return " ".join(selected).strip()

In [5]:
# Location of the labelled corpus, relative to the notebook's working directory.
file = "../public/data.txt"

# Parsed rows as returned by read_data: one [label, text] pair per entry.
data = read_data(file)

In [6]:
# Emotion names in label-vector order: position i is the name used for the
# i-th entry of a label when it is passed to convert_label.
emotions = [
    "joy",
    "fear",
    "anger",
    "sadness",
    "disgust",
    "shame",
    "guilt",
]

In [7]:
# Standalone TF-IDF vectoriser over unigrams and bigrams.
# NOTE(review): nothing visible in this notebook reads `tfidf`; the pipeline
# cell constructs its own vectoriser with the same ngram_range.
tfidf = TfidfVectorizer(ngram_range=(1, 2))

# Containers for the raw texts (X) and their label strings (y).
X, y = [], []

In [8]:
# Rebuild X and y from scratch so this cell is idempotent: when it only
# appended to lists created in another cell, re-running it duplicated every
# training sample.
X = []
y = []
for label, text in data:
    X.append(text)
    # Map the numeric indicator string onto emotion names.
    y.append(convert_label(label, emotions))

In [9]:
# Text classifier: TF-IDF features over unigrams and bigrams feeding a linear
# SVM. random_state is fixed so repeated fits are reproducible.
model_pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
        ("classifier", LinearSVC(max_iter=100000, random_state=123)),
    ]
)

In [10]:
# Fit the vectoriser and classifier on every sample in X / y.
# NOTE(review): no hold-out split or evaluation is performed in the visible
# notebook, although train_test_split is imported — confirm this is intended.
model_pipeline.fit(X, y)

0,1,2
,steps,"[('tfidf', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [11]:
# Manual smoke test: input() blocks until the user types a sentence, so this
# cell cannot run unattended (e.g. under "Restart & Run All" without a user).
text = input("Enter Text : ")
# predict() receives a one-element list; the returned array is printed as-is,
# which is why the output appears in brackets.
print("Emotion : ", model_pipeline.predict([text]))

Emotion :  ['joy']


In [13]:
# Serialise the fitted pipeline (vectoriser + classifier) to disk in binary mode.
# NOTE(review): the target directory must already exist; open() will not create it.
with open('../src/models/src/lsvc_pipeline.pkl', 'wb') as f:
    # The pickle protocol is pinned explicitly instead of using the default.
    # Pickle files must only be loaded from trusted sources.
    pickle.dump(model_pipeline, f, protocol=4)