# Sentiment Analysis

In [1]:
import pandas as pd
import sklearn
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [2]:
# Capture the installed scikit-learn version; it is stored alongside the
# model in the checkpoints created further down.
scikit_version = sklearn.__version__
scikit_version

'0.24.0'

In [4]:
# Load the tab-separated dataset. The file is read without a header row, so
# the two column names are supplied explicitly.
sentiment_data = pd.read_csv(
    'datasets/sentimental_analysis_data.csv',
    sep='\t',
    header=None,
    names=['Label', 'Text'],
)

# Preview ten randomly chosen rows.
sentiment_data.sample(10)

Unnamed: 0,Label,Text
4159,0,Da Vinci Code sucked..
2502,1,"I want to be here because I love Harry Potter,..."
5266,0,"I hate Harry Potter, that daniel wotshisface n..."
1087,1,Mission Impossible 3 was AWESOME...
4878,0,Da Vinci Code sucks be...
2090,1,I was so proud of myself that I was given the ...
5098,0,The actor who plays Harry Potter sucks so does...
6529,0,"Then snuck into Brokeback Mountain, which is t..."
6139,0,"Then snuck into Brokeback Mountain, which is t..."
46,1,looks amazingly fun and possibly a bit cold......


In [5]:
# Split the frame into the model input (text column) and the target (label column).
X, Y = sentiment_data['Text'], sentiment_data['Label']

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Sanity check: feature and label splits must have matching lengths.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((5534,), (1384,), (5534,), (1384,))

In [8]:
# Cap the TF-IDF vocabulary at 15 features, then learn it from the training
# texts and vectorize them in a single step.
tfidf_vect = TfidfVectorizer(max_features=15)
X_trans = tfidf_vect.fit_transform(X_train)

In [9]:
# Show the stored TF-IDF entries of the first three training rows.
# print() is intentional here: it lists each (row, column) entry with its weight.
print(X_trans[:3])

  (0, 3)	0.5131713691583782
  (0, 13)	0.5133148269059888
  (0, 4)	0.5133148269059888
  (0, 12)	0.45789837609786865
  (1, 10)	0.5605145664390224
  (1, 2)	0.5605145664390224
  (1, 8)	0.609628445546424
  (2, 1)	0.5184222846821045
  (2, 7)	0.46822178454612806
  (2, 3)	0.41304394309868475
  (2, 13)	0.41315941008944523
  (2, 4)	0.41315941008944523


In [10]:
# Dimensions of the vectorized training matrix (rows, features).
X_trans.shape

(5534, 15)

In [11]:
# Train a linear support vector classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so `linear_svc_model` and `classifier`
# refer to the same fitted object.
classifier = LinearSVC(C=1.0, max_iter=1000, tol=1e-3)
linear_svc_model = classifier.fit(X_trans, y_train)
linear_svc_model

LinearSVC(tol=0.001)

In [13]:
# Vectorize the held-out texts with the vocabulary and IDF weights already
# learned from X_train. Use transform(), not fit_transform(): refitting on the
# test set would rebuild the vocabulary from test data, so the columns would no
# longer be guaranteed to match the features the classifier was trained on,
# and test-set information would leak into the preprocessing step.
X_test_trans = tfidf_vect.transform(X_test)
X_test_trans.shape

(1384, 15)

In [14]:
# Predict a label for every vectorized test sample.
y_pred = linear_svc_model.predict(X_test_trans)

In [15]:
# Inspect the raw prediction array.
y_pred

array([1, 1, 1, ..., 0, 1, 1], dtype=int64)

In [16]:
# Put true and predicted labels side by side and spot-check five random rows.
pred_results = pd.DataFrame({
    'y_test': y_test,
    'y_pred': y_pred,
})
pred_results.sample(5)

Unnamed: 0,y_test,y_pred
1941,1,1
5913,0,0
5155,0,0
371,1,1
247,1,1


In [17]:
from sklearn.metrics import accuracy_score

In [18]:
# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8858381502890174

In [20]:
# Bundle everything needed to reuse the standalone model later: the
# vectorizer, the classifier, the scikit-learn version used, and the
# measured accuracy.
text_clf_param = {
    'preprocessing': tfidf_vect,
    'model': linear_svc_model,
    'sklearn_version': scikit_version,
    'accuracy': accuracy,
}

In [21]:
# Review the checkpoint contents before writing them to disk.
text_clf_param

{'preprocessing': TfidfVectorizer(max_features=15),
 'model': LinearSVC(tol=0.001),
 'sklearn_version': '0.24.0',
 'accuracy': 0.8858381502890174}

## Save model checkpoint

In [22]:
import joblib

In [23]:
# Destination for the vectorizer + classifier checkpoint.
# NOTE(review): assumes the `models/` directory already exists — confirm.
filename = 'models/text_clf_chkpt.joblib'

# Serialize the whole checkpoint dict to that path.
joblib.dump(text_clf_param, filename)

['models/text_clf_chkpt.joblib']

In [24]:
# Read the checkpoint back from disk to verify the round trip.
# joblib files are pickle-based, so only load checkpoints from trusted sources.
clf_chkpt = joblib.load(filename)

In [25]:
# Pull the vectorizer out of the reloaded checkpoint.
reloaded_vect = clf_chkpt['preprocessing']
reloaded_vect

TfidfVectorizer(max_features=15)

In [26]:
# Pull the classifier out of the reloaded checkpoint.
clf_model = clf_chkpt['model']
clf_model

LinearSVC(tol=0.001)

In [27]:
# Apply the reloaded, already-fitted vectorizer as-is. Calling fit_transform()
# here would re-learn the vocabulary from X_test and throw away the fitted
# state that was just restored from the checkpoint, so the check would no
# longer exercise what was actually saved.
X_test_trans_new = reloaded_vect.transform(X_test)

In [28]:
# Predict with the reloaded classifier (this overwrites the earlier `y_pred`).
y_pred = clf_model.predict(X_test_trans_new)

# Score the reloaded model on the same test labels.
accuracy_score(y_test, y_pred)

0.8858381502890174

In [29]:
# Accuracy recorded in the checkpoint, for comparison with the score above.
clf_chkpt['accuracy']

0.8858381502890174

## Make pipeline model

In [30]:
from sklearn.pipeline import Pipeline

In [31]:
# Build the pipeline from unfitted clones of the earlier estimators. Pipeline
# does not copy its steps, so passing `tfidf_vect` and `classifier` directly
# would refit those very objects in place and silently alter the standalone
# model (and the objects referenced by `text_clf_param`).
clf_pipeline = Pipeline(steps=[('tfidf_vect', clone(tfidf_vect)),
                               ('classifier', clone(classifier))])

# fit() vectorizes the raw training texts and trains the classifier in one call.
pipeline_model = clf_pipeline.fit(X_train, y_train)

In [33]:
# The pipeline takes raw text directly: it vectorizes and classifies in one call.
y_pred = pipeline_model.predict(X_test)

# Overwrites the earlier `accuracy` with the pipeline's score.
accuracy = accuracy_score(y_test, y_pred)

In [34]:
# Show the pipeline's test accuracy.
accuracy

0.8858381502890174

In [36]:
# Checkpoint payload for the pipeline variant: the fitted pipeline, the
# scikit-learn version used, and the measured accuracy.
pipe_clf_param = {
    'pipeline_clf': pipeline_model,
    'sklearn_version': scikit_version,
    'accuracy': accuracy,
}

In [37]:
# Reuses `filename`, now pointing at the pipeline checkpoint.
# NOTE(review): assumes the `models/` directory already exists — confirm.
filename = 'models/pipe_clf_chkpt.joblib'
joblib.dump(pipe_clf_param, filename)

['models/pipe_clf_chkpt.joblib']

In [40]:
# Read the pipeline checkpoint back from disk to verify the round trip.
# joblib files are pickle-based, so only load checkpoints from trusted sources.
pipe_clf_chkpt = joblib.load(filename)

In [41]:
# Pull the fitted pipeline out of the reloaded checkpoint.
reloaded_pipe = pipe_clf_chkpt['pipeline_clf']
reloaded_pipe

Pipeline(steps=[('tfidf_vect', TfidfVectorizer(max_features=15)),
                ('classifier', LinearSVC(tol=0.001))])

In [42]:
# Predict on raw test text with the reloaded pipeline and score the result.
y_pred = reloaded_pipe.predict(X_test)
accuracy_score(y_test, y_pred)

0.8858381502890174