In [1]:
import timeit

import pandas as pd
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
# Load the review text files as UTF-8 strings, with the samples shuffled
data = load_files(
    "data/exercise02Data/txt_sentoken",
    encoding='utf-8',
    shuffle=True,
)

In [5]:
# Inspect the type of the object returned by load_files
type(data)

sklearn.utils._bunch.Bunch

In [6]:
# Inspect the type of the container that holds the documents
type(data.data)

list

In [8]:
# Inspect the type of a single document (index 5 is an arbitrary sample)
type(data.data[5])

str

In [3]:
# Let's create a tool to help explore our data
def visualize_data(data_point, dataset=None, n_chars=255):
    """Print the start of one document together with its sentiment label.

    Args:
        data_point: Integer index of the document to show.
        dataset: Object exposing ``.data`` (indexable collection of strings)
            and ``.target`` (indexable collection of 0/1 labels). Defaults
            to the notebook-level ``data`` variable.
        n_chars: Number of leading characters of the document to print.
            Defaults to 255.

    Returns:
        None. The preview and the label are written to stdout.
    """
    if dataset is None:
        # Fall back to the corpus loaded earlier in the notebook
        dataset = data
    # Label 0 is the negative class and label 1 the positive class
    val_dict = {0: "negative", 1: "positive"}
    preview = dataset.data[data_point][:n_chars]
    label = val_dict[dataset.target[data_point]]
    print(f'-First {n_chars} characters at index {data_point}:\n"{preview}"')
    print(f'-Label for index {data_point}: {label}')

In [4]:
# Preview the first document in the corpus and its label
visualize_data(0)

-First 255 characters at index 0:
"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . 
it's hard seeing arnold as mr . freeze in batman and robin , especially when he says t"
-Label for index 0: negative


In [5]:
# Hold out 25% of the documents as a test set; the fixed random_state
# keeps the split the same on every run
docs_train, docs_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=1,
)

In [6]:
# Build a Pipeline using the LinearSVC classifier
# This proved optimal after having tried many different classifiers from scikit-learn
sent_clf = Pipeline([
    # TF-IDF features over unigrams and bigrams
    ('vect', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', LinearSVC(
        C=100,
        dual='auto',
        fit_intercept=False,
        # NOTE(review): per the scikit-learn docs intercept_scaling is only
        # used when fit_intercept=True, so this value is currently inert
        intercept_scaling=0.5,
        loss='hinge',
        tol=0.001,
        # Seed the solver's data shuffling so that re-running the notebook
        # reproduces the same fitted model and scores
        random_state=1,
    )),
])

In [7]:
# Fit the whole Pipeline (vectorizer + classifier) on the training split
sent_clf.fit(docs_train,y_train)

In [8]:
# Use a 5-fold cross-validated grid search on the training split to find out
# whether unigram-only or unigram+bigram features are more useful
params = {'vect__ngram_range': [(1, 1), (1, 2)]}

# n_jobs=-1: Grid Search will detect how many CPUs are installed and use them all
gs_clf = GridSearchCV(sent_clf, params, cv=5, n_jobs=-1)
gs_clf.fit(docs_train, y_train)

# Pretty print the best score and the best parameters from our GridSearch
print(f"Best Predictive Accuracy using GridSearch: {round(gs_clf.best_score_ * 100, 2)}")
for param_name in sorted(params):
    print(f"Param {param_name} : {gs_clf.best_params_[param_name]}")

Best Predictive Accuracy using GridSearch: 84.6
Param vect__ngram_range : (1, 2)


In [9]:
# Using pandas (imported in the notebook's import cell), we can easily
# visualize the full GridSearch results as a table: one row per parameter
# combination, with per-split and mean test scores
pd.DataFrame(gs_clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vect__ngram_range,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.266857,1.128886,0.543719,0.403679,"(1, 1)","{'vect__ngram_range': (1, 1)}",0.823333,0.83,0.84,0.843333,0.853333,0.838,0.010456,2
1,8.316976,2.612917,0.784345,0.38694,"(1, 2)","{'vect__ngram_range': (1, 2)}",0.836667,0.826667,0.863333,0.843333,0.86,0.846,0.013888,1


In [10]:
# Predict the outcome on the testing set and store it in a variable named y_predicted.
# Note: this uses sent_clf (the Pipeline fitted directly on the training split),
# not the gs_clf grid-search object.
y_predicted = sent_clf.predict(docs_test)

In [11]:
# Show per-class precision, recall and F1 on the held-out test set
print(
    metrics.classification_report(
        y_test,
        y_predicted,
        target_names=data.target_names,
    )
)

              precision    recall  f1-score   support

         neg       0.89      0.86      0.88       251
         pos       0.86      0.90      0.88       249

    accuracy                           0.88       500
   macro avg       0.88      0.88      0.88       500
weighted avg       0.88      0.88      0.88       500



In [12]:
# Print the confusion matrix for the test set (this cell only prints it; nothing is plotted).
# y_test is passed as the true labels and y_predicted as the predictions.
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

[[216  35]
 [ 26 223]]


In [18]:
# Let's create some new examples to test the robustness of the model.
# The sentences come in pairs: a positive phrasing followed by a negative one.
sentences = [
    # These two are baseline examples any model should get right
    "This is a good movie.",
    "This is a bad movie.",
    # These are examples in the first person
    "I loved this movie.",
    "I disliked this movie.",
    # These examples refer to the contents of the movie
    "The characters in this movie are fantastic.",
    "The characters in this movie are annoying.",
    # These examples move away from movie reviews to general sentiment
    "The world is such a great place.",
    "The world is such a bad place."
]

# Save these sentence predictions in base_predictions
base_predictions = sent_clf.predict(sentences)

In [17]:
# Create a simple tool to visually evaluate the robustness of the model
def evaluate_predictions(predictions, texts=None, label_names=None):
    """Print every sentence next to the label predicted for it.

    Args:
        predictions: Sequence of integer class indices, one per sentence.
        texts: Sentences the predictions belong to, in the same order.
            Defaults to the notebook-level ``sentences`` list.
        label_names: Class names indexed by class index. Defaults to
            ``data.target_names``.

    Returns:
        None. One line per sentence is written to stdout.

    Raises:
        ValueError: If ``predictions`` and ``texts`` differ in length, which
            would otherwise pair sentences with the wrong labels (e.g. when
            the predictions are stale after editing the sentence list).
    """
    if texts is None:
        texts = sentences
    if label_names is None:
        label_names = data.target_names
    if len(predictions) != len(texts):
        raise ValueError(
            f"got {len(predictions)} predictions for {len(texts)} sentences"
        )
    # Walk sentences and predictions in lockstep instead of keeping a counter
    for text, pred in zip(texts, predictions):
        print(f'The sentence "{text}" is {label_names[pred]}')

In [19]:
# Show the predicted label for each hand-written sentence.
# NOTE: in the recorded output below, "I loved this movie." and
# "The characters in this movie are fantastic." are labelled neg, so the
# model does not handle all of these short positive sentences correctly.
evaluate_predictions(base_predictions)

The sentence "This is a good movie." is pos
The sentence "This is a bad movie." is neg
The sentence "I loved this movie." is neg
The sentence "I disliked this movie." is neg
The sentence "The characters in this movie are fantastic." is neg
The sentence "The characters in this movie are annoying." is neg
The sentence "The world is such a great place." is pos
The sentence "The world is such a bad place." is neg
