# I) General

## 1. Importations

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import codecs
import re
import os.path
import random
import string
import re
import unicodedata
from time import time
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import sklearn.naive_bayes as nb
from sklearnex import patch_sklearn
patch_sklearn() # Should speed up learning somehow 

from wordcloud import WordCloud

import warnings
warnings.filterwarnings("ignore")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# III) Movie dataset

In [2]:
def load_movies(path2data): 
    alltxts = []
    labs = []
    cpt = 0
    for cl in os.listdir(path2data): 
        for f in os.listdir(path2data+cl):
            txt = open(path2data+cl+'/'+f).read()
            alltxts.append(txt)
            labs.append(cpt)
        cpt+=1 
    return alltxts,labs

## 1. Feature and model selection

In [3]:
def get_all_data_vectorized(X, X_test, Y, Y_test, vectorizer, transformer=None):
    X_vector = vectorizer.fit_transform(X)
    if transformer is not None:
        transformer = transformer.fit(X_vector)
        X_final = transformer.transform(X_vector)
    else:
        X_final = X_vector
    X_test_vector = vectorizer.transform(X_test)
    return X_final, X_test_vector, Y, Y_test

def display_model_scores(model, X_test, Y_test, search=True):
    if search:
        best_parameters = model.best_estimator_.get_params()
        for param_name in sorted(best_parameters.keys()):
            print("--->%s: %r" % (param_name, best_parameters[param_name]))
    grid_predictions = model.predict(X_test)
    print(accuracy_score(Y_test, grid_predictions))

### --> Loading and splitting data

In [4]:
path =  "./data/movies1000/"
alltxts,alllabs = load_movies(path)
X = np.array(alltxts)
Y = np.array(alllabs)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, shuffle=True)
print(len(X_train))
print(len(X_test))

1340
660


### --> Stemming

In [5]:
# Remove comment for stemming
# print(len(X_train))
# stemmer = snowballstemmer.stemmer('french');
# for i in range(len(X_train)):
#     stemmer.stemWords(X_train[i].split()[j] for j in range(len(X_train[i].split())))
# print(np.array(X_train).shape)

### --> Feature selection depending the model selected

In [6]:
search = False
sw = stopwords.words('french')

if search :
    
    pipeline_svc = Pipeline(
        [
            ("vect", TfidfVectorizer(max_features=10_000)), # avoids overfit on train set (having more features than individuals is bad most of the time)
            ("clf", LinearSVC(max_iter=1000,                # also avoids overfit
                              C=100))                       # regularization so the model doesn't predict only one class  
        ]
    )
    pipeline_nb = Pipeline(
        [
            ("vect", TfidfVectorizer(max_features=10_000)),
            ("clf", MultinomialNB())      
        ]
    )
    pipeline_lr = Pipeline(
        [
            ("vect", TfidfVectorizer(max_features=10_000)),
            ("clf", LogisticRegression(max_iter=1000,
                                       C=100))        
        ]
    )
    parameters_vectorizer = {
        "vect__lowercase": (False,),
        "vect__stop_words": (None,),
        "vect__strip_accents": (None,),
        # "vect__use_idf": (True,False),
        # "vect__smooth_idf": (False,True),
        # "vect__sublinear_tf": (False,True),
        
        "vect__min_df": (1, 5, 15, 30),
        "vect__max_df": (0.01, 0.05, 0.1, 0.2, 0.5),
        "vect__ngram_range": [(1, 1), (1, 2)],
        }
    
    strat_kfold = StratifiedKFold(n_splits=3, shuffle=True)

    grid_search_parameters_svc = GridSearchCV(pipeline_svc, 
                                              parameters_vectorizer, 
                                              scoring="accuracy",
                                              n_jobs=8, 
                                              verbose=3, 
                                              cv=strat_kfold,
                                              refit=True
                                              )
    grid_search_parameters_nb = GridSearchCV(pipeline_nb, 
                                              parameters_vectorizer, 
                                              scoring="accuracy",
                                              n_jobs=8, 
                                              verbose=3, 
                                              cv=strat_kfold,
                                              refit=True
                                              )
    grid_search_parameters_lr = GridSearchCV(pipeline_lr, 
                                              parameters_vectorizer, 
                                              scoring="accuracy", 
                                              n_jobs=8, 
                                              verbose=3, 
                                              cv=strat_kfold,
                                              refit=True
                                              )
    
    print("============================================= SVC + VECTORIZER =============================================")
    t0 = time()
    grid_search_parameters_svc.fit(X_train, Y_train)
    print("done in %0.3fs" % (time() - t0))
    display_model_scores(grid_search_parameters_svc, X_train, Y_train) # check overfit X_train
    display_model_scores(grid_search_parameters_svc, X_test, Y_test)
    
    print("============================================= NB + VECTORIZER =============================================")
    t0 = time()
    grid_search_parameters_nb.fit(X_train, Y_train)
    print("done in %0.3fs" % (time() - t0))
    display_model_scores(grid_search_parameters_nb, X_train, Y_train)
    display_model_scores(grid_search_parameters_nb, X_test, Y_test) 
    
    print("============================================= LR + VECTORIZER =============================================")
    t0 = time()
    grid_search_parameters_lr.fit(X_train, Y_train)
    print("done in %0.3fs" % (time() - t0))
    display_model_scores(grid_search_parameters_lr, X_train, Y_train)
    display_model_scores(grid_search_parameters_lr, X_test, Y_test) 

### --> Building optimal vectorizer for each model

In [7]:
vectorizer_svc = TfidfVectorizer(lowercase=False,
                                 stop_words=None,
                                 strip_accents=None,
                                 # use_idf=True,
                                 # smooth_idf=True,
                                 # sublinear_tf=False,
                             
                                 max_features=20_000,
                                 min_df=5,
                                 max_df=0.5,
                                 ngram_range=(1,2),
                                 )

vectorizer_nb = TfidfVectorizer(lowercase=False,
                                stop_words=None,
                                strip_accents=None,
                                # use_idf=True,
                                # smooth_idf=True,
                                # sublinear_tf=False,
                             
                                max_features=20_000,
                                min_df=15,
                                max_df=0.1,
                                ngram_range=(1,2),
                                )

vectorizer_lr = TfidfVectorizer(lowercase=False,
                                stop_words=None,
                                strip_accents=None,
                                # use_idf=True,
                                # smooth_idf=True,
                                # sublinear_tf=False,
                             
                                max_features=20_000,
                                min_df=5,
                                max_df=0.5,
                                ngram_range=(1,2),
                                )

# Remove comment for stemming
# print(len(X_train))
# stemmer = snowballstemmer.stemmer('french');
# for i in range(len(X_train)):
#     stemmer.stemWords(X_train[i].split()[j] for j in range(len(X_train[i].split())))
# print(np.array(X_train).shape)

X_train_vector_svc, X_test_vector_svc, Y_train_svc, Y_test_svc = get_all_data_vectorized(X_train, X_test, Y_train, Y_test, vectorizer_svc)
print(X_train_vector_svc.shape)
X_train_vector_nb, X_test_vector_nb, Y_train_nb, Y_test_nb = get_all_data_vectorized(X_train, X_test, Y_train, Y_test, vectorizer_nb)
print(X_train_vector_nb.shape)
X_train_vector_lr, X_test_vector_lr, Y_train_lr, Y_test_lr = get_all_data_vectorized(X_train, X_test, Y_train, Y_test, vectorizer_lr)
print(X_train_vector_lr.shape)

(1340, 20000)
(1340, 8887)
(1340, 20000)


### --> Model selection SVC

In [8]:
search = False
if search:
    parameters = {
        'C': (1,10,50,100,200,500)    
    }
    
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True)

    optimal_svc = GridSearchCV(LinearSVC(max_iter=5000),
                               parameters, 
                               scoring="accuracy",
                               n_jobs=8, 
                               verbose=3, 
                               cv=strat_kfold,
                               refit=True
                               )

    t0 = time()
    optimal_svc.fit(X_train_vector_svc, Y_train_svc)
    print("done in %0.3fs" % (time() - t0))
    
    display_model_scores(optimal_svc, X_train_vector_svc, Y_train_svc)
    display_model_scores(optimal_svc, X_test_vector_svc, Y_test_svc)

### --> Model selection NB

In [9]:
if search:
    parameters = {
        'alpha': np.arange(0, 1, 0.05),
    }
    
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True)

    optimal_nb = GridSearchCV(MultinomialNB(),
                              parameters, 
                              scoring="accuracy",
                              n_jobs=8, 
                              verbose=3, 
                              cv=strat_kfold,
                              refit=True
                              )

    t0 = time()
    optimal_nb.fit(X_train_vector_nb, Y_train_nb)
    print("done in %0.3fs" % (time() - t0))
    
    display_model_scores(optimal_nb, X_train_vector_nb, Y_train_nb)
    display_model_scores(optimal_nb, X_test_vector_nb, Y_test_nb)

### --> Model selection LR

In [10]:
if search:
    parameters = {
        'C': (1, 10, 50, 100, 200, 500)
    }
    
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True)

    optimal_lr = GridSearchCV(LogisticRegression(max_iter=5000),
                               parameters, 
                               scoring="accuracy",
                               n_jobs=8, 
                               verbose=3, 
                               cv=strat_kfold,
                               refit=True
                               )

    t0 = time()
    optimal_lr.fit(X_train_vector_lr, Y_train_lr)
    print("done in %0.3fs" % (time() - t0))
    
    display_model_scores(optimal_lr, X_train_vector_lr, Y_train_lr)
    display_model_scores(optimal_lr, X_test_vector_lr, Y_test_lr)

### --> Test 3 optimal models without max_iter

In [11]:
final_svc = LinearSVC(max_iter=10_000,
                      C=10)

t0 = time()
final_svc.fit(X_train_vector_svc, Y_train_svc)
print("done in %0.3fs" % (time() - t0))
    
display_model_scores(final_svc, X_train_vector_svc, Y_train_svc, search=False)
display_model_scores(final_svc, X_test_vector_svc, Y_test_svc, search=False)

done in 0.334s
1.0
0.8409090909090909


In [12]:
final_nb = MultinomialNB(alpha=0.0)

t0 = time()
final_nb.fit(X_train_vector_nb, Y_train_nb)
print("done in %0.3fs" % (time() - t0))
    
display_model_scores(final_nb, X_train_vector_nb, Y_train_nb, search=False)
display_model_scores(final_nb, X_test_vector_nb, Y_test_nb, search=False)

done in 0.004s
0.9716417910447761
0.8166666666666667


In [13]:
final_lr = LogisticRegression(max_iter=10_000,
                              C=100)

t0 = time()
final_lr.fit(X_train_vector_lr, Y_train_lr)
print("done in %0.3fs" % (time() - t0))
    
display_model_scores(final_lr, X_train_vector_lr, Y_train_lr, search=False)
display_model_scores(final_lr, X_test_vector_lr, Y_test_lr, search=False) 

done in 0.589s
1.0
0.8469696969696969


### --> Testing voting classifier

In [14]:
svc_for_vote = SVC(kernel='linear', # VotingClassifier doesn't work with linearSVC() module
                   probability=True,
                   max_iter=10_000,
                   C=10)

In [15]:
voting_clf_soft = VotingClassifier(estimators=[
                                            ('lr', final_lr), 
                                            ('nb', final_nb),
                                            ('svc', svc_for_vote)
                                            ],
                                  voting='soft',
                                  n_jobs=1,
                                  verbose=3)

t0 = time()
voting_clf_soft.fit(X_train_vector_lr, Y_train_lr)
print("done in %0.3fs" % (time() - t0))
    
display_model_scores(voting_clf_soft, X_train_vector_lr, Y_train_lr, search=False)
display_model_scores(voting_clf_soft, X_test_vector_lr, Y_test_lr, search=False)

[Voting] ....................... (1 of 3) Processing lr, total=   0.4s
[Voting] ....................... (2 of 3) Processing nb, total=   0.0s
[Voting] ...................... (3 of 3) Processing svc, total=  32.8s
done in 33.252s
1.0
0.8469696969696969


In [16]:
voting_clf_hard = VotingClassifier(estimators=[
                                            ('lr', final_lr), 
                                            ('nb', final_nb),
                                            ('svc', final_svc)
                                            ],
                                  voting='hard',
                                  n_jobs=1,
                                  verbose=3)

t0 = time()
voting_clf_hard.fit(X_train_vector_lr, Y_train_lr)
print("done in %0.3fs" % (time() - t0))
    
display_model_scores(voting_clf_hard, X_train_vector_lr, Y_train_lr, search=False)
display_model_scores(voting_clf_hard, X_test_vector_lr, Y_test_lr, search=False)

[Voting] ....................... (1 of 3) Processing lr, total=   0.4s
[Voting] ....................... (2 of 3) Processing nb, total=   0.0s
[Voting] ...................... (3 of 3) Processing svc, total=   0.3s
done in 0.684s
1.0
0.8484848484848485


## 2. Final training without max_iter

In [17]:
path =  "./data/movies1000/"
alltxts,alllabs = load_movies(path)
X = np.array(alltxts)
Y = np.array(alllabs)
X_train_final, X_test_final, Y_train_final, Y_test_final = train_test_split(X, Y, test_size=0.33, shuffle=True)
print(len(X_train_final))
print(len(X_test_final))

1340
660


In [18]:
X_final_vector_svc = vectorizer_svc.fit_transform(X_train_final)
X_final_vector_nb = vectorizer_nb.fit_transform(X_train_final)
X_final_vector_lr = vectorizer_lr.fit_transform(X_train_final)

X_test_vector_svc = vectorizer_svc.transform(X_test_final)
X_test_vector_nb = vectorizer_nb.transform(X_test_final)
X_test_vector_lr = vectorizer_lr.transform(X_test_final)

t0 = time()
final_svc.fit(X_final_vector_svc,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

t0 = time()
final_nb.fit(X_final_vector_nb,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

t0 = time()
final_lr.fit(X_final_vector_lr,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

t0 = time()
voting_clf_soft.fit(X_final_vector_lr,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

t0 = time()
voting_clf_hard.fit(X_final_vector_lr,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

 done in 0.298s
 done in 0.003s
 done in 0.166s
[Voting] ....................... (1 of 3) Processing lr, total=   0.3s
[Voting] ....................... (2 of 3) Processing nb, total=   0.0s
[Voting] ...................... (3 of 3) Processing svc, total=  31.9s
 done in 32.199s
[Voting] ....................... (1 of 3) Processing lr, total=   0.2s
[Voting] ....................... (2 of 3) Processing nb, total=   0.0s
[Voting] ...................... (3 of 3) Processing svc, total=   0.3s
 done in 0.491s


In [19]:
display_model_scores(final_svc, X_test_vector_svc, Y_test_final, search=False)
display_model_scores(final_nb, X_test_vector_nb, Y_test_final, search=False)
display_model_scores(final_lr, X_test_vector_lr, Y_test_final, search=False)
display_model_scores(voting_clf_soft, X_test_vector_lr, Y_test_final, search=False)
display_model_scores(voting_clf_hard, X_test_vector_lr, Y_test_final, search=False)

0.8651515151515151
0.8348484848484848
0.8681818181818182
0.8606060606060606
0.8666666666666667


## 3. Training on whole dataset

In [20]:
path =  "./data/movies1000/"
alltxts,alllabs = load_movies(path)
X = np.array(alltxts)
Y_train_final = np.array(alllabs)
print(len(X))
print(len(Y))

2000
2000


In [21]:
X_final_vector_svc = vectorizer_svc.fit_transform(X)
X_final_vector_nb = vectorizer_nb.fit_transform(X)
X_final_vector_lr = vectorizer_lr.fit_transform(X)

t0 = time()
final_svc.fit(X_final_vector_svc,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

t0 = time()
final_nb.fit(X_final_vector_nb,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

t0 = time()
final_lr.fit(X_final_vector_lr,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

t0 = time()
voting_clf_soft.fit(X_final_vector_lr,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

t0 = time()
voting_clf_hard.fit(X_final_vector_lr,  Y_train_final)
print(" done in %0.3fs" % (time() - t0))

 done in 0.388s
 done in 0.004s
 done in 0.211s
[Voting] ....................... (1 of 3) Processing lr, total=   0.3s
[Voting] ....................... (2 of 3) Processing nb, total=   0.0s
[Voting] ...................... (3 of 3) Processing svc, total= 1.1min
 done in 67.815s
[Voting] ....................... (1 of 3) Processing lr, total=   0.2s
[Voting] ....................... (2 of 3) Processing nb, total=   0.0s
[Voting] ...................... (3 of 3) Processing svc, total=   0.4s
 done in 0.579s


## 4. Computing final test predictions

In [22]:
path = "./data/testSentiment.txt"
f = open(path,encoding="utf8")
X_test_final = np.array(f.readlines())

In [23]:
print(len(X_test_final))
print(X_test_final[2])

25000
Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public as a museum, also on board is Stevens daughter Julie (Kathleen Quinlan) & her son. The luxury jetliner takes off as planned but mid-air the plane is hi-jacked by the co-pilot Chambers (Robert Foxworth) & his two accomplice's Banker (Monte Markham) & Wilson (Michael Pataki) who knock the passengers & crew out with sleeping gas, they plan to steal the valuable cargo & land on a disused plane strip on an isolated island but while making his descent Chambers almost hits an oil rig in the Ocean & loses control of the plane sending it crashing into the sea where it sinks to the bottom right bang in the middle of the Bermuda Triangle. With air in short supply, water leaking in & having flown over 200 miles off course the problems mount

### --> SVC

In [24]:
final_svc = LinearSVC(max_iter=10_000,
                      C=10)

vectorizer_svc_final = TfidfVectorizer(lowercase=False,
                                     stop_words=None,
                                     strip_accents=None,
                                     # use_idf=True,
                                     # smooth_idf=True,
                                     # sublinear_tf=False,

                                     max_features=10_000,
                                     min_df=5,
                                     max_df=0.5,
                                     ngram_range=(1,2),
                                     )

In [25]:
path =  "./data/movies1000/"
alltxts,alllabs = load_movies(path)
X = np.array(alltxts)
Y = np.array(alllabs)

X_final_vector = vectorizer_svc_final.fit_transform(X)
final_svc.fit(X_final_vector, Y)

path = "./data/testSentiment.txt"
f = open(path,encoding="utf8")
X_test_final = np.array(f.readlines())
X_test_vector = vectorizer_svc_final.transform(X_test_final)
final_pred_svc = final_svc.predict(X_test_vector)

In [26]:
save=True
if save :
    f = open("../TME1/output/movies_pred_svc.txt", "w")
    preds = ""
    for i in range(len(final_pred_svc)):
        preds += str(final_pred_svc[i]) + "\n"
    f.write(preds)
    f.close()

### --> LR

In [27]:
final_lr = LogisticRegression(max_iter=10_000,
                              C=100)

vectorizer_lr_final = TfidfVectorizer(lowercase=False,
                                    stop_words=None,
                                    strip_accents=None,
                                    # use_idf=True,
                                    # smooth_idf=True,
                                    # sublinear_tf=False,

                                    max_features=10_000,
                                    min_df=5,
                                    max_df=0.5,
                                    ngram_range=(1,2),
                                    )

In [28]:
path =  "./data/movies1000/"
alltxts,alllabs = load_movies(path)
X = np.array(alltxts)
Y = np.array(alllabs)

X_final_vector = vectorizer_lr_final.fit_transform(X)
final_lr.fit(X_final_vector, Y)

path = "./data/testSentiment.txt"
f = open(path,encoding="utf8")
X_test_final = np.array(f.readlines())
X_test_vector = vectorizer_lr_final.transform(X_test_final)
final_pred_lr = final_lr.predict(X_test_vector)

In [29]:
save=True
if save :
    f = open("../TME1/output/movies_pred_lr.txt", "w")
    preds = ""
    for i in range(len(final_pred_lr)):
        preds += str(final_pred_lr[i]) + "\n"
    f.write(preds)
    f.close()

### --> SOFT

In [30]:
vectorizer_lr_final = TfidfVectorizer(lowercase=False,
                                    stop_words=None,
                                    strip_accents=None,
                                    # use_idf=True,
                                    # smooth_idf=True,
                                    # sublinear_tf=False,

                                    max_features=10_000,
                                    min_df=5,
                                    max_df=0.5,
                                    ngram_range=(1,2),
                                    )

svc_for_vote = SVC(kernel='linear', 
                   probability=True,
                   max_iter=10_000,
                   C=10)

final_lr = LogisticRegression(max_iter=10_000,
                              C=100)

final_nb = MultinomialNB(alpha=0.0)

voting_clf_soft = VotingClassifier(estimators=[
                                            ('lr', final_lr), 
                                            ('nb', final_nb),
                                            ('svc', svc_for_vote)
                                            ],
                                  voting='soft',
                                  n_jobs=1)

t0 = time()
voting_clf_soft.fit(X_train_vector_lr, Y_train_lr)
print("done in %0.3fs" % (time() - t0))

done in 31.590s


In [31]:
path = "./data/testSentiment.txt"
f = open(path,encoding="utf8")
X_test_final = np.array(f.readlines())

X_test_vector = vectorizer_lr.transform(X_test_final)
pred = voting_clf_soft.predict(X_test_vector)

In [32]:
save=True
if save :
    f = open("../TME1/output/movies_pred_soft.txt", "w")
    preds = ""
    for i in range(len(pred)):
        preds += str(pred[i]) + "\n"
    f.write(preds)
    f.close()

### --> HARD

In [33]:
vectorizer_lr_final = TfidfVectorizer(lowercase=False,
                                    stop_words=None,
                                    strip_accents=None,
                                    # use_idf=True,
                                    # smooth_idf=True,
                                    # sublinear_tf=False,

                                    max_features=10_000,
                                    min_df=5,
                                    max_df=0.5,
                                    ngram_range=(1,2),
                                    )

final_svc = LinearSVC(max_iter=10_000,
                      C=10)

final_lr = LogisticRegression(max_iter=10_000,
                              C=100)

final_nb = MultinomialNB(alpha=0.0)

voting_clf_hard = VotingClassifier(estimators=[
                                            ('lr', final_lr), 
                                            ('nb', final_nb),
                                            ('svc', final_svc)
                                            ],
                                  voting='hard',
                                  n_jobs=1)

t0 = time()
voting_clf_hard.fit(X_train_vector_lr, Y_train_lr)
print("done in %0.3fs" % (time() - t0))

done in 0.658s


In [34]:
path = "./data/testSentiment.txt"
f = open(path,encoding="utf8")
X_test_final = np.array(f.readlines())

X_test_vector = vectorizer_lr.transform(X_test_final)
pred = voting_clf_hard.predict(X_test_vector)

In [35]:
save=True
if save :
    f = open("../TME1/output/movies_pred_hard.txt", "w")
    preds = ""
    for i in range(len(pred)):
        preds += str(pred[i]) + "\n"
    f.write(preds)
    f.close()

## 5. Results

In [None]:
# BEST PERFORMANCE LR : 83.2% ACC ON TEST ACCURACY