In [60]:
import os
import re
import random
import pandas as pd
import sklearn
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import SelectPercentile, chi2,mutual_info_classif,f_classif
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [47]:
 # Remove all non-alphanumeric characters
    X_train = [re.sub(r'\W+', ' ', text) for text in X_train]
    #X_train = [re.sub('@[^\s]+','',text) for text in X_train]
    X_train = [re.sub(r'http\S+', '', text) for text in X_train]
    
    X_val = [re.sub(r'\W+', ' ', text) for text in X_val]
    X_val = [re.sub('@[^\s]+','',text) for text in X_val]
    X_val = [re.sub(r'http\S+', '', text) for text in X_val]
    
    X_test = [re.sub(r'\W+', ' ', text) for text in X_test]
    X_test = [re.sub('@[^\s]+','',text) for text in X_test]
    X_test = [re.sub(r'http\S+', '', text) for text in X_test]



In [51]:
string = "yoooooouuuuu are suuuch a piiiig"
re.sub(r'(\w)\1+',r'\1\1', string)

'yoouu are suuch a piig'

In [57]:
import pandas as pd

def preprocess_twitter():

    # loads the training set
    data = pd.read_csv("./twitter/train.csv", encoding = "latin-1", header = None)
    train = data.sample(n=80000)
    val = data.sample(n=16000)

    X_train = train.iloc[:,-1].values
    y_train = train.iloc[:,0].values
    y_train[y_train == 4] = 1

    X_val = val.iloc[:,-1].values
    y_val = val.iloc[:,0].values
    y_val[y_val == 4] = 1

    # Loads the test set
    data = pd.read_csv("./twitter/test.csv", encoding = "latin-1", header = None)
    X_test = data.iloc[:,5].values
    y_test = data.iloc[:,0].values
    y_test[y_test == 4] = 1
            
    # Replace usernames with @USERNAME
    username = re.compile(r'@([A-Za-z0-9_]+)')
    X_train = [username.sub("@USERNAME", text) for text in X_train]
    X_val = [username.sub("@USERNAME", text) for text in X_val]
    X_test = [username.sub("@USERNAME", text) for text in X_test]
    # Replace urls with URL
    url = re.compile(r'http\S+')
    X_train = [url.sub("URL.", text) for text in X_train]
    X_val = [url.sub("URL.", text) for text in X_val]
    X_test = [url.sub("URL.", text) for text in X_test]
    
    # Replace repeated characters
    X_train = [re.sub(r'(\w)\1+',r'\1\1', text) for text in X_train]
    X_val = [re.sub(r'(\w)\1+', r'\1\1', text) for text in X_val]
    X_test = [re.sub(r'(\w)\1+', r'\1\1', text) for text in X_test]
    
    return (X_train, y_train, X_val, y_val, X_test, y_test)

In [58]:
X_train, y_train, X_val, y_val, X_test, y_test=preprocess_twitter()

In [67]:
pclf = Pipeline([
    ('vect', CountVectorizer(binary=True, stop_words=None, ngram_range=(1,2))),
    ('norm', Normalizer()),
    ('clf', MultinomialNB()),
])

pclf.fit(X_train, y_train)
y_pred = pclf.predict(X_val)
print(metrics.classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.82      0.79      8057
           1       0.80      0.76      0.78      7943

   micro avg       0.79      0.79      0.79     16000
   macro avg       0.79      0.79      0.79     16000
weighted avg       0.79      0.79      0.79     16000



In [66]:
pclf = Pipeline([
    ('vect', CountVectorizer(binary=True, stop_words=None, ngram_range=(1,1))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(penalty = 'l2', tol= 1e-2, random_state=0, C=1)),
])

pclf.fit(X_train, y_train)
y_pred = pclf.predict(X_val)
print(metrics.classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80      8057
           1       0.80      0.81      0.80      7943

   micro avg       0.80      0.80      0.80     16000
   macro avg       0.80      0.80      0.80     16000
weighted avg       0.80      0.80      0.80     16000



In [40]:
# Pipeline and Grid search(feature extraction - normalization - feature selection - classification)
svm = Pipeline([("tfidf", TfidfVectorizer(ngram_range=(1,2),sublinear_tf=True)),
                    ("normalizer", Normalizer()),
                      ("selectFeature",SelectPercentile(score_func=chi2)),
                    ("svc", LinearSVC(penalty = 'l2', tol= 1e-2, random_state=0)),
                   ])

parameters = {
    'svc__C':[1e-5,1e-3,1,1e3,1e5,1e7,1e10,1e15],
    'selectFeature__percentile':[80,90, 95]
}

In [41]:
gd_svm= GridSearchCV(estimator=svm, param_grid=parameters, cv=5, verbose=3)
gd_svm.fit(X_train, y_train)
svm_predict = gd_svm.predict(X_val)
accuracy = gd_svm.score(X_val, y_val)
best_param = gd_svm.best_params_
best_score = gd_svm.best_score_
best_estim = gd_svm.best_estimator_
result = gd_svm.cv_results_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] selectFeature__percentile=80, svc__C=1e-05 ......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  selectFeature__percentile=80, svc__C=1e-05, score=0.5034375, total=   6.0s
[CV] selectFeature__percentile=80, svc__C=1e-05 ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.1s remaining:    0.0s


[CV]  selectFeature__percentile=80, svc__C=1e-05, score=0.50325, total=   5.9s
[CV] selectFeature__percentile=80, svc__C=1e-05 ......................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.0s remaining:    0.0s


[CV]  selectFeature__percentile=80, svc__C=1e-05, score=0.503375, total=   6.0s
[CV] selectFeature__percentile=80, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=80, svc__C=1e-05, score=0.5034375, total=   6.0s
[CV] selectFeature__percentile=80, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=80, svc__C=1e-05, score=0.50325, total=   6.0s
[CV] selectFeature__percentile=80, svc__C=0.001 ......................
[CV]  selectFeature__percentile=80, svc__C=0.001, score=0.721125, total=   6.0s
[CV] selectFeature__percentile=80, svc__C=0.001 ......................
[CV]  selectFeature__percentile=80, svc__C=0.001, score=0.729, total=   6.0s
[CV] selectFeature__percentile=80, svc__C=0.001 ......................
[CV]  selectFeature__percentile=80, svc__C=0.001, score=0.7330625, total=   6.0s
[CV] selectFeature__percentile=80, svc__C=0.001 ......................
[CV]  selectFeature__percentile=80, svc__C=0.001, score=0.7193125, total=   5.9s
[CV] selectFeat



[CV]  selectFeature__percentile=80, svc__C=1000.0, score=0.7444375, total=  31.2s
[CV] selectFeature__percentile=80, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=80, svc__C=1000.0, score=0.7408125, total=  30.3s
[CV] selectFeature__percentile=80, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=80, svc__C=1000.0, score=0.7473125, total=  30.9s
[CV] selectFeature__percentile=80, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=80, svc__C=1000.0, score=0.732625, total=  30.7s
[CV] selectFeature__percentile=80, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=80, svc__C=1000.0, score=0.7414375, total=  30.5s
[CV] selectFeature__percentile=80, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=80, svc__C=100000.0, score=0.743375, total=  31.3s
[CV] selectFeature__percentile=80, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=80, svc__C=100000.0, score=0.7379375, total=  31.2s
[CV] selectFeature__percentile=80, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=80, svc__C=100000.0, score=0.744375, total=  31.1s
[CV] selectFeature__percentile=80, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=80, svc__C=100000.0, score=0.7283125, total=  30.7s
[CV] selectFeature__percentile=80, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=80, svc__C=100000.0, score=0.7391875, total=  32.2s
[CV] selectFeature__percentile=80, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=80, svc__C=10000000.0, score=0.74325, total=  31.4s
[CV] selectFeature__percentile=80, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=80, svc__C=10000000.0, score=0.7371875, total=  31.4s
[CV] selectFeature__percentile=80, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=80, svc__C=10000000.0, score=0.7444375, total=  31.2s
[CV] selectFeature__percentile=80, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=80, svc__C=10000000.0, score=0.7295625, total=  31.7s
[CV] selectFeature__percentile=80, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=80, svc__C=10000000.0, score=0.739125, total=  32.0s
[CV] selectFeature__percentile=80, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=80, svc__C=10000000000.0, score=0.74325, total=  31.8s
[CV] selectFeature__percentile=80, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=80, svc__C=10000000000.0, score=0.7371875, total=  31.0s
[CV] selectFeature__percentile=80, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=80, svc__C=10000000000.0, score=0.7445625, total=  31.0s
[CV] selectFeature__percentile=80, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=80, svc__C=10000000000.0, score=0.7295625, total=  31.2s
[CV] selectFeature__percentile=80, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=80, svc__C=10000000000.0, score=0.739125, total=  38.7s
[CV] selectFeature__percentile=80, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=80, svc__C=1000000000000000.0, score=0.74325, total=  28.9s
[CV] selectFeature__percentile=80, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=80, svc__C=1000000000000000.0, score=0.7371875, total=  27.7s
[CV] selectFeature__percentile=80, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=80, svc__C=1000000000000000.0, score=0.7445625, total=  29.3s
[CV] selectFeature__percentile=80, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=80, svc__C=1000000000000000.0, score=0.7295625, total=  31.1s
[CV] selectFeature__percentile=80, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=80, svc__C=1000000000000000.0, score=0.739125, total=  31.1s
[CV] selectFeature__percentile=90, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=90, svc__C=1e-05, score=0.5034375, total=   5.9s
[CV] selectFeature__percentile=90, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=90, svc__C=1e-05, score=0.50325, total=   6.0s
[CV] selectFeature__percentile=90, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=90, svc__C=1e-05, score=0.503375, total=   5.9s
[CV] selectFeature__percentile=90, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=90, svc__C=1e-05, score=0.5034375, total=   5.9s
[CV] selectFeature__percentile=90, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=90, svc__C=1e-05, score=0.50325, total=   5.9s
[CV] selectFeature__percentile=90, svc__C=0.001 ......................
[CV]  selectFeature__percentile=90, svc__C=0.001, score=0.7214375, total=   5.8s




[CV]  selectFeature__percentile=90, svc__C=1000.0, score=0.7543125, total=  32.9s
[CV] selectFeature__percentile=90, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=90, svc__C=1000.0, score=0.74925, total=  31.8s
[CV] selectFeature__percentile=90, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=90, svc__C=1000.0, score=0.7550625, total=  32.2s
[CV] selectFeature__percentile=90, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=90, svc__C=1000.0, score=0.744375, total=  32.7s
[CV] selectFeature__percentile=90, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=90, svc__C=1000.0, score=0.751875, total=  32.2s
[CV] selectFeature__percentile=90, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=90, svc__C=100000.0, score=0.752875, total=  32.5s
[CV] selectFeature__percentile=90, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=90, svc__C=100000.0, score=0.746125, total=  33.6s
[CV] selectFeature__percentile=90, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=90, svc__C=100000.0, score=0.7525, total=  32.5s
[CV] selectFeature__percentile=90, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=90, svc__C=100000.0, score=0.7401875, total=  32.9s
[CV] selectFeature__percentile=90, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=90, svc__C=100000.0, score=0.74975, total=  32.4s
[CV] selectFeature__percentile=90, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=90, svc__C=10000000.0, score=0.7523125, total=  32.6s
[CV] selectFeature__percentile=90, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=90, svc__C=10000000.0, score=0.7459375, total=  32.7s
[CV] selectFeature__percentile=90, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=90, svc__C=10000000.0, score=0.7510625, total=  32.1s
[CV] selectFeature__percentile=90, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=90, svc__C=10000000.0, score=0.74075, total=  34.0s
[CV] selectFeature__percentile=90, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=90, svc__C=10000000.0, score=0.7490625, total=  31.9s
[CV] selectFeature__percentile=90, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=90, svc__C=10000000000.0, score=0.7523125, total=  33.4s
[CV] selectFeature__percentile=90, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=90, svc__C=10000000000.0, score=0.7461875, total=  33.3s
[CV] selectFeature__percentile=90, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=90, svc__C=10000000000.0, score=0.752625, total=  32.6s
[CV] selectFeature__percentile=90, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=90, svc__C=10000000000.0, score=0.74075, total=  32.2s
[CV] selectFeature__percentile=90, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=90, svc__C=10000000000.0, score=0.7490625, total=  32.7s
[CV] selectFeature__percentile=90, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=90, svc__C=1000000000000000.0, score=0.7523125, total=  34.1s
[CV] selectFeature__percentile=90, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=90, svc__C=1000000000000000.0, score=0.7461875, total=  32.2s
[CV] selectFeature__percentile=90, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=90, svc__C=1000000000000000.0, score=0.752625, total=  32.4s
[CV] selectFeature__percentile=90, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=90, svc__C=1000000000000000.0, score=0.74075, total=  32.4s
[CV] selectFeature__percentile=90, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=90, svc__C=1000000000000000.0, score=0.749125, total=  32.2s
[CV] selectFeature__percentile=95, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=95, svc__C=1e-05, score=0.5034375, total=   6.0s
[CV] selectFeature__percentile=95, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=95, svc__C=1e-05, score=0.50325, total=   5.9s
[CV] selectFeature__percentile=95, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=95, svc__C=1e-05, score=0.503375, total=   5.9s
[CV] selectFeature__percentile=95, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=95, svc__C=1e-05, score=0.5034375, total=   6.0s
[CV] selectFeature__percentile=95, svc__C=1e-05 ......................
[CV]  selectFeature__percentile=95, svc__C=1e-05, score=0.50325, total=   6.2s
[CV] selectFeature__percentile=95, svc__C=0.001 ......................
[CV]  selectFeature__percentile=95, svc__C=0.001, score=0.7226875, total=   5.9s




[CV]  selectFeature__percentile=95, svc__C=1000.0, score=0.765, total=  34.0s
[CV] selectFeature__percentile=95, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=95, svc__C=1000.0, score=0.7633125, total=  33.9s
[CV] selectFeature__percentile=95, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=95, svc__C=1000.0, score=0.771, total=  34.6s
[CV] selectFeature__percentile=95, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=95, svc__C=1000.0, score=0.7568125, total=  34.6s
[CV] selectFeature__percentile=95, svc__C=1000.0 .....................




[CV]  selectFeature__percentile=95, svc__C=1000.0, score=0.7660625, total=  34.6s
[CV] selectFeature__percentile=95, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=95, svc__C=100000.0, score=0.7631875, total=  35.2s
[CV] selectFeature__percentile=95, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=95, svc__C=100000.0, score=0.7638125, total=  34.8s
[CV] selectFeature__percentile=95, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=95, svc__C=100000.0, score=0.7703125, total=  34.8s
[CV] selectFeature__percentile=95, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=95, svc__C=100000.0, score=0.754875, total=  34.5s
[CV] selectFeature__percentile=95, svc__C=100000.0 ...................




[CV]  selectFeature__percentile=95, svc__C=100000.0, score=0.7651875, total=  33.9s
[CV] selectFeature__percentile=95, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=95, svc__C=10000000.0, score=0.764375, total=  34.6s
[CV] selectFeature__percentile=95, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=95, svc__C=10000000.0, score=0.7629375, total=  34.2s
[CV] selectFeature__percentile=95, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=95, svc__C=10000000.0, score=0.7699375, total=  34.9s
[CV] selectFeature__percentile=95, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=95, svc__C=10000000.0, score=0.754625, total=  36.3s
[CV] selectFeature__percentile=95, svc__C=10000000.0 .................




[CV]  selectFeature__percentile=95, svc__C=10000000.0, score=0.7651875, total=  34.8s
[CV] selectFeature__percentile=95, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=95, svc__C=10000000000.0, score=0.763375, total=  33.8s
[CV] selectFeature__percentile=95, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=95, svc__C=10000000000.0, score=0.7629375, total=  33.3s
[CV] selectFeature__percentile=95, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=95, svc__C=10000000000.0, score=0.7699375, total=  33.7s
[CV] selectFeature__percentile=95, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=95, svc__C=10000000000.0, score=0.754625, total=  36.5s
[CV] selectFeature__percentile=95, svc__C=10000000000.0 ..............




[CV]  selectFeature__percentile=95, svc__C=10000000000.0, score=0.7656875, total=  40.5s
[CV] selectFeature__percentile=95, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=95, svc__C=1000000000000000.0, score=0.763375, total=  38.1s
[CV] selectFeature__percentile=95, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=95, svc__C=1000000000000000.0, score=0.7629375, total=  33.0s
[CV] selectFeature__percentile=95, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=95, svc__C=1000000000000000.0, score=0.7699375, total=  34.9s
[CV] selectFeature__percentile=95, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=95, svc__C=1000000000000000.0, score=0.754625, total=35.5min
[CV] selectFeature__percentile=95, svc__C=1000000000000000.0 .........




[CV]  selectFeature__percentile=95, svc__C=1000000000000000.0, score=0.7656875, total=  31.3s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 86.7min finished


In [46]:
print(best_score, best_param,best_estim)


0.7912625 {'selectFeature__percentile': 80, 'svc__C': 1} Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,..._hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.01, verbose=0))])


In [45]:
params = [para['svc__C'] for para in result['params']]
test_score = result['mean_test_score']
score_time = result['mean_score_time']
train_score = result['mean_train_score']

print("C-value: ", params[0])
print("Percentile:", params[1])
print("Test score: ",test_score)
print("Runtime: ",score_time)
print("Train score: ",train_score)

C-value:  1e-05
Percentile: 0.001
Test score:  [0.50335   0.7257    0.7912625 0.741325  0.7386375 0.7387125 0.7387375
 0.7387375 0.50335   0.72585   0.7912625 0.750975  0.7482875 0.747825
 0.7481875 0.7482    0.50335   0.7260375 0.7901375 0.7644375 0.763475
 0.7634125 0.7633125 0.7633125]
Runtime:  [0.74400916 0.73598428 0.72736764 0.72899423 0.75782781 0.72694292
 0.79923339 0.69388838 0.70941944 0.70418563 0.74543047 0.74576077
 0.76190529 0.73383369 0.74584422 0.73997598 0.73948283 0.73055391
 0.72933989 0.74592824 0.73305616 0.73157077 0.82634764 0.74657998]
Train score:  [0.50335625 0.7384375  0.98850937 0.99959375 0.99956563 0.99956563
 0.99956563 0.99956563 0.50335625 0.739075   0.99286562 0.99962812
 0.99960625 0.9996     0.9996125  0.9996125  0.50335625 0.73975937
 0.99561875 0.99969688 0.9996875  0.99969062 0.9996875  0.9996875 ]


array(['gonna start my weekend of revising for summere exams nowm wish me luck ',
       'sigh..staring at another very tight month..when will hikes come back ',
       "@MissBossi2u yeah I feel u!.. Well in glad you're doing okay ",
       ...,
       "3.0 isn't a midnight release it seems must be a 9am San fran time one. ",
       '@Extremo you really need to buy Etihad Stafium tickets a month out if you want good seats. ',
       'Is my family mad at me? '], dtype=object)