## First import dependencies and data from SteevEbu.preproc

In [3]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

In [4]:
# Load the preprocessed dataset. The path is relative to the notebook's working directory.
# The CSV carries a saved index column that pandas reads back as 'Unnamed: 0'.
df = pd.read_csv('../notebooks/processed_data_steeve_updated.csv')

In [5]:
# Drop rows with any missing value, then renumber the rows 0..n-1 without
# keeping the old index as a column.
df = df.dropna().reset_index(drop=True)

In [6]:
# Column dtypes and non-null counts after the missing rows were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61442 entries, 0 to 61441
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  61442 non-null  int64 
 1   content     61442 non-null  object
 2   sentiment   61442 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


## Dataset is much too large, so at the moment I am using a 5% sample to test the model.

In [7]:
# Work on a 5% random sample while prototyping. The fixed random_state makes the
# draw repeatable, so the outputs below can be reproduced on a fresh kernel.
# NOTE: this cell rebinds `df`; re-running it on its own samples 5% of the sample.
df = df.sample(frac=0.05, random_state=12)

In [8]:
# Display the sampled frame (pandas truncates the middle rows).
df

Unnamed: 0.1,Unnamed: 0,content,sentiment
43385,43393,face mask hehe,fun
24671,24674,leading dull life friday babysitting,neutral
16322,16322,feel stressed love feeling calming spirit heav...,sadness
51093,51103,chillen started twitter,worry
26368,26374,retrorewind omg im work cant listen,neutral
...,...,...,...
34700,34707,sad gone show mcfly fan,sadness
41205,41213,thenewbradie teh part he liek feeling girl fee...,surprise
27161,27167,therotarm first thought ive never even watched...,neutral
9607,9607,feel like train smart take easy back former se...,happiness


## Encode the categories numerically

In [9]:
# Turn the sentiment strings into a pandas categorical and store each row's
# integer category code in a new 'code' column.
# NOTE(review): the categories are derived from the sampled rows only, so the
# code -> label mapping depends on which sentiments appear in the sample.
df['sentiment'] = df['sentiment'].astype('category')
df['code'] = df['sentiment'].cat.codes

In [15]:
# Integer category codes, indexed by the original row labels of the sample.
df.code

43385     4
24671     8
16322    10
51093    12
26368     8
         ..
34700    10
41205    11
27161     8
9607      5
38694     0
Name: code, Length: 3072, dtype: int8

## Create variables, and split into train and test data

In [11]:
# Features are the text column; the target is the integer sentiment code.
X = df['content']
y = df['code']

In [12]:
# Hold out 30% of the sample for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
)

In [17]:
# Training texts (a Series of strings indexed by the sampled row labels).
X_train

31438    shakamaiden damn dude dia e eu tenho curso na ...
10638    clump everybody together weird way feel liked ...
40555    yummy cocoabebe know right guess oldest doesnt...
11238    guess right feel way dont know lately havent f...
51079        grazzini youve featured ykyat httpykyatcomnnd
                               ...                        
27099    freosan dad bury six week ago became left coll...
43916    msluce oh love well mcr song great happy star ...
5380                               im feeling kind naughty
47148               dont look toe put shoe painted big toe
49786                  first day research edmonton general
Name: content, Length: 2150, dtype: object

## Custom transformer (courtesy of Christophe) that converts the vectorizer's sparse output into a dense array, which estimators such as GaussianNB require as input

In [11]:
class ToArray(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that turns a sparse feature matrix into a dense array.

    Inheriting from BaseEstimator gives the step ``get_params``/``set_params``
    and a readable repr, which scikit-learn's estimator API expects of every
    pipeline step (the mixin is listed first, as scikit-learn recommends).
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense array.

        Objects exposing ``toarray()`` (e.g. SciPy sparse matrices) are
        densified with it; anything else is passed through ``np.asarray`` so
        already-dense input does not raise ``AttributeError``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

## Pipeline everything

In [77]:
# Create Pipeline
# for the first element in the pipeline apply CountVectorizer() or TfidfVectorizer()
# for the second element apply GaussianNB() or SVC()

pipeline = make_pipeline(
    TfidfVectorizer(),
    ToArray(),
    SVC()
)

# Set parameters to search.
# make_pipeline names each step after its lowercased class name, so the keys
# must be '<lowercased class name>__<parameter>'. If the vectorizer or the
# classifier is swapped, rename the keys accordingly
# (e.g. 'countvectorizer__ngram_range', 'gaussiannb__var_smoothing').
params = {
   'tfidfvectorizer__ngram_range': ((1,1), (2,2), (3,3)),
   'svc__C': (0.1, 1, 10),
}

# Perform grid search.
# `scoring` is a single string: a list is treated as multi-metric scoring, which
# is rejected together with refit=True.
grid_search = GridSearchCV(pipeline, params, n_jobs=-1,
                           verbose=1, scoring="accuracy",
                           cv=5, refit=False)

# The grid only has 9 combinations, so draw fewer than that (n_iter=5) and fix
# the seed so the random draw is repeatable.
random_search = RandomizedSearchCV(pipeline, params, n_jobs=-1,
                           verbose=1, scoring="accuracy",
                           refit=True, cv=5,
                           n_iter=5, random_state=12)
# grid_search.fit(X_train, y_train)

In [78]:
# Fit the vectorizer and the classifier on the training split only.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('toarray', <__main__.ToArray object at 0x13deee4c0>),
                ('svc', SVC())])

In [79]:
# Score the fitted pipeline on the held-out split (for a classifier this is mean accuracy).
pipeline.score(X_test, y_test)

0.34490238611713664

In [80]:
# Keep the test-set predictions in `y_pred` (the metrics cells read that name)
# and still display them as the cell output.
y_pred = pipeline.predict(X_test)
y_pred

array([ 5,  5,  5,  5,  8, 10,  5,  8,  5,  5,  5,  5, 10,  5, 10,  5,  8,
       10, 12,  8, 11,  5, 12,  5, 10,  5,  5,  5,  5, 12, 10,  5, 10,  8,
        5,  5,  5,  8, 10,  5,  5,  5,  5,  5,  5,  8,  5, 10,  8,  5,  8,
        5, 12,  5,  5,  5,  5, 10,  8,  5,  8,  5, 10,  5,  8,  5,  5, 10,
        8,  8,  8,  8,  5,  5, 10,  5, 12,  5,  5,  8, 10,  5,  5,  5,  5,
        5, 10, 12, 10,  5,  5,  5,  7,  5, 10,  5, 10,  5, 10, 10, 10,  8,
       10, 10,  5,  7,  5, 10,  8,  8,  5, 10,  8, 12,  8,  8,  5,  5,  5,
        5,  8,  8,  8,  8,  8,  5, 12,  5,  5, 10,  5,  5,  8,  8, 12,  5,
        8,  8,  5, 10,  8, 12, 10,  5,  5,  5,  5, 10,  5, 10, 10, 12,  5,
       10,  5, 10,  7,  5, 10,  5,  5,  8,  8, 10,  7,  5,  5,  5, 10,  5,
       10, 12, 10,  5,  5, 10,  8, 10,  8,  5,  8, 12, 12,  5, 10,  5,  8,
        5,  5, 10,  5,  8,  8,  5,  5,  5,  5,  5,  5, 10,  5,  8,  5, 10,
       10,  5,  8, 10,  5,  5, 10,  5, 10,  5,  8,  5,  8,  8,  5,  8, 10,
       10, 12, 12,  8,  5

## Metrics and scores

In [None]:
# Per-class precision / recall / F1 on the held-out split.
# Some classes are never predicted, so their precision is undefined;
# zero_division=0 reports those as 0 instead of emitting a warning per class.
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Given a sklearn confusion matrix (cm), draw it as an annotated heat map.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (rows are treated as true labels, columns as predicted labels)
    target_names: class names used as tick labels, for example
                  ['high', 'medium', 'low']; pass None to keep numeric ticks
    title:        the text to display at the top of the matrix
    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues (default: 'Blues')
    normalize:    If False, plot the raw numbers
                  If True, plot the proportions of each true-label row

    Returns
    -------
    None. The figure is shown with plt.show().

    Notes
    -----
    * Cell colours, cell text and the text-colour threshold are all derived
      from the same matrix (the row-normalised one when normalize=True).
    * A row with no samples is shown as zeros rather than NaN.
    * The accuracy in the x-axis label is always computed from the raw counts.

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    import itertools

    import matplotlib.pyplot as plt
    import numpy as np

    cm = np.asarray(cm)

    # Overall accuracy comes from the raw counts; guard against an all-zero matrix.
    total = float(np.sum(cm))
    accuracy = np.trace(cm) / total if total else 0.0
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Divide each row by its total; rows that sum to zero stay all-zero.
        row_totals = cm.sum(axis=1, keepdims=True).astype('float')
        shown = np.divide(cm, row_totals,
                          out=np.zeros(cm.shape, dtype='float'),
                          where=row_totals != 0)
    else:
        shown = cm

    plt.figure(figsize=(8, 6))
    # Draw the same values that are written into the cells so colours and text agree.
    plt.imshow(shown, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so it stays readable.
    peak = shown.max() if shown.size else 0.0
    thresh = peak / 1.5 if normalize else peak / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"

    for i, j in itertools.product(range(shown.shape[0]), range(shown.shape[1])):
        plt.text(j, i, cell_format.format(shown[i, j]),
                 horizontalalignment="center",
                 color="white" if shown[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
# Now plot the confusion matrix 😉
# Tick labels are the sentiment category names; position k in `categories` is
# the label that was encoded as code k. Passing `labels` gives the matrix one
# row/column per category even when a class is missing from the test split,
# so the tick labels always line up with the matrix.
class_names = list(df.sentiment.cat.categories)
plot_confusion_matrix(
    confusion_matrix(y_test, y_pred, labels=list(range(len(class_names)))),
    class_names,
)