In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from distributed import Client
client = Client()

In [3]:
import re
from sklearn.pipeline import Pipeline
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
word_lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")

In [4]:
from sklearn.base import TransformerMixin,BaseEstimator

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import GridSearchCV,StratifiedShuffleSplit,RandomizedSearchCV,StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [7]:
from dask_ml.model_selection import RandomizedSearchCV

In [8]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [9]:
df = dd.read_csv("training140.csv",encoding='latin-1',header=None)

In [10]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [11]:
df = df[[5,0]]
df.columns = ['statement','analysis']

In [12]:
df.dropna()

Unnamed: 0_level_0,statement,analysis
npartitions=3,Unnamed: 1_level_1,Unnamed: 2_level_1
,object,int64
,...,...
,...,...
,...,...


In [13]:
df.compute()

Unnamed: 0,statement,analysis
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
...,...,...
534621,Just woke up. Having no school is the best fee...,4
534622,TheWDB.com - Very cool to hear old Walt interv...,4
534623,Are you ready for your MoJo Makeover? Ask me f...,4
534624,Happy 38th Birthday to my boo of alll time!!! ...,4


In [14]:
X_train = df.statement
y_train = df.analysis
X_train.astype(str)
y_train.astype(int)


Dask Series Structure:
npartitions=3
    int32
      ...
      ...
      ...
Name: analysis, dtype: int32
Dask Name: astype, 5 graph layers

In [15]:
def preprocess(sentence):
    sentence = str(sentence)
    sentence = re.sub('<[^>]*>','',sentence)
    emoticons = re.findall(':-?[()]',sentence)
    sentence = re.sub('[\W]+|[\d]+',' ',sentence.lower()) + ' '.join(emoticons)
    return sentence

def lemmatizer(sentence):
    '''
    lem_sent = []
    for word in nlp(sentence):
        lem_sent.append(word.lemma_)
    return ''.join(lem_sent)
    '''
    return ''.join([word_lemmatizer.lemmatize(word) for word in sentence])

def stop_words_remover(sentence):
    sentence = str(sentence)
    sentence = ''.join(sentence)
    stopwords = nlp.Defaults.stop_words
    new_sent = ''
    for word_token in sentence.split():
        if word_token not in stopwords:
            new_sent = new_sent + word_token + ' '
    return new_sent
    '''
    stop_words = stopwords.words('english')
    return ''.join([word for word in sentence if word not in stop_words])
    '''

class DataCleaner(BaseEstimator,TransformerMixin):
    def __init__(self,X,y=None):
        self.X = X
        self.y = y
        
    
    def fit(self,X,y=None):
        return self

    def transform(self,X,y=None):
        X_ = X.copy()
        for row in X_.iteritems():
            row = preprocess(row)
            row = stop_words_remover(row)
            row = lemmatizer(row)
        return X_

tfidf = TfidfVectorizer(lowercase=False,
                        #preprocessor=preprocess,
                        #tokenizer=lemmatizer,
                        #stop_words=nlp.Defaults.stop_words,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True
                        )

In [16]:
clf1 = LinearSVC()
clf2 = SVC()

param_grid1 =   {'classifier__C':[0.01,0.1,1.0,10.0,100.0],
                
                }

param_grid2 =   {'classifier__C':[0.01,0.1,1.0,10.0,100.0],
                'classifier__gamma':[0.01,0.1,1.0,10.0,100.0],
                'classifier__kernel':['rbf','sigmoid']
                }

In [17]:
pipe = Pipeline([
                ('data_cleaning',DataCleaner(X_train)),
                ('vectorizer',tfidf),
                ('classifier',clf1)
        ])

In [18]:
#y_train.to_dask_array(lengths=True)

In [20]:
from dask.array import ravel
y_train = y_train.values.ravel()
y_train.compute_chunk_sizes()

Unnamed: 0,Array,Chunk
Bytes,12.21 MiB,4.08 MiB
Shape,"(1600000,)","(534626,)"
Count,5 Graph Layers,3 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 12.21 MiB 4.08 MiB Shape (1600000,) (534626,) Count 5 Graph Layers 3 Chunks Type int64 numpy.ndarray",1600000  1,

Unnamed: 0,Array,Chunk
Bytes,12.21 MiB,4.08 MiB
Shape,"(1600000,)","(534626,)"
Count,5 Graph Layers,3 Chunks
Type,int64,numpy.ndarray


In [21]:
pipe.fit(X_train,y_train)

In [24]:
input = ["i am in depression","The location was inaccurate","mr.stark i dont feel so good"]
statements = pd.Series(input)
pipe.predict(statements)

array([0, 0, 0], dtype=int64)

In [25]:
y_train_predictions = pipe.predict(X_train)

In [26]:
accuracy_score(y_train,y_train_predictions)

0.89404625

In [52]:
random_search_svc = RandomizedSearchCV(estimator = pipe, 
                                        param_distributions=param_grid1,
                                        cv=3,
                                        refit= 'acc',
                                        scoring={'acc':'accuracy',
                                                 'mse':'neg_mean_squared_error'
                                                },
                                        n_jobs=-1,
                                        return_train_score=True
                                )  

In [55]:
random_search_svc.fit(X_train,y_train)

('score-dcf829f5bb5b945f8f10d617a27fb258', 3, 2) has failed... retrying


KeyError: "('tfidfvectorizer-transform-dcf829f5bb5b945f8f10d617a27fb258', 0, 2)"