In [1]:
import pandas as pd
import numpy as np
import spacy
import string
from bs4 import BeautifulSoup             
import re

from sklearn.metrics import confusion_matrix as cm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier as rforest

In [2]:
# spacy.require_gpu()
# gpu = spacy.prefer_gpu()
# print('GPU:', gpu)

In [3]:
# data avialable here : https://www.kaggle.com/competitions/word2vec-nlp-tutorial/data
labtrain = pd.read_csv('../input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip', compression='zip', sep='\t', header=0)
unlabtrain = pd.read_csv('../input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip', compression='zip', sep='\t',quoting=3)
test = pd.read_csv('../input/word2vec-nlp-tutorial/testData.tsv.zip', compression='zip', sep='\t')

In [4]:
nlp = spacy.load("en_core_web_lg", exclude=["parser", "senter", "ner"])
nlp.Defaults.stop_words -= {"never", "not","again","more"} #exclude this words

<br>
<br>

# **Bag of words** 
<br>

## 1. data Preparation

* tokenization
* lemmatization
* remove punctuation, numbers and undefined words


In [5]:
%%time
def clean_text(x): # clean from html tags and punctuation
    x= BeautifulSoup(x).get_text()
    for i in x:
        if (i in string.punctuation.replace("'","")):
            x=x.replace(i,"")
    return x
labtrain.review = labtrain.review.apply(clean_text)
test.review = test.review.apply(clean_text)

def prep_text(x): # lemmatize text and return only relevant words
    for i, doc in enumerate(nlp.pipe(x)):
        text=[token.lemma_ for token in doc if  not token.is_punct and not token.like_num and token.is_alpha and token.has_vector]
        x[i]= " ".join(text)
    return x


prep_text(labtrain.review)
prep_text(test.review)
xtrain = labtrain.review
xtest = test.review
y = labtrain.sentiment

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


CPU times: user 14min 9s, sys: 9.16 s, total: 14min 19s
Wall time: 14min 19s


In [6]:
labtrain.review[24997]

'Guy be a loser can get girl need to build up be pick on by strong more successful guy etc see it see it move on I have to say that Rob need to move past the Adam Sandler part of his life and get out of the Adam Sandler plot there be funny part in the whole movie I could even finish the last minute I be get bored the Animal be an alright film I do usually enjoy Adam Sandler film that have the same plot but this be try too hard to impress the joke be very old so trust I this be not a film that most people could really get into but some do so I be'

In [7]:
# save this data to be used in next notebooks

# labtrain.to_pickle("train_review")
# test.to_pickle("test_review")

## 2. Bag of words
​
* TFIDF matrice
* count matrice
* classification model of your choice
​

In [8]:
def bow_matrix(train,vectorizer=None,**kwargs):
    matrices=["cv","tfidf"]
    if vectorizer not in matrices:
        raise ValueError("vectorizer takes cv or tfidf values")
    if vectorizer =="tfidf":
        vec=TfidfVectorizer(**kwargs)
    else:
        vec=CountVectorizer(**kwargs)
    vec=vec.fit(train)
    return vec.transform(train), vec
    

def bow_predict(train,test,v):
# try other models
#     model=SVC(C=1, max_iter=400 )
#     model=LogisticRegression(max_iter=400)
#     model=rforest()

    model=SGDClassifier(loss='modified_huber')
    model.fit(train,y)
    pred=model.predict(v.transform(test))
    return pred, model

In [9]:
bowx, vectorizer=bow_matrix(xtrain,vectorizer="cv", stop_words="english",
                            token_pattern='\w{3,}', ngram_range=(1, 4),
                            max_df=0.9, min_df=2, max_features = 200000)

pred, model=bow_predict(bowx,xtest,vectorizer)

In [10]:
cm(y, model.predict(bowx))

array([[12500,     0],
       [    0, 12500]])

In [11]:
# submit prediction
# pd.DataFrame({"id":test.id, "sentiment":pred}).to_csv('submission.csv',index=False, header=True)

**LB score:**   

AUC = 0.85

<br>
<br>

# **Review Vectorization**
<br>

Convert each review to a vector of size 300, basically, it's the average of word embeddings vectors available on scapy

In [12]:
%%time
def get_vector(x):
    for doc in nlp.pipe(x):
        yield doc.vector
        
wordvec = pd.DataFrame(np.stack(list(get_vector(xtrain))))
test_wordvec=pd.DataFrame(np.stack(list(get_vector(xtest))))

CPU times: user 11min 44s, sys: 5.66 s, total: 11min 50s
Wall time: 11min 50s


In [13]:
%%time
model=LogisticRegression(max_iter=400)
model.fit(wordvec,y)
pred=model.predict(test_wordvec)

CPU times: user 8.07 s, sys: 2.77 s, total: 10.8 s
Wall time: 3.19 s


In [14]:
# submission
# pd.DataFrame({"id":test.id, "sentiment":pred}).to_csv('submission.csv',index=False, header=True)

**LB score:**   

AUC = 0.84