In [1]:
# Importando as bibliotecas
import numpy as np
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer #loving = love
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
#import dask.dataframe as dd
#import dask.array as da
#import dask.delayed as dl
%matplotlib inline

In [None]:
# downloading stopwords

nltk.download('stopwords') # stopwords são preposições, 'this', 'that',...

In [2]:
train = pd.DataFrame.from_csv('train.tsv', sep='\t')
test = pd.DataFrame.from_csv('test.tsv', sep='\t')
y_train = train['Sentiment']
PhraseId = test.index.values

  """Entry point for launching an IPython kernel.
  


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156060 entries, 1 to 156060
Data columns (total 3 columns):
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [4]:
train.head(10)

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2
6,1,of escapades demonstrating the adage that what...,2
7,1,of,2
8,1,escapades demonstrating the adage that what is...,2
9,1,escapades,2
10,1,demonstrating the adage that what is good for ...,2


# Preparing the training set

In [None]:
# corpus for training set
corpus_train = [] # Initializing an empty list
for i in range(0, len(train)):
    review = re.sub('[^a-zA-Z]', ' ', train.iloc[i, 1]) # Replacing non letters with empty spaces
    review = review.lower() # Getting lowers
    review = review.split() # Splitting on spaces - creating a vector
    ps = PorterStemmer() 
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] # Iterating over vector excluding stopwords
    review = ' '.join(review) # to string
    corpus_train.append(review) # append on corpus

I am gonna save this corpus into a pickle file, because this process is computacionally expensive

In [5]:
# to load from file
f = open("corpus_train.pickle","rb")
corpus_train = pickle.load(f)
f.close()

In [6]:
print(np.array(corpus_train).shape)
print(corpus_train[:20])

(156060,)
['seri escapad demonstr adag good goos also good gander occasion amus none amount much stori', 'seri escapad demonstr adag good goos', 'seri', '', 'seri', 'escapad demonstr adag good goos', '', 'escapad demonstr adag good goos', 'escapad', 'demonstr adag good goos', 'demonstr adag', 'demonstr', 'adag', '', 'adag', 'good goos', '', 'good goos', '', 'good goos']


# This is the part where we have to vectorize our corpus

In [None]:
#corpus_train = pd.DataFrame(corpus_train)
#corpus_train = dd.from_pandas(corpus_train, npartitions=4)

In [8]:
# Choose vectorize method

# "tfidf", "cv" or "hv"
vectorize_method = "hv"

In [9]:
if (vectorize_method == "tfidf"):
    tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False,
                        sublinear_tf=True, max_features=6918)
    corpus_train = tfidf.fit_transform(corpus_train)
elif (vectorize_method == "cv"):
    cv = CountVectorizer() 
    corpus_train = cv.fit_transform(corpus_train)
elif (vectorize_method == "hv"):
    hv = HashingVectorizer(decode_error='ignore', n_features=2 ** 14) 
    corpus_train = hv.transform(corpus_train)

In [11]:
import sys
sys.getsizeof(corpus_train)

56

In [12]:
corpus_train.shape

(156060, 16384)

# Dimensionality Reduction

In [None]:
#tsvd = TruncatedSVD(n_components = 6918, random_state=42)
#corpus_train = tsvd.transform(corpus_train)

# Preparing the test set

In [None]:
# corpus for test set

corpus_test = [] # Initializing an empty list
for i in range(0, len(test)):
    review = re.sub('[^a-zA-Z]', ' ', test.iloc[i, 1]) # Replacing non letters with empty spaces
    review = review.lower() # Getting lowers
    review = review.split() # Splitting on spaces - creating a vector
    ps = PorterStemmer() 
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] # Iterating over vector excluding stopwords
    review = ' '.join(review) # to string
    corpus_test.append(review) # append on corpus

In [13]:
# to load from file
f = open("corpus_test.pickle","rb")
corpus_test = pickle.load(f)
f.close()

In [14]:
print(np.array(corpus_test).shape)
print(corpus_test[0:10])

(66292,)
['intermitt pleas mostli routin effort', 'intermitt pleas mostli routin effort', '', 'intermitt pleas mostli routin effort', 'intermitt pleas mostli routin', 'intermitt pleas', 'intermitt pleas', 'intermitt', 'pleas', '']


In [None]:
#corpus_test = pd.DataFrame(corpus_test)
#corpus_test = dd.from_pandas(corpus_test, npartitions=4)

In [15]:
# Creating the bag of words model and vectorizing

if (vectorize_method == "tfidf"):
    tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False,
                            sublinear_tf=True)
    corpus_test = tfidf.fit_transform(corpus_test)
elif (vectorize_method == "cv"):
    cv = CountVectorizer() 
    corpus_test = cv.fit_transform(corpus_test)
elif (vectorize_method == "hv"):
    hv = HashingVectorizer(decode_error='ignore', n_features=2 ** 14) 
    corpus_test = hv.transform(corpus_test)

In [None]:
#corpus_test = da.from_array(corpus_test, chunks=(1000, 1000))

In [16]:
corpus_test.shape

(66292, 16384)

# Choosing hyperparameters for the classifiers 

In [17]:
# Definning params for classifiers

params_rf = {'n_estimators': 100,
             'criterion': 'entropy',
             'n_jobs': 2,
             'random_state': 42,
             'verbose': 2,
             'max_features': 0.2,
             'min_samples_leaf': 5
}

params_etc = {'n_estimators': 100,
              'criterion': 'entropy',
              'max_depth': None,
              'min_samples_split': 2,
              'min_samples_leaf': 1,
              'n_jobs': 2,
              'random_state': 42,
              'verbose': 2
}

params_ada = {'n_estimators': 100,
              'learning_rate': 1,
              'random_state': 42,
             }

params_gtb = {'loss': 'deviance',
              'learning_rate': 1,
              'n_estimators': 100,
              'random_state': 42,
              'verbose': 2
}

params_lr = {'multi_class': 'multinomial',
             'random_state': 42,
              'verbose': 2,
              'n_jobs': 2,
             'solver': 'sag'
}


# Creating the classifiers

In [18]:
#Creating the classifiers
clf_rf = RandomForestClassifier(**params_rf)
clf_etc = ExtraTreesClassifier(**params_etc)
clf_ada = AdaBoostClassifier(**params_ada)
clf_gtb = GradientBoostingClassifier(**params_gtb)
clf_mnb = MultinomialNB()
clf_bnb = BernoulliNB()
clf_lr = LogisticRegression(**params_lr)
clf_xgb = XGBClassifier()

# Ensembling the models

In [None]:
clf_ens = VotingClassifier(estimators=[('rf', clf_rf), 
                                       ('etc', clf_etc), 
                                       ('ada', clf_ada), 
                                       ('gtb', clf_gtb),
                                       ('mnb', clf_mnb),
                                       ('bnb', clf_bnb),
                                       ('lr', clf_lr),
                                       ('xgb', clf_xgb)],
                           voting='hard',
                           n_jobs = 2)

clf_ens.fit(corpus_train, y_train)

building tree 1 of 100
building tree 2 of 100
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100


# Creating the submission file

In [None]:
#pd.DataFrame({"PhraseId": PhraseId, "Sentiment": clf_ens.predict(corpus_test)}).to_csv("./results/results_ens.csv", index=None)
PhraseId = test.index.values
pd.DataFrame({"PhraseId": PhraseId, 
              "Sentiment": clf_ens.predict(corpus_test)}).to_csv("./results/results_ens.csv", 
                                                                index=None)

# Evaluating the models

In [None]:
acc = accuracy_score(y_train, clf_ens.estimators_[3].predict(corpus_train))
acc

In [None]:
for i in range(4):
    pd.DataFrame({"PhraseId": PhraseId, 
                  "Sentiment": clf_ens.estimators_[i].predict(corpus_test)}).to_csv("./results/results_{}.csv".format(i), 
                                                                                    index=None)