In [1]:
# Importing the libraries
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer #loving = love
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
#import dask.dataframe as dd
#import dask.array as da
#import dask.delayed as dl
%matplotlib inline

In [None]:
# Download the NLTK stop-word corpus used by the text-cleaning cells below.

nltk.download('stopwords') # stopwords are prepositions, 'this', 'that', ...

In [2]:
# Load the tab-separated train/test files.
# `pd.DataFrame.from_csv` is deprecated (and removed in pandas 1.0); `read_csv`
# with `index_col=0` keeps the first column as the index, as `from_csv` did,
# so the positional column lookups further down stay valid.
train = pd.read_csv('train.tsv', sep='\t', index_col=0)
test = pd.read_csv('test.tsv', sep='\t', index_col=0)

# Target labels for the training set.
y_train = train['Sentiment']
# Index values of the test frame, reused as the id column of the submission files.
PhraseId = test.index.values

  """Entry point for launching an IPython kernel.
  


In [None]:
# (rows, columns) of the training frame
train.shape

In [None]:
# Preview the first 10 training rows
train.head(10)

# Preparing the training set

In [None]:
# Build the cleaned text corpus for the training set.
# For each phrase: replace non-letters with spaces, lowercase, split on
# whitespace, drop English stop words, stem what is left, and join the stems
# back into one space-separated string.

# The stop-word set and the stemmer are the same for every row, so they are
# built once here instead of once per word / once per row.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus_train = []
for phrase in train.iloc[:, 1]:  # column 1 holds the phrase text
    tokens = re.sub('[^a-zA-Z]', ' ', phrase).lower().split()
    review = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)
    corpus_train.append(review)

I am going to save this corpus to a pickle file, because this process is computationally expensive.

In [2]:
# Load the previously cached training corpus from disk.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code - only load files you
# created yourself.
with open("corpus_train.pickle", "rb") as f:
    corpus_train = pickle.load(f)

In [None]:
# Quick look at the cleaned training corpus: its shape, then the first 20 entries.
corpus_train_shape = np.array(corpus_train).shape
print(corpus_train_shape)
corpus_train_preview = corpus_train[:20]
print(corpus_train_preview)

# This is the part where we have to vectorize our corpus

In [None]:
#corpus_train = pd.DataFrame(corpus_train)
#corpus_train = dd.from_pandas(corpus_train, npartitions=4)

In [3]:
# Vectorization strategy checked by the vectorizing cells below.
# Supported values: "tfidf" (TfidfVectorizer), "cv" (CountVectorizer)
# or "hv" (HashingVectorizer).

vectorize_method = 'hv'

In [4]:
# Turn the cleaned training corpus (list of strings) into a dense numeric
# feature matrix using the strategy selected in `vectorize_method`.
# The fitted vectorizer object (`tfidf`, `cv` or `hv`) stays in the namespace
# so the same feature space can be applied to other text later on.
# NOTE(review): `.toarray()` densifies the matrix, which is very large for the
# full training set; kept because the following cells operate on dense data.
if vectorize_method == "tfidf":
    # min_df=1 keeps every term, exactly like the former min_df=0, but is also
    # accepted by scikit-learn versions that validate integer min_df >= 1.
    tfidf = TfidfVectorizer(norm='l2', min_df=1, use_idf=True, smooth_idf=False,
                            sublinear_tf=True, max_features=1500)
    # Fit on the training corpus (the previous code referenced an undefined `X`).
    corpus_train = tfidf.fit_transform(corpus_train).toarray()
elif vectorize_method == "cv":
    cv = CountVectorizer(max_features=5000)
    corpus_train = cv.fit_transform(corpus_train).toarray()
elif vectorize_method == "hv":
    # HashingVectorizer is stateless, so there is nothing to fit.
    hv = HashingVectorizer(decode_error='ignore', n_features=2 ** 13)
    corpus_train = hv.transform(corpus_train).toarray()
else:
    # Fail loudly instead of silently leaving the corpus un-vectorized.
    raise ValueError(
        "vectorize_method must be 'tfidf', 'cv' or 'hv', got {!r}".format(vectorize_method)
    )

In [5]:
# Sanity check: (n_documents, n_features) of the vectorized training matrix
corpus_train.shape

(156060, 8192)

In [7]:
# Wrap the dense feature matrix in a DataFrame (rebinds `corpus_train`, so this
# cell is not idempotent on its own name).
corpus_train = pd.DataFrame(corpus_train)

In [8]:
# Summarise index, column count, dtypes and memory usage of the feature frame
corpus_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Columns: 8192 entries, 0 to 8191
dtypes: float64(8192)
memory usage: 9.5 GB


# Dimensionality Reduction

In [None]:
# Project the training features onto 800 principal components.
# random_state is fixed because PCA may select a randomized SVD solver for
# large inputs; without a seed the projection can differ between runs.
# The fitted `pca` object must also be used to transform any data that is
# later passed to models trained on this reduced matrix.
pca = PCA(n_components = 800, random_state = 42)
corpus_train = pca.fit_transform(corpus_train)

# Preparing the test set

In [None]:
# Build the cleaned text corpus for the test set.
# For each phrase: replace non-letters with spaces, lowercase, split on
# whitespace, drop English stop words, stem what is left, and join the stems
# back into one space-separated string.

# The stop-word set and the stemmer are the same for every row, so they are
# built once here instead of once per word / once per row.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus_test = []
for phrase in test.iloc[:, 1]:  # column 1 holds the phrase text
    tokens = re.sub('[^a-zA-Z]', ' ', phrase).lower().split()
    review = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)
    corpus_test.append(review)

In [None]:
# Load the previously cached test corpus from disk.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code - only load files you
# created yourself.
with open("corpus_test.pickle", "rb") as f:
    corpus_test = pickle.load(f)

In [None]:
# Quick look at the cleaned test corpus: its shape, then the first 10 entries.
corpus_test_shape = np.array(corpus_test).shape
print(corpus_test_shape)
corpus_test_preview = corpus_test[:10]
print(corpus_test_preview)

In [None]:
#corpus_test = pd.DataFrame(corpus_test)
#corpus_test = dd.from_pandas(corpus_test, npartitions=4)

In [None]:
# Vectorizing the test corpus in the SAME feature space as the training set.
#
# The vectorizer created in the training-set cell (`tfidf`, `cv` or `hv`) is
# reused with `transform` only. Re-fitting a Tfidf/Count vectorizer on the test
# corpus would learn a different vocabulary, so column j of the test matrix
# would no longer mean the same thing as column j of the training matrix.

if vectorize_method == "tfidf":
    corpus_test = tfidf.transform(corpus_test).toarray()
elif vectorize_method == "cv":
    corpus_test = cv.transform(corpus_test).toarray()
elif vectorize_method == "hv":
    corpus_test = hv.transform(corpus_test).toarray()
else:
    # Fail loudly instead of silently leaving the corpus un-vectorized.
    raise ValueError(
        "vectorize_method must be 'tfidf', 'cv' or 'hv', got {!r}".format(vectorize_method)
    )

# The training matrix was reduced with `pca` in the "Dimensionality Reduction"
# cell; apply the already-fitted projection (no re-fit) so the test matrix has
# the same number of columns as the data the classifiers are trained on.
corpus_test = pca.transform(corpus_test)

In [None]:
#corpus_test = da.from_array(corpus_test, chunks=(1000, 1000))

In [None]:
# Sanity check: shape of the vectorized test matrix
corpus_test.shape

# Choosing hyperparameters for the classifiers 

In [None]:
# Defining params for classifiers

# Single seed shared by every estimator so runs are reproducible.
RANDOM_STATE = 42

# RandomForestClassifier keyword arguments.
params_rf = {'n_estimators': 50,
             'criterion': 'entropy',
             'n_jobs': 2,
             'random_state': RANDOM_STATE,
             'verbose': 2,
             'max_features': 0.2,
             'min_samples_leaf': 5
}

# ExtraTreesClassifier keyword arguments.
params_etc = {'n_estimators': 50,
              'criterion': 'entropy',
              'max_depth': None,
              'min_samples_split': 2,
              'min_samples_leaf': 1,
              'n_jobs': 2,
              'random_state': RANDOM_STATE,
              'verbose': 2
}

# AdaBoostClassifier keyword arguments.
params_ada = {'n_estimators': 50,
              'learning_rate': 1,
              'random_state': RANDOM_STATE,
             }

# GradientBoostingClassifier keyword arguments.
# 'loss' is intentionally left at the library default: the old name 'deviance'
# was deprecated and later removed in favour of the equivalent 'log_loss', and
# the default resolves to that same loss in both old and new releases.
params_gtb = {'learning_rate': 1,
              'n_estimators': 50,
              'random_state': RANDOM_STATE,
              'verbose': 2
}


# Creating the classifiers

In [None]:
# Instantiate (but do not fit) one estimator per parameter dictionary.
clf_rf = RandomForestClassifier(**params_rf)
clf_etc = ExtraTreesClassifier(**params_etc)
clf_ada = AdaBoostClassifier(**params_ada)
clf_gtb = GradientBoostingClassifier(**params_gtb)
# Disabled experiments (note: SVC is not imported in the import cell):
#clf_mnb = MultinomialNB()
#clf_bnv = BernoulliNB()
#clf_svm = SVC(kernel="rbf", random_state = 42)

In [None]:
#clf_svm.fit(corpus_train, train["Sentiment"])
#pd.DataFrame({"PhraseId": PhraseId, 
#              "Sentiment": clf_svm.predict(corpus_test)}).to_csv("./results/results_svm.csv", 
#                                                                 index=None)

In [None]:
# Fit the stand-alone random forest on the training features.
# `y_train` is the 'Sentiment' column of `train`, bound in the data-loading cell.
clf_rf.fit(corpus_train, y_train)

In [None]:
#f = open("clf_rf.pickle", "wb")
#pickle.dump(clf_rf, f)
#f.close()

# Ensembling the models

In [None]:
# Hard-voting ensemble over the four tree-based classifiers.
# VotingClassifier fits clones of the estimators it is given and stores the
# fitted copies in `clf_ens.estimators_`, in the order listed here.
clf_ens = VotingClassifier(
    estimators=[
        ('rf', clf_rf),
        ('etc', clf_etc),
        ('ada', clf_ada),
        ('gtb', clf_gtb),
    ],
    voting='hard',
    n_jobs=2,
)

# `corpus_train` is an in-memory pandas/NumPy object here (the dask experiment
# is commented out), so it is passed directly: `.compute()` is a dask method
# and raised AttributeError on this data.
clf_ens.fit(corpus_train, y_train)

# Creating the submission file

In [None]:
# Write the ensemble predictions for the test set as a two-column CSV
# (PhraseId, Sentiment).
PhraseId = test.index.values

# DataFrame.to_csv does not create missing directories.
os.makedirs("./results", exist_ok=True)

submission_ens = pd.DataFrame({"PhraseId": PhraseId,
                               "Sentiment": clf_ens.predict(corpus_test)})
# index=False (rather than None) is the documented way to omit the row index.
submission_ens.to_csv("./results/results_ens.csv", index=False)

# Evaluating the models

In [None]:
# Accuracy of the fourth fitted sub-estimator (index 3, registered as 'gtb' in
# the ensemble) measured on the training data itself - this is a fit check,
# not a generalisation estimate.
fitted_gtb = clf_ens.estimators_[3]
train_predictions = fitted_gtb.predict(corpus_train)
acc = accuracy_score(y_train, train_predictions)
acc

In [None]:
# Write one submission file per fitted sub-estimator of the ensemble:
# ./results/results_<i>.csv, where <i> is the estimator's position in
# `clf_ens.estimators_`.

# DataFrame.to_csv does not create missing directories.
os.makedirs("./results", exist_ok=True)

for i, estimator in enumerate(clf_ens.estimators_):
    # The sub-estimators are trained on label-encoded targets, so their raw
    # predictions are decoded back to the original Sentiment labels with the
    # ensemble's own label encoder.
    sentiment = clf_ens.le_.inverse_transform(estimator.predict(corpus_test))
    pd.DataFrame({"PhraseId": PhraseId,
                  "Sentiment": sentiment}).to_csv("./results/results_{}.csv".format(i),
                                                  index=False)