# General Setting
## Data, Variables and configuration

For the entire project we want to keep the same configuration across experiments; the following variables will be used by our different algorithms.

We will use 5-fold cross validation, stratified so that each fold has balanced labels (note: the configuration cell below currently sets `k_folds = 3`).

In [None]:
from sklearn.model_selection import StratifiedKFold

from Data import get_raw_data

# Shared experiment configuration, reused by the cells below.

# Label encoding passed to the data loader and the evaluation helpers.
labels = {'pos': 1, 'neg': 0}

# Relative, Windows-style path of the raw sentiment corpus.
raw_path = "Data\\sentiment_raw.csv"

# Shuffled, stratified cross-validation splitter with a fixed seed.
k_folds = 3
skf = StratifiedKFold(random_state=1, shuffle=True, n_splits=k_folds)

# Load the corpus as an (x_raw, y_raw) pair.
x_raw, y_raw = get_raw_data(raw_path, labels)

# A. Traditional Algorithm

Use traditional text mining and machine learning to find the best sentiment classifier.

We chose seven of the most popular classification algorithms (see the `classifiers` dictionary below) and used the same baseline feature extraction and selection for all of them: the features are tf-idf with the standard configuration, keeping the best 50% of the features according to the chi^2 estimate.
Results:

The Linear SVM showed the best results.

## A.1. Find best feature extractor - Tfidf VS Count

Tfidf won with 0.894465 vs 0.852758

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

#the models we will evaluate
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from Evaluation import evaluate_classifiers

# Candidate classifiers compared on the baseline features.
# Every estimator that accepts a seed is pinned to random_state=1 (AdaBoost
# and Random Forest were previously left unseeded), so repeated runs of this
# cell rank the models the same way.
classifiers = {
    "Nearest Neighbors" : KNeighborsClassifier(10),
    "Decision Tree" : DecisionTreeClassifier(max_depth=5,min_samples_split=20, random_state=1),
    "Naive Bayes" : GaussianNB(),
    "AdaBoost" : AdaBoostClassifier(random_state=1),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=100, max_features=1, random_state=1),
    "SGD" : SGDClassifier(loss = 'log', alpha = 0.00001,penalty = 'l2', n_iter = 50, random_state=1),
    "Linear SVM" : SVC(kernel='linear', probability=True,random_state=1)
    }

# Evaluate every classifier on default tf-idf features.
basic_tfidf = TfidfVectorizer()
tfidf_pipe = Pipeline([('extractor', basic_tfidf)])
eval_tfidf = evaluate_classifiers(x_raw, y_raw,labels,classifiers,skf,tfidf_pipe, False,True,"eval tfidf vector")

# Evaluate every classifier on default raw-count features.
basic_count = CountVectorizer()
count_pipe = Pipeline([('extractor', basic_count)])
eval_count = evaluate_classifiers(x_raw, y_raw,labels,classifiers,skf,count_pipe,False, True,"eval count vector")

In [None]:
# Compare the two extractors by the best AUC any classifier reached on them.
# sort_values() replaces DataFrame.sort(columns=...), which newer pandas no
# longer provides.  The top row is read with .iloc[0] (by position): after
# sorting, ["AUC"][0] would look up index *label* 0 on an integer-indexed
# frame, which is not necessarily the highest score.
eval_tfidf = eval_tfidf.sort_values(by=['AUC'], axis=0, ascending=False)
eval_count = eval_count.sort_values(by=['AUC'], axis=0, ascending=False)

tfidf_auc = eval_tfidf["AUC"].iloc[0]
count_auc = eval_count["AUC"].iloc[0]

# Record the winning vectorizer, a short name for it and its score; the
# following cells try to beat this score.
best_extractor = {}
if  tfidf_auc > count_auc :
    best_extractor["vector"] = basic_tfidf
    best_extractor["name"] = "tfidf"
    best_extractor["score"] = tfidf_auc
else:
    best_extractor["vector"] = basic_count
    best_extractor["name"] = "count"
    best_extractor["score"] = count_auc

best_extractor["vector"]

In [None]:
# Classifier used inside every extractor / selector search pipeline below:
# a linear-kernel SVC with probability estimates enabled and a fixed seed.
best_classifier = SVC(kernel='linear', probability=True,random_state=1)

## optimize tfidf for baseline

In [None]:
# optimize extraction
from sklearn.pipeline import Pipeline
from Data import Tokenizer, Tokenizer_stemmer
from Evaluation import get_best_param_search
import numpy as np

# --- Search 1: tokenizer / n-gram range (up to bigrams) / case / stop words
np.random.seed(0)
pipe = Pipeline([
        ('extractor', best_extractor["vector"]),
        ('clf', best_classifier),
    ])

params  = {
    'extractor__tokenizer': (Tokenizer(), Tokenizer_stemmer()),
    'extractor__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
    'extractor__lowercase' : (True, False),
    'extractor__stop_words' : (None, "english"),
    }


tfidf_extract_search = get_best_param_search(x_raw,y_raw,pipe,params,"extract")

# Keep the tuned extractor only if it beats the current best score.
if (tfidf_extract_search.best_score_ > best_extractor["score"] ):
    best_extractor["vector"] = tfidf_extract_search.best_estimator_.named_steps['extractor']
    best_extractor["name"] = "tfidf_extract"
    best_extractor["score"] = tfidf_extract_search.best_score_

# --- Search 2: the same grid extended with trigrams
np.random.seed(0)
pipe = Pipeline([
        ('extractor', best_extractor["vector"]),
        ('clf', best_classifier),
    ])

params  = {
    'extractor__tokenizer': (Tokenizer(), Tokenizer_stemmer()),
    'extractor__ngram_range': ((1, 1), (1, 2),(1,3)), # unigrams, bigrams or trigrams
    'extractor__lowercase' : (True, False),
    'extractor__stop_words' : (None, "english"),
    }

# NOTE(review): this search reuses the name "extract" of search 1 - confirm
# get_best_param_search does not key any stored output on that name.
tfidf_extract_search_ngram = get_best_param_search(x_raw,y_raw,pipe,params,"extract")

# The comparison has to come after the search: it previously ran before
# tfidf_extract_search_ngram was assigned, which raises NameError on a fresh
# top-to-bottom run.
if (tfidf_extract_search_ngram.best_score_ > best_extractor["score"]):
    best_extractor["vector"] = tfidf_extract_search_ngram.best_estimator_.named_steps['extractor']
    best_extractor["name"] = "tfidf_ngram_extract"
    best_extractor["score"] = tfidf_extract_search_ngram.best_score_

In [None]:
# optimize tdif words selection
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from Data import union_param
from Evaluation import evaluate_vocabulary

np.random.seed(0)

# A fresh TfidfVectorizer is searched here; the extraction settings found by
# the earlier search are merged into the grid through union_param below.
pipe = Pipeline([
        ('extractor',TfidfVectorizer()) ,
        ('clf', best_classifier),
    ])

# Vocabulary-pruning options: document-frequency bounds and vocabulary size.
params  = {
    'extractor__max_df': (0.5, 0.7, 1.0),
    'extractor__min_df': (0.005, 0.01,0.5, 1),
    'extractor__max_features': (500, 1500,2000,None),
    }

params = union_param(params,tfidf_extract_search.best_params_)
tfidf_extract_select_search = get_best_param_search(x_raw,y_raw,pipe,params,"tfidf_extract_select")



if (tfidf_extract_select_search.best_score_ > best_extractor["score"]):
    best_extractor["vector"] = tfidf_extract_select_search.best_estimator_.named_steps['extractor']
    best_extractor["name"] = best_extractor["name"] + "_extract_idfs"
    # Record the score of *this* search; the earlier code stored
    # tfidf_extract_search.best_score_, leaving the vector and its recorded
    # score out of sync for every later comparison.
    best_extractor["score"] = tfidf_extract_select_search.best_score_



from sklearn.pipeline import Pipeline

np.random.seed(0)

pipe = Pipeline([
        ('extractor', best_extractor["vector"]),
        ('clf', best_classifier),
    ])

# tf-idf weighting options: vector normalisation and sublinear tf scaling.
params  = {
    'extractor__norm': (None, 'l2'),
    'extractor__sublinear_tf': (True,False),
    }

#params = union_param(params,tfidf_extract_select_search.best_params_)
tfidf_extract_select_transform_search = get_best_param_search(x_raw,y_raw,pipe,params,"tfidf_extract_select_transform")

if (tfidf_extract_select_transform_search.best_score_ > best_extractor["score"] ):
    best_extractor["vector"] = tfidf_extract_select_transform_search.best_estimator_.named_steps['extractor']
    best_extractor["name"] = best_extractor["name"] + "_transform"
    # Store the score of the search that produced the vector (the earlier
    # code copied tfidf_extract_search.best_score_ here by mistake).
    best_extractor["score"] = tfidf_extract_select_transform_search.best_score_


# Emoticon vocabulary: the first whitespace-separated token of every line.
emoticons = []
with open("Data\\emoticons.txt") as f:
    for line in f:
        tokens = line.split()
        if tokens:  # a blank line used to raise IndexError on [0]
            emoticons.append(tokens[0])

print(len(best_extractor["vector"].vocabulary_))
# NOTE(review): `best_classifiers` is not defined in the visible cells (only
# `best_classifier` and `classifiers` are) - confirm where it is set.
eval_vec1, tf_emotic_vec_enrich = evaluate_vocabulary(x_raw,y_raw, labels, best_classifiers,skf,
                                    False, emoticons, best_extractor["vector"],"tf_emotic_enrich",False)

# The replace run gets its own label; it was labelled "tf_emotic_enrich",
# the same as the enrich run above.
eval_vec2, tf_emotic_vec_Replace = evaluate_vocabulary(x_raw,y_raw, labels, best_classifiers,skf,
                                    False, emoticons, best_extractor["vector"],
                                    "tf_emotic_replace",True)

enrich_auc = eval_vec1["AUC"][0]
replace_auc = eval_vec2["AUC"][0]

# Both suffixes are built from the name as it was before this cell, so a
# replace vector that also beats the enrich score is not named
# "..._enrich_replace".
base_name = best_extractor["name"]

if  enrich_auc > best_extractor["score"] :
    best_extractor["vector"] = tf_emotic_vec_enrich
    best_extractor["name"] = base_name + "_enrich"
    best_extractor["score"] = enrich_auc

if  replace_auc > best_extractor["score"] :
    best_extractor["vector"] = tf_emotic_vec_Replace
    best_extractor["name"] = base_name + "_replace"
    best_extractor["score"] = replace_auc



## A.2 evaluate feature selection methods Chi^2 VS PMI

Chi^2 won with 0.891369 vs 0.874022, but it was still worse than using no feature selection.


In [None]:
# Grid-search a percentile feature selector placed between the current best
# extractor and the classifier.
pipe = Pipeline(steps=[
    ('extractor', best_extractor["vector"]),
    ('selector', SelectPercentile()),
    ('clf', best_classifier),
])

# Selector grid, merged with the parameters found by the vocabulary search.
params = union_param(
    {
        'selector__score_func': (chi2,f_classif, mutual_info_classif),
        'selector__percentile': (10, 25,50,75,100),
    },
    tfidf_extract_select_search.best_params_,
)

tfidf_extract_select_transform_search_fs = get_best_param_search(
    x_raw, y_raw, pipe, params, "tfidf_extract_select_transform_fs")

# Adopt the searched extractor and selector only when the score improves.
if best_extractor["score"] < tfidf_extract_select_transform_search_fs.best_score_:
    fs_steps = tfidf_extract_select_transform_search_fs.best_estimator_.named_steps
    best_extractor.update({
        "vector": fs_steps['extractor'],
        "selector": fs_steps['selector'],
        "name": best_extractor["name"] + "_fs",
        "score": tfidf_extract_select_transform_search_fs.best_score_,
    })

## A.3 compare topic analysis algorithms - LDA VS LSA
Use topic analysis as a feature selection step, by restricting the vocabulary to the top topic words.

LDA won with 0.892090 vs 0.874022, but it was still worse than using no topic-based selection.

In [None]:
# train LDA on the best tf pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.externals import joblib

from Evaluation import print_topics

# Number of topics requested from LDA; also passed to print_topics below.
n_topics =20

# Batch-mode LDA with a fixed random_state.
select_LDA = LatentDirichletAllocation(n_topics=n_topics,
                                          max_iter=100,
                                          learning_method= 'batch',
                                          random_state=0)       

# The second step is named 'clf' but holds the LDA model, not a classifier.
pipeline_lda = Pipeline([
        ('extractor', best_extractor["vector"]),
        ('clf', select_LDA),
    ])

# Fit the extractor and LDA on the raw data and keep the transformed output.
x_transformed = pipeline_lda.fit_transform(x_raw,y_raw )
# Persist the fitted pipeline.
# NOTE(review): `pickl_path` is not defined in the visible cells - confirm it
# is set before this cell runs.
joblib.dump(pipeline_lda, pickl_path + 'pipeline_lda.pkl')



print_topics(n_topics, pipeline_lda,False)

In [None]:
# train LDA on a count-vector pipeline (English stop words removed, min_df = 20)
from Evaluation import get_top_words

# Number of topics requested from LDA; also passed to print_topics below.
n_topics =20
# Batch-mode LDA with a fixed random_state.
select_LDA = LatentDirichletAllocation(n_topics=n_topics,
                                          max_iter=100,
                                          learning_method= 'batch',
                                          random_state=0)       

# Unlike the previous LDA cell, the extractor here is a fresh CountVectorizer
# rather than best_extractor["vector"].
pipeline_lda = Pipeline([
        ('extractor', CountVectorizer(stop_words = "english", min_df = 20)),
        ('clf', select_LDA),
    ])

x_transformed = pipeline_lda.fit_transform(x_raw,y_raw )
# NOTE(review): this writes to the same 'pipeline_lda.pkl' file name as the
# previous LDA cell, so the earlier dump is overwritten - confirm intended.
# `pickl_path` is not defined in the visible cells.
joblib.dump(pipeline_lda, pickl_path + 'pipeline_lda.pkl')



print_topics(n_topics, pipeline_lda,False)



# Topic words later used as a vocabulary for evaluate_vocabulary.
# NOTE(review): the meaning of the arguments (1000, False, 1) is defined by
# Evaluation.get_top_words, which is not visible here.
LDA_top_words = get_top_words(pipeline_lda, 1000,False,1)
LDA_top_words

In [None]:
# LSI topic analysis: truncated SVD fitted on top of the current best extractor.
n_components = 20
select_LSI = TruncatedSVD(random_state=42, n_iter=7, n_components=n_components)

# The step is named 'clf' although it holds the SVD transformer.
lsi_steps = [
    ('extractor', best_extractor["vector"]),
    ('clf', select_LSI),
]
tfidf_lsi_pipe = Pipeline(lsi_steps)

# Fit, keep the transformed output and persist the fitted pipeline.
x_transformed = tfidf_lsi_pipe.fit_transform(x_raw, y_raw)
joblib.dump(tfidf_lsi_pipe, pickl_path + 'tfidf_lsi_pipe.pkl')

print_topics(n_components, tfidf_lsi_pipe, False)

# Topic words later used as a vocabulary for evaluate_vocabulary.
LSI_top_words = get_top_words(tfidf_lsi_pipe, 1000, False, 13)
LSI_top_words

In [None]:
# Evaluate the LSI and LDA topic words as vocabularies for the tuned tf-idf
# extractor.  Each run now has its own label; both were labelled
# "tf_emotic_replace", which names neither experiment and makes the two
# runs indistinguishable by label.
# NOTE(review): `best_classifiers` is not defined in the visible cells (only
# `best_classifier` and `classifiers` are) - confirm where it is set.

lsi_vec = evaluate_vocabulary(x_raw,y_raw, labels, best_classifiers,skf,False,LSI_top_words,
                                  tfidf_extract_select_transform_search.best_estimator_.named_steps['extractor'],
                                  "tf_lsi_replace",
                                  True)

lda_vec = evaluate_vocabulary(x_raw,y_raw, labels, best_classifiers,skf,False,LDA_top_words,
                                  tfidf_extract_select_transform_search.best_estimator_.named_steps['extractor'],
                                  "tf_lda_replace",
                                  True)



# pandas is first imported much further down the notebook; import it here so
# this cell also works on a fresh top-to-bottom run.
import pandas as pd

# Plot the mean CV score of the feature-selection grid search per score function.
df = pd.DataFrame(tfidf_extract_select_transform_search_fs.cv_results_ )
df
mplt = df.plot(x = "param_selector__score_func" , y = "mean_test_score" )
mplt.set_xlabel("func")
mplt.set_ylabel("score")
# NOTE(review): `plt` is not imported in the visible cells - confirm that
# matplotlib.pyplot is made available as `plt` before this cell.
plt.show()

# 3. Advanced Algorithm
In this section we will try to improve the feature selection using a combination of LDA and Word2Vec.
General explanation:
Word2Vec feature selection: we use word2vec to find words that are sentimentally relevant to the label.
LDA with word2vec selection: we run LDA on the selected words and try to find topics that are relevant to a sentiment label.
Classification with word2vec selection: we train an LDA classifier and the optimized model and evaluate them.

## 3.1 use Word2Vec
We used a word2vec model to search for relevant sentiment words and used them as features:
1. We used 2 w2v models:
1.1 a sentiment model trained on the given dataset
1.2 the Google model trained on Wikipedia
2. "Semi-unsupervised" word search: in each step we collect the words into one vocabulary that will be used for feature selection
2.1 pos and neg lists: taken from the web
2.2 threshold for naive select: used for picking words similar to 'good' and 'bad'
2.3 best words / naive select: use the threshold to find the words most similar to 'good' and 'bad'
2.4 enrich words / advanced:
2.4.1 Tier 2: for the top most similar words, also calculate their own most similar words (a second tier)
2.4.2 Negative similarity: because the labels have a negative-positive relation, we also used the positive - negative similarity to find words similar to the opposite label!
3. "Supervised" word search: for the k most informative words we calculate the 10 "best words"


In [None]:
#w2v sentiment model
#trained on the given dataset
import gensim
# Tokenise the unlabeled training text: one whitespace-split sentence per line.
with open('Data\\train_unsup.txt') as f:
    sentences = [line.split() for line in f]

# Train the sentiment word2vec model on those sentences and store it on disk.
model = gensim.models.Word2Vec(sentences, workers=4, min_count=5, window=5, size=100)
model.save('Models\\sentimentModel')


In [None]:
# pos and neg lists

#pos and neg lists: we used positive and negative word lists from: http://ptrckprry.com/course/ssd/
import gensim
import pandas as pd
from Data import extract_calc_words_from_file

# Load the sentiment word2vec model saved by the training cell.
sentiment2vec = gensim.models.Word2Vec.load('Models\\sentimentModel')

# Process both word lists against the same pair of anchor words.
anchor_words = ('good','bad')
neg_words = extract_calc_words_from_file('Data\\negative_words.txt', sentiment2vec, anchor_words)
pos_words = extract_calc_words_from_file('Data\\positive_words.txt', sentiment2vec, anchor_words)

# Merge positives first, then negatives, so negatives win on duplicate keys.
all_words = {}
for word_scores in (pos_words, neg_words):
    all_words.update(word_scores)

# Summary statistics per list; the threshold cell reads neg_stats / pos_stats.
neg_df = pd.DataFrame(neg_words).T
neg_stats = neg_df.describe()
print('Negative words statistics')
print(neg_stats)

pos_df = pd.DataFrame(pos_words).T
pos_stats = pos_df.describe()
print('Positive words statistics')
print(pos_stats)

print ("all words collected from docs = " + str(len(all_words)))

Find similarity threshold

Finding a threshold for picking words similar to 'good' and 'bad':
for each list we calculate the distance of every word from the words 'good' and 'bad';
for each list we then calculate the mean and the std of those distances.


In [None]:
# Similarity cut-offs used by the following cells: mean plus one standard
# deviation of the 'bad_sim' column of the negative list, and of the
# 'good_sim' column of the positive list, read from the describe() tables.
neg_sim_threshold = neg_stats['bad_sim']['mean'] + neg_stats['bad_sim']['std']
pos_sim_threshold = pos_stats['good_sim']['mean'] + pos_stats['good_sim']['std']

## words 1 - find Best Words

Best words: we used the mean + std distance as a threshold to find the words most similar to 'good' and 'bad'.

In [None]:
from Evaluation import find_best_words

# Anchor words passed to find_best_words for both polarities.
words_for_sim = ('good','bad')

# Positive side: filter the 750 nearest neighbours of 'good' with the
# positive threshold, then add the hits to the global vocabulary.
pos_best_words = {}
most_sim = sentiment2vec.most_similar('good', topn=750)
good_hits = find_best_words(sentiment2vec, most_sim, 'good',
                            pos_sim_threshold, words_for_sim, all_words)
pos_best_words.update(good_hits)
all_words.update(pos_best_words)

# Negative side: same procedure around 'bad'; all_words already contains the
# positive hits at this point.
neg_best_words = {}
most_sim = sentiment2vec.most_similar('bad', topn=750)
bad_hits = find_best_words(sentiment2vec, most_sim, 'bad',
                           neg_sim_threshold, words_for_sim, all_words)
neg_best_words.update(bad_hits)
all_words.update(neg_best_words)

print ("all words after simple w2v = " + str(len(all_words)))

## words 2 - enrich word

Enrich words 1: for the top most similar words, with similarity higher than mean + 2*std, we also calculate their most similar words. Enrich words 2: because the labels have a negative-positive relation, we also used the positive - negative similarity to find words similar to the opposite label!

In [None]:
from Data import enrich_words

# Second-tier search around the best positive / negative words, using a
# stricter cut-off (1.7 x the base threshold).
pos_enrich_words = enrich_words(pos_best_words,sentiment2vec,500,'good','bad',pos_sim_threshold*1.7,('good','bad'),all_words)
all_words.update(pos_enrich_words)
neg_enrich_words = enrich_words(neg_best_words,sentiment2vec,500,'bad','good',neg_sim_threshold*1.7,('good','bad'),all_words)
# Merge the freshly enriched negative words.  The earlier code re-merged
# neg_best_words (already in all_words), so the negative enrichment never
# reached the combined vocabulary.
all_words.update(neg_enrich_words)

print ("all words after smart w2v = " + str(len(all_words)))

## words 3 - Supervised - best for k top words

In [None]:
from sklearn.feature_selection import SelectKBest

# k_words features are kept by the selector; n_best_words neighbours are
# collected for each of them.
k_words = 100
n_best_words = 50

# Count features, reduced to the k_words best by mutual information.
pipeline_select_k = Pipeline([
    ('extractor', CountVectorizer(min_df=20, stop_words="english")),
    ('selector', SelectKBest(k=k_words, score_func=mutual_info_classif)),
])
opimizer_maniulate = pipeline_select_k.fit_transform(x_raw, y_raw)

# Names of the features that the selector kept.
support = pipeline_select_k.named_steps['selector'].get_support()
feature_names = pipeline_select_k.named_steps['extractor'].get_feature_names()
feature_names = np.array(feature_names)[support]

# For each kept word known to the model, record the similarity of each of its
# nearest neighbours to 'good' and to 'bad'.
supervised = {}
for select_word in feature_names:
    if select_word not in sentiment2vec:
        continue
    top_words = sentiment2vec.most_similar(select_word, topn=n_best_words)
    for w, v in top_words:
        if w not in sentiment2vec:
            continue
        supervised[w] = {}
        for anchor in ('good', 'bad'):
            supervised[w][anchor + '_sim'] = sentiment2vec.similarity(anchor, w)

all_words.update(supervised)

print ("all words after supervised w2v = " + str(len(all_words)))

## words 4 - LDA with W2V

In [None]:
import sklearn.decomposition

# Number of topics requested from LDA.
n_topics = 20

# Batch-mode LDA with a fixed random_state.
select_LDA = sklearn.decomposition.LatentDirichletAllocation(n_topics=n_topics,
                                                             max_iter=100,
                                                             learning_method= 'batch',
                                                             random_state=0)

# Count features restricted to the words collected in all_words.  Despite the
# variable name, the extractor is a CountVectorizer, not tf-idf.
# NOTE(review): scikit-learn documents min_df as ignored when an explicit
# vocabulary is given - confirm for the installed version.
tfidf_lda_pipe = Pipeline([('extractor', CountVectorizer(stop_words = "english", min_df = 20, vocabulary = all_words.keys())),
                           ('clf', select_LDA)
                               ])


x_transformed = tfidf_lda_pipe.fit_transform(x_raw,y_raw)


# NOTE(review): the other cells pass the topic count to print_topics; here the
# literal 10 is passed while n_topics is 20 - confirm this is intended.
print_topics(10, tfidf_lda_pipe,False)



# Rebinds LDA_top_words, which an earlier cell also assigns.
LDA_top_words = get_top_words(tfidf_lda_pipe, 1000,False,3)
LDA_top_words

## classification with W2V

In [None]:
len(supervised)

# Evaluate every collected vocabulary on the current best extractor.

vocabs = {
    "all" : all_words.keys(),
    "supervised" : supervised.keys(), 
    "negative" : neg_words.keys(),
     "positive" : pos_words.keys(),
    "best positive" : pos_best_words.keys(),
     "best negative" : neg_best_words.keys(),
     "enrich positive" : pos_enrich_words.keys(),
     "enrich negative" : neg_enrich_words.keys() ,
}

# NOTE(review): `replace` is assigned but never used; the call below passes a
# literal True as its last argument - confirm which value is intended.
replace = False
evals_w2v = {}
len_w2v= {}
# .items() works on both Python 2 and 3; .iteritems() exists only on Python 2.
for name, vocab in vocabs.items():
    eval_w2v, _ = evaluate_vocabulary(x_raw,y_raw, labels, classifiers,skf,False,vocab,best_extractor["vector"],"best_w2v_"+name,True)
    evals_w2v[name] =  eval_w2v["AUC"][0]
    len_w2v[name] =  len(vocab)

    
pd.DataFrame(evals_w2v,index=[0]).transpose()


# Baseline for the vocabularies: chi^2 SelectKBest keeping as many features
# as each vocabulary has words.

select_k_eval={}
# .items() works on both Python 2 and 3; .iteritems() exists only on Python 2.
for name, vocab in vocabs.items():
    print("k compare to " + name + "size: ")
    print(len(vocab))
    pipe = Pipeline([
            ('extractor', best_extractor["vector"]),
            ('selector', SelectKBest(score_func = chi2, k = len(vocab))),
        ])
    eval_select = evaluate_classifiers(x_raw, y_raw,labels,classifiers,skf,pipe,False, True,"eval k select k = " + str(len(vocab)))
    select_k_eval[len(vocab)] = eval_select["AUC"][0]

# Same comparison for multiples of the full vocabulary size.
for i  in [1.5,2,2.5,3,3.5,4,4.5,5]:
    print("k compare to all size: ")
    # k is a feature count, so it must be an integer; len(all_words)*i is a
    # float for every multiplier in the list above.
    select = int(len(all_words)*i)
    print(select)
    pipe = Pipeline([
            ('extractor', best_extractor["vector"]),
            ('selector', SelectKBest(score_func = chi2, k = select)),
        ])
    eval_select = evaluate_classifiers(x_raw, y_raw,labels,classifiers,skf,pipe,False, True,"eval k select k = " + str(select))
    select_k_eval[select] =  eval_select["AUC"][0]
    
    

pd.DataFrame(select_k_eval,index=[0]).transpose()

# Repeat the vocabulary evaluation with the plain count vectorizer.
count_evals_w2v = {}
# .items() and print(...) work on both Python 2 and 3, matching the
# print(...) calls used in the rest of the notebook; `print name` and
# .iteritems() are Python-2-only.
for name, vocab in vocabs.items():
    print(name)
    c_eval_w2v, x_ = evaluate_vocabulary(x_raw,y_raw, labels,
                        classifiers,
                        skf,False,vocab,
                        basic_count,
                        "count_w2v_"+name,True)
    count_evals_w2v[name] =  c_eval_w2v["AUC"][0]

    
pd.DataFrame(count_evals_w2v,index=[0]).transpose()