In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import heapq
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

In [None]:
def get_sample(ds, field, num=50, random_state=None):
    """Draw a class-balanced random sample from ``ds``.

    ``num`` rows are sampled (without replacement) for each of the label
    values 1, 0 and -1 found in column ``field``; the three samples are
    stacked in that order, keeping the original row index.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame to sample from.
    field : str
        Name of the label column holding the values 1, 0 and -1.
    num : int, default 50
        Number of rows to draw per label value.
    random_state : optional
        Forwarded to ``DataFrame.sample``; leave as ``None`` for a
        non-deterministic draw (the historical behaviour).

    Returns
    -------
    pandas.DataFrame
        ``3 * num`` rows with the same columns as ``ds``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a label value has fewer than
        ``num`` rows.
    """
    # DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the
    # per-label samples the same way.
    per_label = [
        ds[ds[field] == label].sample(num, random_state=random_state)
        for label in (1, 0, -1)
    ]
    return pd.concat(per_label)

In [None]:
def build_model(X, y):
    """Fit a word-count -> TF-IDF -> multinomial Naive Bayes pipeline.

    ``X`` and ``y`` are split 50/50 with a fixed ``random_state=42``; only
    the training half is used for fitting and the held-out half is
    discarded. Returns the fitted ``Pipeline``.
    """
    # Same fixed split as always; the test halves are intentionally unused.
    train_docs, _, train_labels, _ = train_test_split(
        X, y, test_size=0.5, random_state=42)

    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
    return Pipeline(steps).fit(train_docs, train_labels)

In [28]:
def benchmark(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf``, predict on the test set and time both steps.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in
        place.
    X_train, y_train
        Training features and labels passed to ``clf.fit``.
    X_test, y_test
        Test features passed to ``clf.predict`` and the labels the
        predictions are scored against.

    Returns
    -------
    tuple
        ``(clf, clf_descr, score, train_time, test_time, pred)`` where
        ``clf_descr`` is the text of ``str(clf)`` before the first ``(``,
        ``score`` is the accuracy on the test set, the two times are
        wall-clock seconds, and ``pred`` is the output of ``clf.predict``.
    """
    # The former keyword/report/confusion-matrix branches were permanently
    # disabled (``if False``) and referred to names that are not defined in
    # this notebook, so they have been removed.
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0

    score = metrics.accuracy_score(y_test, pred)

    # Keep only the leading part of the repr, e.g. "LinearSVC(...)" -> "LinearSVC".
    clf_descr = str(clf).split('(')[0]
    return clf, clf_descr, score, train_time, test_time, pred

In [24]:
def benchmark_models(X_train, X_test, y_train, y_test, vectorizer, path):
    """Vectorize the text and benchmark a fixed set of linear / Naive Bayes models.

    Parameters
    ----------
    X_train, X_test
        Raw documents; ``vectorizer`` is fitted on ``X_train`` and then
        applied to ``X_test``.
    y_train, y_test
        Labels for the two document sets.
    vectorizer
        Object exposing ``fit_transform`` and ``transform``. It is fitted in
        place, so the caller can reuse it to transform further documents.
    path
        Currently unused: it was the output file for the score plot, whose
        call is disabled. Kept so existing callers keep working.

    Returns
    -------
    list of tuple
        One ``benchmark`` result per model, in the order: LinearSVC and
        SGDClassifier with L2 penalty, the same two with L1 penalty,
        SGDClassifier with elastic-net penalty, MultinomialNB(alpha=.01),
        MultinomialNB(), BernoulliNB(alpha=.01).
    """
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    results = []

    def run(clf):
        # Every model is trained and scored on the same vectorized split.
        results.append(benchmark(clf, X_train, X_test, y_train, y_test))

    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Liblinear model. 'squared_hinge' is the current name of the loss
        # that used to be spelled 'l2'.
        run(LinearSVC(loss='squared_hinge', penalty=penalty,
                      dual=False, tol=1e-3))

        # SGD model. max_iter replaces the removed n_iter argument;
        # tol=None disables early stopping so all 50 epochs are run.
        run(SGDClassifier(alpha=.0001, max_iter=50, tol=None,
                          penalty=penalty))

    # SGD with Elastic Net penalty
    run(SGDClassifier(alpha=.0001, max_iter=50, tol=None,
                      penalty="elasticnet"))

    # Sparse Naive Bayes classifiers
    run(MultinomialNB(alpha=.01))
    run(MultinomialNB())
    run(BernoulliNB(alpha=.01))

    #plot_scores(results,path)

    return results

In [None]:
def plot_scores(results,path):
    """Plot score, training time and test time per model and save as EPS.

    ``results`` is a sequence of 6-item records laid out as
    ``(classifier, name, score, train_time, test_time, predictions)``.
    Both timing series are divided by their own maximum before plotting.
    The figure is written to ``path`` in EPS format.
    """
    positions = np.arange(len(results))

    # Turn the list of records into one list per record field.
    columns = []
    for field in range(6):
        columns.append([record[field] for record in results])
    _clfs, labels, scores, fit_times, predict_times, _preds = columns

    # Scale timings relative to the slowest model.
    fit_times = np.array(fit_times) / np.max(fit_times)
    predict_times = np.array(predict_times) / np.max(predict_times)

    plt.figure(figsize=(12, 8))
    plt.title("Score")
    plt.barh(positions, scores, .2, label="score", color='navy')
    plt.barh(positions + .3, fit_times, .2, label="training time", color='c')
    plt.barh(positions + .6, predict_times, .2, label="test time",
             color='darkorange')
    plt.yticks(())
    plt.legend(loc='best')
    plt.subplots_adjust(left=.25, top=.95, bottom=.05)

    # Model names are written to the left of the bars instead of y tick labels.
    for position, label in zip(positions, labels):
        plt.text(-.3, position, label)

    plt.savefig(path, format='eps')

In [None]:
def test_sample(model):
    print('evaluating sample data')
    docs_new = ['i agree with you', 'i disagree with you']
    predicted = model.predict(docs_new)

    for doc, stance in zip(docs_new, predicted):
        print('%r => %s' % (doc, stance))

In [None]:
def calc_stats(clf_names, preds):
    """Compute per-type and overall F-scores for the notebook-level ``model``.

    NOTE: this function does not use its arguments. It reads the globals
    ``model`` and ``ds_train`` and adds the columns ``stance_pred``,
    ``fscore_nb``, ``fscore_nb_micro`` and ``fscore_nb_macro`` to
    ``ds_train`` in place. ``ds_train`` must provide the columns ``text``,
    ``type`` and ``stance``.

    Parameters
    ----------
    clf_names, preds
        Unused; kept for backward compatibility with existing callers.

    Returns
    -------
    (fscores, fmscores) : tuple of pandas.DataFrame
        ``fscores`` has one row per ``type`` (underscores replaced by
        spaces) with the micro-averaged F-score in column ``NB``.
        ``fmscores`` has two rows ("F micro", "F macro") with the overall
        scores in column ``NB`` and the same values repeated in ``alg2``.
    """
    print('calculating f-scores')
    ds_train['stance_pred'] = model.predict(ds_train.text)

    # Group by the plain column name: grouping by the one-element list
    # ['type'] yields 1-tuple keys on pandas >= 2.0, which breaks the
    # ``type == name`` row selection below.
    for name, group in ds_train.groupby('type'):
        #TODO: use only pos and neg inside groups
        fscore = metrics.f1_score(group.stance, group.stance_pred, average='micro')
        ds_train.loc[ds_train.type == name, 'fscore_nb'] = fscore

    f1_micro = metrics.f1_score(ds_train.stance, ds_train.stance_pred, average='micro')
    f1_macro = metrics.f1_score(ds_train.stance, ds_train.stance_pred, average='macro')

    ds_train['fscore_nb_micro'] = f1_micro
    ds_train['fscore_nb_macro'] = f1_macro

    # Every row of a type carries the same score, so the mean recovers it.
    fscores = ds_train.groupby('type').agg({'fscore_nb': 'mean'})
    fscores = fscores.reset_index()
    fscores = fscores.rename(columns={'fscore_nb': 'NB'})
    fscores['type'] = fscores['type'].str.replace('_', ' ', regex=False)

    overall = ds_train[['fscore_nb_micro', 'fscore_nb_macro']].mean()
    fmscores = overall.reset_index(name='NB')
    fmscores['index'] = fmscores['index'].str.replace('fscore_nb_', 'F ', regex=False)
    # 'alg2' repeats the NB values, exactly as the previous implementation did.
    fmscores['alg2'] = overall.values
    fmscores = fmscores.rename(columns={'index': 'F score'})
    return fscores, fmscores

In [None]:
def benchmark_stats(X_test, y_test, results):
    """Compute per-type and overall F-scores for the five most accurate models.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test rows; must contain a ``type`` column used for grouping.
    y_test
        True labels for ``X_test``; assigned as a new column, so a Series is
        aligned on the index of ``X_test``.
    results : list of tuple
        Records laid out as ``(clf, name, score, train_time, test_time,
        pred)``. Only ``name`` (index 1), ``score`` (index 2) and ``pred``
        (index 5) are read; ``pred`` must have one entry per test row.

    Returns
    -------
    (fscores, fmscores) : tuple of pandas.DataFrame
        ``fscores`` has a ``type`` column ('_' replaced by ' ', '+' by
        ' and ') and one column per kept model holding the micro-averaged
        F-score within that type. ``fmscores`` has the columns ``stat`` and
        ``value`` with an "F micro <model>" and an "F macro <model>" row per
        kept model.

    Notes
    -----
    Several results can share a name (for example two ``SGDClassifier``
    runs with different penalties). Repeated names get a ``_2``, ``_3``, ...
    suffix so that each model keeps its own columns instead of overwriting
    the previous one.
    """
    print('calculating f-scores')
    ds_train = X_test.copy()
    ds_train['y_test'] = y_test
    print(y_test.shape)

    results = heapq.nlargest(5, results, key=lambda x: x[2])

    # Store each model's predictions under a unique column label.
    clf_labels = []
    for r in results:
        base_name = r[1]
        label = base_name
        suffix = 2
        while label in clf_labels:
            label = '%s_%d' % (base_name, suffix)
            suffix += 1
        clf_labels.append(label)
        ds_train[label] = r[5]

    # Group by the plain column name: grouping by the one-element list
    # ['type'] yields 1-tuple keys on pandas >= 2.0, which no longer match
    # the values stored in the column.
    types = ds_train.groupby('type')
    micro_stats = []
    macro_stats = []

    for clf_name in clf_labels:
        #TODO: use only pos and neg inside groups
        per_type = {}
        for name, group in types:
            per_type[name] = metrics.f1_score(group['y_test'], group[clf_name],
                                              average='micro')

        if per_type:
            stat_name = 'fscore_' + clf_name
            micro_stats.append(stat_name)
            # Broadcast each type's score to its rows; rows whose type is
            # missing stay NaN.
            ds_train[stat_name] = ds_train['type'].map(per_type)

        f1_micro = metrics.f1_score(ds_train['y_test'], ds_train[clf_name], average='micro')
        f1_macro = metrics.f1_score(ds_train['y_test'], ds_train[clf_name], average='macro')
        for kind, value in (('micro', f1_micro), ('macro', f1_macro)):
            stat_name = 'fscore_' + kind + clf_name
            macro_stats.append(stat_name)
            ds_train[stat_name] = value

    # Every row of a type carries the same score, so the mean recovers it.
    fscores = ds_train[micro_stats + ['type']].groupby('type').mean()
    fscores = fscores.reset_index()
    # Literal replacements: '+' must not be interpreted as a regex.
    fscores['type'] = fscores['type'].str.replace('_', ' ', regex=False)
    fscores['type'] = fscores['type'].str.replace('+', ' and ', regex=False)
    fscores.columns = [c.replace('fscore_', '') for c in fscores.columns]

    fmscores = ds_train[macro_stats].mean()
    fmscores = fmscores.reset_index()
    fmscores.columns = ['stat', 'value']
    fmscores['stat'] = fmscores['stat'].str.replace('fscore_micro', 'F micro ', regex=False)
    fmscores['stat'] = fmscores['stat'].str.replace('fscore_macro', 'F macro ', regex=False)
    return fscores, fmscores


In [27]:
print('our english dataset...')
ds = pd.read_csv('../dataset/wiki/opinions_annotated.csv')
# Work on an explicit copy of the English rows: prediction columns are added
# to `ds` below, and assigning into a filtered slice triggers pandas'
# SettingWithCopyWarning.
ds = ds[ds.lang=='en'].copy()

print('stance classification')
# Balanced sample (default 50 rows per stance value), split 50/50.
ds_train = get_sample(ds, 'stance')
X = ds_train[['text', 'type']]
y = ds_train.stance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
vectorizer = TfidfVectorizer()
results = benchmark_models(X_train.text, X_test.text, y_train, y_test, vectorizer, '../results/opinions_stance_score_en.eps')
fscores, fmscores = benchmark_stats(X_test, y_test, results)
# Choose the model with the highest accuracy (result index 2); index 0 of the
# record is the fitted classifier itself.
model = heapq.nlargest(1, results, key=lambda x: x[2])[0][0]
print('best model: ' + str(model))
# Label every English row with the best model, reusing the vectorizer that
# benchmark_models fitted on the training texts.
X = vectorizer.transform(ds.text.values)
predicted = model.predict(X)
ds['stance_pred'] = predicted
fscores.to_csv('../results/opinions_fscores_stance_en.csv', index=False)
fmscores.to_csv('../results/opinions_fmscores_stance_en.csv', index=False)

print('sentiment classification')
# Same procedure for sentiment, with 8 rows per sentiment value.
ds_train = get_sample(ds, 'sentiment', 8)
X = ds_train[['text', 'type']]
y = ds_train.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
vectorizer = TfidfVectorizer()
results = benchmark_models(X_train.text, X_test.text, y_train, y_test, vectorizer, '../results/opinions_sentiment_score_en.eps')
fscores, fmscores = benchmark_stats(X_test, y_test, results)
# Choose the model with the highest accuracy.
model = heapq.nlargest(1, results, key=lambda x: x[2])[0][0]
print('best model: ' + str(model))
X = vectorizer.transform(ds.text.values)
predicted = model.predict(X)
ds['sentiment_pred'] = predicted
fscores.to_csv('../results/opinions_fscores_sent_en.csv', index=False)
fmscores.to_csv('../results/opinions_fmscores_sent_en.csv', index=False)

# English rows together with both prediction columns.
ds.to_csv('../dataset/wiki/opinions_predicted_en.csv', index=False)


our english dataset...
stance classification
L2 penalty
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.001, verbose=0)
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
L1 penalty
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l1', random_state=None, tol=0.001, verbose=0)
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,




best model: LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)
sentiment classification
L2 penalty
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.001, verbose=0)
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
L1 penalty
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l1', random_state=None, tol=0.001, verbose=0

  'precision', 'predicted', average, warn_for)


best model: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [None]:
print('our spanish dataset...')
ds = pd.read_csv('../dataset/wiki/opinions_annotated.csv')
# Work on an explicit copy of the Spanish rows: prediction columns are added
# to `ds` below, and assigning into a filtered slice triggers pandas'
# SettingWithCopyWarning.
ds = ds[ds.lang=='es'].copy()


print('stance classification')
# Balanced sample of 50 rows per stance value, split 50/50.
ds_train = get_sample(ds, 'stance', 50)
X = ds_train[['text', 'type']]
y = ds_train.stance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
vectorizer = TfidfVectorizer()
results = benchmark_models(X_train.text, X_test.text, y_train, y_test, vectorizer, '../results/opinions_stance_score_es.eps')
fscores, fmscores = benchmark_stats(X_test, y_test, results)
# Choose the model with the highest accuracy (result index 2); index 0 of the
# record is the fitted classifier itself.
model = heapq.nlargest(1, results, key=lambda x: x[2])[0][0]
print('best model: ' + str(model))
# Label every Spanish row with the best model, reusing the vectorizer that
# benchmark_models fitted on the training texts.
X = vectorizer.transform(ds.text.values)
predicted = model.predict(X)
ds['stance_pred'] = predicted
fscores.to_csv('../results/opinions_fscores_stance_es.csv', index=False)
fmscores.to_csv('../results/opinions_fmscores_stance_es.csv', index=False)

print('sentiment classification')
# Same procedure for sentiment, with 4 rows per sentiment value.
ds_train = get_sample(ds, 'sentiment', 4)
X = ds_train[['text', 'type']]
y = ds_train.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
vectorizer = TfidfVectorizer()
results = benchmark_models(X_train.text, X_test.text, y_train, y_test, vectorizer, '../results/opinions_sentiment_score_es.eps')
fscores, fmscores = benchmark_stats(X_test, y_test, results)
# Choose the model with the highest accuracy.
model = heapq.nlargest(1, results, key=lambda x: x[2])[0][0]
print('best model: ' + str(model))
X = vectorizer.transform(ds.text.values)
predicted = model.predict(X)
ds['sentiment_pred'] = predicted
fscores.to_csv('../results/opinions_fscores_sent_es.csv', index=False)
fmscores.to_csv('../results/opinions_fmscores_sent_es.csv', index=False)

# Spanish rows together with both prediction columns.
ds.to_csv('../dataset/wiki/opinions_predicted_es.csv', index=False)

In [None]:
# Stance classification on the AAWD dataset: benchmark the candidate models
# for the F-score tables, then label the whole dataset with the Naive Bayes
# pipeline from build_model.
print('aawd dataset...')
ds = pd.read_csv('../dataset/wiki/aawd_preprocessed.csv')
#ds = ds[ds.lang=='en']
# Balanced sample: 300 rows for each stance value (1, 0, -1).
ds_train = get_sample(ds, 'stance', 300)
X = ds_train[['text', 'type']]
print('stance classification')
y = ds_train.stance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
vectorizer = TfidfVectorizer()
results = benchmark_models(X_train.text, X_test.text, y_train, y_test, vectorizer, '../results/awwd_stance_score.eps')
# fscores / fmscores describe the benchmarked classifiers in `results`.
fscores, fmscores = benchmark_stats(X_test, y_test, results)
# NOTE: unlike the other dataset cells, the model used for the final
# predictions is not the best benchmark result but the pipeline returned by
# build_model, fitted on the sampled texts.
model = build_model(X=ds_train.text,y = ds_train.stance)
#model = heapq.nlargest(1, results, key=lambda x: x[2])
test_sample(model)
# The pipeline vectorizes internally, so it is given the raw text column.
#X = vectorizer.transform(ds.text.values)
X = ds.text
predicted = model.predict(X)
ds['stance_pred'] = predicted

# Persist the labelled dataset and the benchmark F-score tables.
ds.to_csv('../dataset/wiki/aawd_predicted.csv', index=False)
fscores.to_csv('../results/aawd_fscores_stance.csv', index=False)
fmscores.to_csv('../results/aawd_fmscores_stance.csv', index=False)