In [604]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.dummy import DummyClassifier

load data

In [605]:
lang = 'en'
target = 'sentiment'
ds = pd.read_csv('../dataset/wiki/opinions_annotated.csv')
ds = ds[ds.lang==lang]

build train and test dataset for classification

In [606]:
stance_dist=ds.groupby(target).size()
stance_dist

sentiment
-1.0    259
 0.0    608
 1.0    134
dtype: int64

since the labels are balanced we take all of them

In [607]:
sample = ds[~ds[target].isnull()]
sample.shape

(1001, 31)

In [608]:
X = sample[['text', 'type']]
y = sample[target]

split in train and test dataset

In [609]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [610]:
print(f'train size: {X_train.shape} , test size: {X_test.shape}')

train size: (600, 2) , test size: (401, 2)


extract text features

In [611]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', strip_accents='ascii')
X_train_vec = vectorizer.fit_transform(X_train.text)
X_test_vec = vectorizer.transform(X_test.text)
print(vectorizer)
print(f'text features: {X_train_vec.shape[1]}')

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='ascii', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
text features: 1267


build and evaluate classifiers

In [612]:
acc_scores = {}
X_test = X_test.copy()
X_test['y_test'] = y_test

def benchmark_model(name):
    print(clf.fit(X_train_vec,y_train))
    y_pred = clf.predict(X_test_vec)
    X_test.loc[:,name] = y_pred
    score = metrics.accuracy_score(y_test, y_pred)
    f1_micro = metrics.f1_score(y_test, y_pred, average='micro')
    f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
    acc_scores[name]=(name, score, f1_micro, f1_macro)
    print(f'score: {score}')

In [613]:
clf = DummyClassifier(strategy='stratified')
benchmark_model('random')

DummyClassifier(constant=None, random_state=None, strategy='stratified')
score: 0.47381546134663344


In [614]:
clf = MultinomialNB()
benchmark_model('NB')

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
score: 0.5910224438902744


  'precision', 'predicted', average, warn_for)


build SGD classifier

In [615]:
clf= SGDClassifier()
benchmark_model('SGD')

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
score: 0.57356608478803


In [616]:
clf = LinearSVC(class_weight='balanced')
benchmark_model('LinearSVC')

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
score: 0.5486284289276808


In [628]:
feature_names = vectorizer.get_feature_names()
if feature_names:
    feature_names = np.asarray(feature_names)

keywords=[]
labels = ['negative', 'neutral', 'positive']
print("top 10 keywords per class:")
for i, label in enumerate(labels):
    top10 = np.argsort(clf.coef_[i])[-10:]
    #print("%s: %s" % (label, " ".join(feature_names[top10])))
    keywords.append((label, " ".join(feature_names[top10])))
    
keywords

top 10 keywords per class:


[('negative',
  'seriously differently discussion delete disruptive according fraud article arguing does'),
 ('neutral', 'objected jfg 8213 page doing 2016 link changes versions rfc'),
 ('positive', 'jr class amusing thank friends fun tie seven concerned good')]

In [630]:
ds_keywords = pd.DataFrame(keywords, columns=['label', 'top 10 words'])
ds_keywords.to_csv(f'../results/keywords_{target}_{lang}.csv', index=False)
ds_keywords.head()

Unnamed: 0,label,top 10 words
0,negative,seriously differently discussion delete disrup...
1,neutral,objected jfg 8213 page doing 2016 link changes...
2,positive,jr class amusing thank friends fun tie seven c...


In [618]:
acc_ds = pd.DataFrame(list(acc_scores.values()), columns=['model', 'accuracy', 'f1 micro', 'f1 macro'])
acc_ds.to_csv(f'../results/opinions_f1{target}_{lang}.csv', index=False)
acc_ds.head()

Unnamed: 0,model,accuracy,f1 micro,f1 macro
0,random,0.473815,0.473815,0.33448
1,NB,0.591022,0.591022,0.274062
2,SGD,0.573566,0.573566,0.459578
3,LinearSVC,0.548628,0.548628,0.454663


In [619]:
types = X_test.groupby(['type'])


for name, group in types:
    for clf_name in list(acc_scores.keys()):
        #TODO: use only pos and neg inside groups
        fscore=metrics.f1_score(group['y_test'], group[clf_name], average='micro') 
        X_test.loc[X_test.type==name, 'f1' + clf_name] = fscore
        #print(fscore)

In [620]:
f1_types = X_test.groupby('type').mean()
f1_types = f1_types.reset_index()
f1_types = f1_types[['type','f1random', 'f1NB','f1SGD','f1LinearSVC']]
f1_types.to_csv(f'../results/opinions_f1{target}_types_{lang}.csv', index=False)
f1_types.head()

Unnamed: 0,type,f1random,f1NB,f1SGD,f1LinearSVC
0,agreement,0.5,0.0,0.0,0.0
1,content,0.0,0.5,0.5,0.5
2,criticism,0.5,0.0,1.0,1.0
3,disagreement,0.0,0.0,0.0,0.0
4,doubing,0.0,0.0,0.0,0.0


In [621]:
import os

out_file = '../dataset/wiki/opinions_predicted.csv'
ds_out = None

if os.path.exists(out_file):
    print('reading output file')
    ds_out = pd.read_csv(out_file)
else:
    print('reading annotated file')
    ds_out = pd.read_csv('../dataset/wiki/opinions_annotated.csv')
    

ix = ds_out.lang==lang
X_ds = vectorizer.transform(ds_out[ix].text)
ds_out.loc[ix, target + '_pred'] = clf.predict(X_ds)
ds_out.to_csv(out_file, index=False)
print(f'predicting target:{target}, lang:{lang} size:{ds_out.shape}.. done')

reading output file
predicting target:sentiment, lang:en size:(12104, 33).. done


In [622]:
ds_out.columns

Index(['index', 'page_id', 'page_title', 'revision_id', 'turn_id',
       'contributor', 'timestamp', 'topic', 'raw', 'text', 'type', 'lang',
       'creation_dt', 'revision_uid', 'turn_uid', 'page_url', 'country', 'cc2',
       'cc3', 'area', 'sent_score', 'sent_magnitude', 'sentiment', 'type1',
       'stance1', 'sentiment1', 'type2', 'stance2', 'sentiment2', 'stance',
       'sentiment_y', 'stance_pred', 'sentiment_pred'],
      dtype='object')

In [623]:
print(ds_out[ds_out[target + '_pred'].isnull()].shape)

(0, 33)
