University of Zagreb<br>
Faculty of Electrical Engineering and Computing

# Text Analysis and Retrieval (TAR)

<a href="http://www.fer.unizg.hr/predmet/apt">http://www.fer.unizg.hr/predmet/apt</a>

2015./2016.

# Project theme 13: Tweet classification


(c) 2016 Group nedovrs: Tomislav Marinković, Josip Milić, Domagoj Pereglin

*Version 0.95*

Date: **05.06.2016.**<br>

<h1>Tweet classifiers</h1>

<h3>Packages:</h3>

In [1]:
# -*- coding: utf-8 -*-
import os, re, time, string, pandas, pickle
import numpy as np

from subprocess import call

from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.cross_validation import train_test_split, cross_val_score


from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
presentation_mode = False

<h3>Used directories:</h3>

In [5]:
annotations_dir  = '../annotations/'
preprocessed_dir = 'processed/'
intermediate_dir = 'intermediate/'

<h3>Tweet classes (categories):</h3>

In [6]:
tweet_categories = {}
tweet_categories[2] = 'DEALS'
tweet_categories[3] = 'NEWS_TECHNOLOGY'
tweet_categories[4] = 'NEWS_POLITICS'
tweet_categories[5] = 'NEWS_SPORT'
tweet_categories[6] = 'NEWS_REST'
tweet_categories[7] = 'REST'

<h3>Annotations information loading:</h3>

In [7]:
annotations_counter = {}
annotations_file_names = os.listdir(annotations_dir)
dataset_names = [x.split('_')[1] for x in annotations_file_names]
print 'Annotated datasets: %s\n' %  ', '.join(dataset_names)
annotations = []
for annotations_file_name in annotations_file_names:
    annotation_file_lines = open(annotations_dir+annotations_file_name,'r').readlines()
    annotations += [int(x.rstrip().split(';')[-1]) for x in annotation_file_lines]
annotations_set = set(annotations)
for c in annotations_set:
    annotations_counter[c] = len(filter(lambda x : x==c,annotations))

annotation_classes = sorted(annotations_counter.keys())
header = ['Name','Count']
print pandas.DataFrame([(tweet_categories[c],annotations_counter[c]) for c in annotation_classes], annotation_classes, header)   

c_max,c_max_count = sorted(annotations_counter.items(),key=lambda x: x[1])[-1]
print '\nSum: %d' % sum(annotations_counter.values())
print 'Number of categories: %d' % len(annotation_classes)
print 'Most common category: %d - %s' % (c_max,tweet_categories[c_max])

Annotated datasets: bugonline, hrtsport, indexhr, oglasnik, politikaplus, posaohr

              Name  Count
2            DEALS    976
3  NEWS_TECHNOLOGY   1043
4    NEWS_POLITICS   1657
5       NEWS_SPORT   1052
6        NEWS_REST   2738
7             REST   3257

Sum: 10723
Number of categories: 6
Most common category: 7 - REST


<h3>Annotated preprocessed dataset loading:</h3>

In [8]:
preprocessed_file_names = os.listdir(preprocessed_dir)
preprocessed = []
for preprocessed_file_name in preprocessed_file_names:
    preprocessed += [x.rstrip().split('<|>') for x in open(preprocessed_dir+preprocessed_file_name,'r')]
preprocessed = map(lambda (s,c): (s.rstrip(),int(c)), preprocessed)

preprocessed = sorted(preprocessed)
print 'Preprocessed dataset size: %d' % len(preprocessed)

Preprocessed dataset size: 10723


<h3>Dataset split:</h3>

In [9]:
train_part = 0.7
x_all = []
y_all = []
for s,c in preprocessed:
    x_all.append(s)
    y_all.append(c)

x_train, x_test, y_train, y_test = train_test_split(x_all,y_all,test_size=1-train_part, random_state=42)

<h3>Special features (words):</h3>

In [10]:
'''
tweet_categories[1] = 'PRIVATE'
tweet_categories[2] = 'DEALS'
tweet_categories[3] = 'NEWS_TECHNOLOGY'
tweet_categories[4] = 'NEWS_POLITICS'
tweet_categories[5] = 'NEWS_SPORT'
tweet_categories[6] = 'NEWS_REST'
tweet_categories[7] = 'REST'
'''

enhance_sets = {}
enhance_sets[1] = set() # (PRIVATE) not used 
enhance_sets[2] = set(['posao','prodaja','stan','automobil','oglasnikpopusti','php','java','css','html','dev','androiddev','mysql','linux','c','programer','javite','sysadmin','developere','natječaj','konobar'])
enhance_sets[3] = set(['seum','uber', 'airbnb','google','amazon','apple','haker','digitalan','gfxbench','mwc','samsung','gaming','twitter'])
enhance_sets[4] = set(['udruga','franak','uhljeb','komisija','đukanović','jokić','šustar','banka'])
enhance_sets[5] = set(['doping','uefacom','ademi','moo','čačić','euro','nogomet','gnkdinamo'])
enhance_sets[6] = set(['naoblaka','vrijeme','sunčan','sunčano','naoblak','lokalan','pljusak','grmljavina','pretežno','pretežan','toplo'])
enhance_sets[7] = set(['poveznica'])

<h3>Creation of TF-IDF vector:</h3>

In [11]:
def enhance_vectors(list_of_data, tfidf_vectors_of_data, print_progress = True):
    enhanced_vectors = tfidf_vectors_of_data.copy()
    enhanced_vectors._shape = (tfidf_vectors_of_data.shape[0],tfidf_vectors_of_data.shape[1]+6)
    indptr = enhanced_vectors.indptr
    data = enhanced_vectors.data
    indices = enhanced_vectors.indices
    
    
    for i in range(tfidf_vectors_of_data.shape[0]):
        words = set(list_of_data[i].strip().split(' '))
        for ann in range(2,8):
            enhancement = len(enhance_sets[ann].intersection(words))
            data = np.insert(data,indptr[i+1],enhancement)
            indices = np.insert(indices,indptr[i+1],tfidf_vectors_of_data.shape[1]+(ann-2))
            for j in range(i+1,indptr.shape[0]):
                indptr[j] += 1
        
        if (i % 1000 == 0) and i != 0:
            if print_progress: print 'Enhanced: %d/%d vectors' % (i,tfidf_vectors_of_data.shape[0])
        
    csr_mat = csr_matrix( (data,indices,indptr), shape=enhanced_vectors.shape )
    return csr_mat

<h3>Vectorizer and train and test TF-IDF vectors creation and enhancement:</h3>

In [12]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,)
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)
print 'Number of training samples (TF-IDF vectors): %d' % X_train.shape[0]
print 'Number of test samples (TF-IDF vectors): %d' % X_test.shape[0]
print 'Vocabulary size: %d' % X_train.shape[1]
print ''


i = 492
print x_train[i]
print 'Example vector (only nonzero values, index = %d) before enhancement:' %i
print 'Vector:', X_train.data[X_train.indptr[i]:X_train.indptr[i+1]].tolist()
print ''

print 'Enhancing %d vectors...' % X_train.shape[0]
if presentation_mode:
    X_train = pickle.load( open( "saved_objects/X_train.pkl", "rb" ) )
else:
    X_train = enhance_vectors(x_train, X_train)
    pickle.dump( X_train, open( "saved_objects/X_train.pkl", "wb" ) )
print 'Enhancement of %d vectors is finished!' % X_train.shape[0]

print ''
print 'Example vector (only nonzero values, index = %d) after enhancement:' %i
print 'Vector:', X_train.data[X_train.indptr[i]:X_train.indptr[i+1]].tolist()

print ''
print 'Enhancing %d vectors...' % X_test.shape[0]
if presentation_mode:
    X_test = pickle.load( open( "saved_objects/X_test.pkl", "rb" ) )
else:
    X_test = enhance_vectors(x_test, X_test)
    pickle.dump( X_test, open( "saved_objects/X_test.pkl", "wb" ) )
print 'Enhancement of %d vectors is finished!' % X_test.shape[0]

Number of training samples (TF-IDF vectors): 7506
Number of test samples (TF-IDF vectors): 3217
Vocabulary size: 14822

arijan ademi danas biti pred @ uefacom iznositi svoj obrana u veza navodan korištenje doping
Example vector (only nonzero values, index = 492) before enhancement:
Vector: [0.34161449859165066, 0.2903083802509154, 0.2582952219082173, 0.26959725666388407, 0.29525031583425176, 0.32660842094820464, 0.2512243552424637, 0.32660842094820464, 0.34161449859165066, 0.3077029989545338, 0.21393204220662432, 0.1765465999287678, 0.08758828385256447]

Enhancing 7506 vectors...
Enhanced: 1000/7506 vectors
Enhanced: 2000/7506 vectors
Enhanced: 3000/7506 vectors
Enhanced: 4000/7506 vectors
Enhanced: 5000/7506 vectors
Enhanced: 6000/7506 vectors
Enhanced: 7000/7506 vectors
Enhancement of 7506 vectors is finished!

Example vector (only nonzero values, index = 492) after enhancement:
Vector: [0.34161449859165066, 0.2903083802509154, 0.2582952219082173, 0.26959725666388407, 0.2952503158342

<h3>Vectorizer and TF-IDF vectors of all data creation and enhancement:</h3>

In [13]:
vectorizer_all = TfidfVectorizer(sublinear_tf=True, max_df=0.7,)
X_all = vectorizer_all.fit_transform(x_all)

print 'Enhancing %d vectors...' % X_all.shape[0]
if presentation_mode:
    X_all = pickle.load( open( "saved_objects/X_all.pkl", "rb" ) )
else:
    X_all = enhance_vectors(x_all, X_all)
    pickle.dump( X_all, open( "saved_objects/X_all.pkl", "wb" ) )
print 'Enhancement of %d vectors is finished!' % X_all.shape[0]

Enhancing 10723 vectors...
Enhanced: 1000/10723 vectors


KeyboardInterrupt: 

<h2>Classifiers:</h2>

<h3>SVM classifier creation with optimal C:</h3>

In [14]:
def get_svm_classifier(X_train, y_train):
    print 'Searching for optimal SVM classifier...'
    parameters = {'C': [2*float(x)/20 for x in list(range(1,20,1))]}
    print 'Parameters search:', parameters
    #Cs = np.logspace(-10, -1, 10)
    clf_svm = LinearSVC()
    clf_svm_grid = GridSearchCV(estimator=clf_svm, param_grid=parameters, n_jobs=1)
    clf_svm_grid.fit(X_train,y_train)
    clf_svm = clf_svm_grid.best_estimator_
    return clf_svm

<h3>LogReg classifier creation with optimal C:</h3>

In [15]:
def get_logReg_classifier(X_train,y_train):
    print 'Searching for optimal LogReg classifier...'
    #clf_log_reg = LogisticRegression(C = 1)
    parameters = {'C': [float(x)/4 for x in list(range(1,180,10))]}
    print 'Parameters search:', parameters
    clf_log_reg = LogisticRegression()
    clf_log_reg_grid = GridSearchCV(estimator=clf_log_reg, param_grid=parameters,n_jobs=1)
    clf_log_reg_grid.fit(X_train,y_train)
    clf_log_reg = clf_log_reg_grid.best_estimator_
    return clf_log_reg

<h3>KNN classifier creation with optimal number of neighbors:</h3>

In [16]:
def get_knn_classifier(X_train,y_train):
    print 'Searching for optimal KNN classifier...'
    #clf_knn = KNeighborsClassifier(n_neighbors=40)
    parameters = {'n_neighbors':list(range(1,50,3))}
    print 'Parameters search:', parameters
    clf_knn = KNeighborsClassifier()
    clf_knn_reg_grid = GridSearchCV(estimator=clf_knn, param_grid=parameters, n_jobs=1)
    clf_knn_reg_grid.fit(X_train,y_train)
    clf_knn = clf_knn_reg_grid.best_estimator_
    return clf_knn

<h3>Dummy classifiers creation:</h3>

In [17]:
class TotallyDumbClassifier():
    def fit(self,_,y_train):
        self.most_common = max(set(y_train), key=y_train.count)
    def predict(self,x_test):
        return np.array([self.most_common]*x_test.shape[0])
    def __str__(self):
        
        return "TotallyDumbClassifier(most_common='%d')" % self.most_common
        
def get_dummy_classifier(X_train,y_train):
    clf_dummy = DummyClassifier(random_state=42)
    clf_dummy.fit(X_train, y_train) 
    return clf_dummy

def get_dummy_custom_classifier(X_train,y_train):
    clf_dummy_custom = TotallyDumbClassifier()
    clf_dummy_custom.fit(X_train, y_train)
    return clf_dummy_custom   

<h3>Classifiers evaluation:</h3>

In [21]:
def stat_latex():
    lat = ''
     
    return lat

def statistics(predict,y_test):
    print 'Number of predictions per category:'
    counter = {}
    for p in annotation_classes:
        counter[p] = 0
        
    for p in predict:
        counter[p] += 1
    for p in annotation_classes:
        print p,'-', counter[p]
    b = 0
    for i in range(len(predict)):
        if predict[i] == y_test[i]:
            b += 1
    print ''
    print 'Confusion matrix:'
    cm = confusion_matrix(y_test, predict)
 
    print pandas.DataFrame.from_items([(tweet_categories[annotation_classes[i]],cm[i]) for i in range(len(annotation_classes))],orient='index', columns=annotation_classes)
    print ''
    precisions, recalls, F1s, supports = precision_recall_fscore_support(y_test, predict)
    matrix_items = [(tweet_categories[annotation_classes[i]],[precisions[i],recalls[i],F1s[i]]) for i in range(len(annotation_classes))]
    matrix_items.append(('Σ',precision_recall_fscore_support(y_test, predict, average='macro')[:-1]))
    
    print matrix_items
    print pandas.DataFrame.from_items(matrix_items,orient='index', columns=['Precision','Recall','F1'])   

    print ''
    print "Accuracy:  %f %s" % ( accuracy_score(y_test, predict)*100,'%')
    
def evaluate_classifiers(clfs, X_test, y_test, X_all = None, y_all = None, cross_evaluation = False, cv = None):
    for i in range(len(clfs)):
        print clf_names[i]+'\n'
        clf = clfs[i]
        print clf

        pred = clf.predict(X_test)
        statistics(pred,y_test)
        if clf != clf_dummy_custom:
            if cross_evaluation:
                print 'Calculating cross-validation (cv = %d)...' % cv
                scores = cross_val_score(clf, X_all, y_all, cv=cv,scoring='f1_weighted')
                print scores
                print("Cross-validated accuracy: %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std() * 2))

        print '-'*70+'\n'
    

<h3>All used classifiers:</h3>

In [19]:
clf_dummy = get_dummy_classifier(X_train,y_train)
clf_dummy_custom = get_dummy_custom_classifier(X_train,y_train)
if presentation_mode:
    clf_svm = pickle.load( open( "saved_objects/clf_svm.pkl", "rb" ) )
    clf_log_reg = pickle.load( open( "saved_objects/clf_log_reg.pkl", "rb" ) )
    clf_knn = pickle.load( open( "saved_objects/clf_knn.pkl", "rb" ) )
else:
    clf_svm = get_svm_classifier(X_train,y_train)
    pickle.dump( clf_svm, open( "saved_objects/clf_svm.pkl", "wb" ) )
    
    clf_log_reg = get_logReg_classifier(X_train,y_train)
    pickle.dump( clf_log_reg, open( "saved_objects/clf_log_reg.pkl", "wb" ) )
    
    clf_knn = get_knn_classifier(X_train,y_train)
    pickle.dump( clf_knn, open( "saved_objects/clf_knn.pkl", "wb" ) )
    
    
clfs = [clf_dummy,clf_dummy_custom,clf_svm,clf_log_reg,clf_knn]

for clf in clfs:
    print clf
    print ''

clf_names = ['Dummy classifier','Totally dumb classifier (returns most common)','Support vector machine','Logistic regression','k-nearest neighbors']

Searching for optimal SVM classifier...
Parameters search: {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]}
Searching for optimal LogReg classifier...
Parameters search: {'C': [0.25, 2.75, 5.25, 7.75, 10.25, 12.75, 15.25, 17.75, 20.25, 22.75, 25.25, 27.75, 30.25, 32.75, 35.25, 37.75, 40.25, 42.75]}
Searching for optimal KNN classifier...
Parameters search: {'n_neighbors': [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49]}
DummyClassifier(constant=None, random_state=42, strategy='stratified')

TotallyDumbClassifier(most_common='7')

LinearSVC(C=0.5, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

LogisticRegression(C=15.25, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_st

In [22]:
evaluate_classifiers(clfs, X_test, y_test, X_train, y_train, cross_evaluation = True, cv = 10)

Dummy classifier

DummyClassifier(constant=None, random_state=42, strategy='stratified')
Number of predictions per category:
2 - 277
3 - 332
4 - 462
5 - 294
6 - 798
7 - 1054

Confusion matrix:
                  2   3    4   5    6    7
DEALS            28  34   41  32   66  101
NEWS_TECHNOLOGY  32  46   39  25   79  105
NEWS_POLITICS    46  61   77  54  119  169
NEWS_SPORT       27  23   43  23   85  112
NEWS_REST        66  81  104  72  221  264
REST             78  87  158  88  228  303

[('DEALS', [0.10108303249097472, 0.092715231788079472, 0.096718480138169263]), ('NEWS_TECHNOLOGY', [0.13855421686746988, 0.1411042944785276, 0.1398176291793313]), ('NEWS_POLITICS', [0.16666666666666666, 0.14638783269961977, 0.15587044534412955]), ('NEWS_SPORT', [0.078231292517006806, 0.073482428115015971, 0.075782537067545314]), ('NEWS_REST', [0.27694235588972432, 0.27351485148514854, 0.27521793275217937]), ('REST', [0.28747628083491461, 0.321656050955414, 0.30360721442885774]), ('\xce\xa3', (0.17482

<h3>Preprocessing and classification of tweet text:</h3>

In [89]:
def purge(tweet_text):
    purged = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', tweet_text).strip()
    purged_spl = purged.split(' ')
    if (purged) and (purged_spl[-1][0] in '^'):
        purged = ' '.join(purged_spl[:-1]).strip()
    for forbidden in '!&.?-–^,;:"\'/[]{}()<>':
        while forbidden in purged:
            purged = purged.replace(forbidden,'')
    
    purged = purged.strip()
    if purged == '':
        return 'poveznica'
    return purged

def classify_tweet_text(tweet_text, clfs, vectorizer):
    target_name = intermediate_dir + 'purged_tweet.txt'
    
    open(target_name,'w').write(purge(tweet_text)+'\n')
    command = ['"NLPToolkit.exe"','-pre', '-pos', '-lm', '"'+target_name+'"', '"'+target_name.replace('purged','pre_purged')+'"']
    #print ' '.join(command)
    call(' '.join(command),shell=True)
    f = open(target_name.replace('purged','pre_purged'),'r')
    processed_tweet_text = ''
    for line in f:
        if ( line != "\n"):
            word = line.split('\t')[2].strip().lower()
            processed_tweet_text += word+' '
        else:
            processed_tweet_text = processed_tweet_text.rstrip()
    f.close()
    os.remove(target_name)
    os.remove(target_name.replace('purged','pre_purged'))
    processed_tweet_text = processed_tweet_text
    print 'Original text: "%s"' % tweet_text
    print 'Preprocessed text: "%s"' % processed_tweet_text
    print ''
    X_tweet = enhance_vectors([processed_tweet_text],vectorizer.transform([processed_tweet_text]), print_progress = False)
    
    for i in range(len(clfs)):
        print clf_names[i],'-',
        clf = clfs[i]
        print tweet_categories[clf.predict(X_tweet)[0]]
        print ''    

<h3>Whole dataset as train set</h3>

In [90]:
clf_dummy_all = get_dummy_classifier(X_all,y_all)
clf_dummy_custom_all = get_dummy_custom_classifier(X_all,y_all)
    
    
if presentation_mode:
    clf_svm_all = pickle.load( open( "saved_objects/clf_svm_all.pkl", "rb" ) )
    clf_log_reg_all = pickle.load( open( "saved_objects/clf_log_reg_all.pkl", "rb" ) )
    clf_knn_all = pickle.load( open( "saved_objects/clf_knn_all.pkl", "rb" ) )
else:
    clf_svm_all = get_svm_classifier(X_all,y_all)
    pickle.dump( clf_svm_all, open( "saved_objects/clf_svm_all.pkl", "wb" ) )
    
    clf_log_reg_all = get_logReg_classifier(X_all,y_all)
    pickle.dump( clf_log_reg_all, open( "saved_objects/clf_log_reg_all.pkl", "wb" ) )
    
    clf_knn_all = get_knn_classifier(X_all,y_all)
    pickle.dump( clf_knn_all, open( "saved_objects/clf_knn_all.pkl", "wb" ) )
clfs_all = [clf_dummy_all,clf_dummy_custom_all,clf_svm_all,clf_log_reg_all,clf_knn_all]

for clf in clfs_all:
    print clf
    print ''

clf_names_all = ['Dummy classifier','Totally dumb classifier (returns most common)','Support vector machine','Logistic regression','k-nearest neighbors']

DummyClassifier(constant=None, random_state=42, strategy='stratified')

TotallyDumbClassifier(most_common='7')

LinearSVC(C=0.9, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

LogisticRegression(C=40.25, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=40, p=2, weights='uniform')



<h3>Single tweet classification:</h3>

In [91]:
example = 'Karamarko je rekao da je Petrov lagao'
classify_tweet_text(example,clfs_all,vectorizer_all)

Original text: "Karamarko je rekao da je Petrov lagao"
Preprocessed text: "﻿karamarko biti reći da biti petrov lagati"

Dummy classifier - NEWS_TECHNOLOGY

Totally dumb classifier (returns most common) - REST

Support vector machine - NEWS_POLITICS

Logistic regression - NEWS_POLITICS

k-nearest neighbors - NEWS_POLITICS



<h3>Multiple tweets (file) classification:</h3>

In [92]:
for file_example in open('examples.txt','r'):
    classify_tweet_text(file_example,clfs_all,vectorizer_all)
    print '-'*70+'\n'

Original text: "Hrvatska je kažnjena s dvije utakmice bez navijača u  kvalifikacijama za SP, @UvijekVjerniMan reagirali su na odluku.
"
Preprocessed text: "﻿hrvatska biti kažnjen s dva utakmica bez navijač u kvalifikacija za sp @ uvijekvjerniman reagirati biti na odluka"

Dummy classifier - NEWS_TECHNOLOGY

Totally dumb classifier (returns most common) - REST

Support vector machine - NEWS_SPORT

Logistic regression - NEWS_SPORT

k-nearest neighbors - NEWS_SPORT

----------------------------------------------------------------------

Original text: "Damir Burić više nije trener @hajduk! Novi trener je Slovenac Pušnik ili Željko Kopić. > http://bit.ly/1PkMsMb 
"
Preprocessed text: "﻿damir burić vio biti trener @ hajduk nov trener biti slovenac pušnik ili željko kopić"

Dummy classifier - NEWS_TECHNOLOGY

Totally dumb classifier (returns most common) - REST

Support vector machine - NEWS_SPORT

Logistic regression - NEWS_SPORT

k-nearest neighbors - NEWS_SPORT

--------------------------

<h3>Classifier objects creation:</h3>

In [48]:
clfs_dict = {}
clfs_dict['clf_svm'] = clf_svm_all
clfs_dict['clf_log_reg'] = clf_log_reg_all
clfs_dict['clf_knn'] = clf_knn_all

pickle.dump( clfs_dict, open( "saved_objects/classifiers.pkl", "wb" ) )
pickle.dump( vectorizer_all, open( "saved_objects/vectorizer.pkl", "wb" ) )

In [None]:
clfs_dict = pickle.load( open( "saved_objects/classifiers.pkl", "rb" ) )