In [1]:
import pandas as pd
import numpy as np
import os
import gensim
import nltk
import pickle
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
nltk.download('stopwords')
from tpot import TPOTClassifier
from datetime import datetime
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import SCORERS, accuracy_score, f1_score
import h2o
from h2o.automl import H2OAutoML

stop_words = set(nltk.corpus.stopwords.words('english'))
DATE_FORMAT = "%Y-%m-%d"
MIN_DATE = datetime.strptime('2009-07-07', DATE_FORMAT)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jamil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jamil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def get_terms(terms_file):
    term_dict = {}
    
    for line in terms_file:
        key, word = line.split(",")
        term_dict[int(key)] = word.split("\n")[0]
        
    return term_dict

def get_labels(labels_file):
    curr_article = -1
    labels_votes = []
    article_dict = {}

    for line in labels_file:
        split_line = line.split(",")

        if int(split_line[0]) != curr_article:
            curr_article = int(split_line[0])
            article_dict[curr_article] = True
            labels_votes.append([0, 0, 0]) # index 0 = negative, 1 = irrelevant, 2 = positive

        if 'negative' in split_line[2]:
            labels_votes[-1][0] += 1
        elif 'irrelevant' in split_line[2]:
            labels_votes[-1][1] += 1
        else:
            labels_votes[-1][2] += 1

    labels = []

    for vote_counts in labels_votes:
        labels.append(np.argmax(vote_counts))
        
    return labels, article_dict

def create_bow_and_features(word_list, term_dict):
    word_list = word_list.split(",")
    
    doc = []
    
    for i, entry in enumerate(word_list):
        if i > 2:
            word, freq = entry.split(":")

            if term_dict[int(word)] not in stop_words:
                doc.append((int(word), int(freq)))
            
    curr_date = datetime.strptime(word_list[1], DATE_FORMAT)
    delta = curr_date - MIN_DATE
    return doc, word_list[0], delta.days, word_list[2]

def preprocess_datasets(articles_dir, terms_dir, annotations_dir, preload_lda=False):
    terms_file = open(terms_dir)
    term_dict = get_terms(terms_file)
    
    dataset = []
    features = []
    articles_file = open(articles_dir)

    for line in articles_file:
        article, article_id, date_in_days, provider = create_bow_and_features(line, term_dict)
        dataset.append(article)
        features.append([article_id, provider, date_in_days])

    features = pd.DataFrame(features)
    
    ldamodel = None
    num_topics = 30
    
    if preload_lda is False:
        ldamodel = gensim.models.ldamodel.LdaModel(dataset, num_topics = num_topics, passes=15)
        ldamodel.save('lda/irish_sentiment/model5.gensim')
    else:
        ldamodel =  gensim.models.ldamodel.LdaModel.load('lda/irish_sentiment/model5.gensim')
        
    topic_features = []

    for doc in dataset:
        row = [0.0 for i in range(num_topics)]

        for topic, prob in ldamodel[doc]:
            row[topic] = prob

        topic_features.append(row)

    topic_features = pd.DataFrame(topic_features)
    
    features = pd.concat([features, topic_features], axis=1)

    columns = ['id', 'publisher', 'date_in_days']
    topic_columns = [i for i in range(num_topics)]
    columns = np.concatenate([columns, topic_columns])
    features.columns = columns
    
    labels_file = open(annotations_dir)
    labels, article_dict = get_labels(labels_file)
    
    drop_list = []

    for index, row, in features.iterrows():
        if int(row['id']) not in article_dict:
            drop_list.append(index)
            
    features = features.drop(drop_list, axis=0)
    features = features.drop('id', axis=1)
    features = pd.get_dummies(features)
    features = features.reset_index()
    features = features.drop('index', axis=1)
    
    return features, np.array(labels)

In [4]:
terms_dir = r'datasets\irish_sentiment\sentiment_all_terms.csv'
articles_dir = r'datasets\irish_sentiment\sentiment_all_articles.csv'
annotations_dir = r'datasets\irish_sentiment\sentiment_all_annotations.csv'

features, labels = preprocess_datasets(articles_dir, terms_dir, annotations_dir, preload_lda=True)


In [5]:
# TPOT Testing
kf = KFold(5, shuffle=True, random_state=42)

acc = []
f1 = []

for train_ind, val_ind in kf.split(features, labels):
    X_train, y_train = features.iloc[train_ind], labels[train_ind]
    X_val, y_val = features.iloc[val_ind], labels[val_ind]
    
    tpot = TPOTClassifier(max_time_mins=60, verbosity=2)
    tpot.fit(X_train, y_train)
    clf = tpot.fitted_pipeline_
    
    
    acc.append(SCORERS['accuracy'](clf, X_val, y_val))
    f1.append(SCORERS['f1_macro'](clf, X_val, y_val))



HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…

Generation 1 - Current best internal CV score: 0.6453028798411121
Generation 2 - Current best internal CV score: 0.6453028798411121
Generation 3 - Current best internal CV score: 0.6475727053482763
Generation 4 - Current best internal CV score: 0.6475727053482763
Generation 5 - Current best internal CV score: 0.6483359341750603


Traceback (most recent call last):
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\stopit\utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\tpot\decorators.py", line 57, in time_limited_call
    func(*args)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.

Generation 6 - Current best internal CV score: 0.6505773868633848
Generation 7 - Current best internal CV score: 0.6543367853596254
Generation 8 - Current best internal CV score: 0.6543367853596254


Traceback (most recent call last):
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\stopit\utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\tpot\decorators.py", line 57, in time_limited_call
    func(*args)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\ensemble\_forest.py", line 392, in fit
    for i, t in enumerate(trees))
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\joblib\parallel.py", line 1032, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\joblib\parallel.py", line 847, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\joblib\parallel.py", line 765, in _dispatch
    job = s

Generation 9 - Current best internal CV score: 0.6543367853596254
Generation 10 - Current best internal CV score: 0.6543367853596254
Generation 11 - Current best internal CV score: 0.6543367853596254
Generation 12 - Current best internal CV score: 0.6551028514682934
Generation 13 - Current best internal CV score: 0.6565952617392538
Generation 14 - Current best internal CV score: 0.6588509008369982
Generation 15 - Current best internal CV score: 0.6588509008369982
Generation 16 - Current best internal CV score: 0.6588707618101859
Generation 17 - Current best internal CV score: 0.6588707618101859
Generation 18 - Current best internal CV score: 0.6596056178181303
Generation 19 - Current best internal CV score: 0.6596056178181303
Generation 20 - Current best internal CV score: 0.6596056178181303
Generation 21 - Current best internal CV score: 0.6596056178181303
Generation 22 - Current best internal CV score: 0.6596084551000142
Generation 23 - Current best internal CV score: 0.6596084551000

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…

Traceback (most recent call last):
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\stopit\utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\tpot\decorators.py", line 57, in time_limited_call
    func(*args)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.

Generation 1 - Current best internal CV score: 0.6384934033196198
Generation 2 - Current best internal CV score: 0.6384934033196198
Generation 3 - Current best internal CV score: 0.650568875017733
Generation 4 - Current best internal CV score: 0.650568875017733
Generation 5 - Current best internal CV score: 0.650568875017733
Generation 6 - Current best internal CV score: 0.650568875017733
Generation 7 - Current best internal CV score: 0.650568875017733
Generation 8 - Current best internal CV score: 0.65126968364307
Generation 9 - Current best internal CV score: 0.65126968364307
Generation 10 - Current best internal CV score: 0.65126968364307
Generation 11 - Current best internal CV score: 0.65126968364307
Generation 12 - Current best internal CV score: 0.6520783089799972
Generation 13 - Current best internal CV score: 0.6520783089799972
Generation 14 - Current best internal CV score: 0.6550574549581502
Generation 15 - Current best internal CV score: 0.6550574549581502
Generation 16 - C

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…

Generation 1 - Current best internal CV score: 0.6490282309547454
Generation 2 - Current best internal CV score: 0.6550432685487303
Generation 3 - Current best internal CV score: 0.6588282025819265
Generation 4 - Current best internal CV score: 0.6595772449992906
Generation 5 - Current best internal CV score: 0.6595772449992906
Generation 6 - Current best internal CV score: 0.6595772449992906
Generation 7 - Current best internal CV score: 0.6611008653709746
Generation 8 - Current best internal CV score: 0.6625989502057029
Generation 9 - Current best internal CV score: 0.6625989502057029
Generation 10 - Current best internal CV score: 0.6633309689317635
Generation 11 - Current best internal CV score: 0.6633309689317635
Generation 12 - Current best internal CV score: 0.6641055468860831
Generation 13 - Current best internal CV score: 0.6641055468860831
Generation 14 - Current best internal CV score: 0.6648574265853312
Generation 15 - Current best internal CV score: 0.6671244148106116
Gene

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…

Generation 1 - Current best internal CV score: 0.6324840402894027
Generation 2 - Current best internal CV score: 0.6324840402894027
Generation 3 - Current best internal CV score: 0.6438076322882679
Generation 4 - Current best internal CV score: 0.6438076322882679
Generation 5 - Current best internal CV score: 0.6438076322882679
Generation 6 - Current best internal CV score: 0.6460519222584764
Generation 7 - Current best internal CV score: 0.6460519222584764
Generation 8 - Current best internal CV score: 0.6460519222584764
Generation 9 - Current best internal CV score: 0.6468265002127962
Generation 10 - Current best internal CV score: 0.6468265002127962
Generation 11 - Current best internal CV score: 0.6468265002127962
Generation 12 - Current best internal CV score: 0.6468265002127962
Generation 13 - Current best internal CV score: 0.6490764647467726
Generation 14 - Current best internal CV score: 0.6490764647467726


Traceback (most recent call last):
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\stopit\utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\tpot\decorators.py", line 57, in time_limited_call
    func(*args)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\tpot\builtins\one_hot_encoder.py", line 399, in fit_transform

Generation 15 - Current best internal CV score: 0.6490764647467726
Generation 16 - Current best internal CV score: 0.6490764647467726


Traceback (most recent call last):
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\stopit\utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\tpot\decorators.py", line 57, in time_limited_call
    func(*args)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.

Generation 17 - Current best internal CV score: 0.6490764647467726
Generation 18 - Current best internal CV score: 0.6490764647467726
Generation 19 - Current best internal CV score: 0.6490764647467726
Generation 20 - Current best internal CV score: 0.6490906511561924
Generation 21 - Current best internal CV score: 0.6566122854305576
Generation 22 - Current best internal CV score: 0.6566122854305576
Generation 23 - Current best internal CV score: 0.6566122854305576
Generation 24 - Current best internal CV score: 0.6566122854305576
Generation 25 - Current best internal CV score: 0.6566122854305576
Generation 26 - Current best internal CV score: 0.6566122854305576
Generation 27 - Current best internal CV score: 0.6566122854305576
Generation 28 - Current best internal CV score: 0.6566122854305576
Generation 29 - Current best internal CV score: 0.6566122854305576
Generation 30 - Current best internal CV score: 0.6566122854305576

60.01 minutes have elapsed. TPOT will close down.
TPOT closed

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…

Generation 1 - Current best internal CV score: 0.6324982266988226
Generation 2 - Current best internal CV score: 0.6362689743225989
Generation 3 - Current best internal CV score: 0.6362689743225989


Traceback (most recent call last):
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\stopit\utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\tpot\decorators.py", line 57, in time_limited_call
    func(*args)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\sklearn\linear_model\_logistic.py", line 1415, in fit
    for class_, warm_start_coef_ in zip(classes_, warm_start_coef))
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\joblib\parallel.py", line 1029, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\joblib\parallel.py", line 847, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\joblib\parallel.p

Generation 4 - Current best internal CV score: 0.6362689743225989
Generation 5 - Current best internal CV score: 0.6407830897999716
Generation 6 - Current best internal CV score: 0.6407830897999716
Generation 7 - Current best internal CV score: 0.6452943679954604
Generation 8 - Current best internal CV score: 0.6452943679954604
Generation 9 - Current best internal CV score: 0.6453000425592282
Generation 10 - Current best internal CV score: 0.6453000425592282
Generation 11 - Current best internal CV score: 0.6453000425592282
Generation 12 - Current best internal CV score: 0.6460661086678962
Generation 13 - Current best internal CV score: 0.6460661086678962
Generation 14 - Current best internal CV score: 0.6468009646758406
Generation 15 - Current best internal CV score: 0.6520726344162292
Generation 16 - Current best internal CV score: 0.6520726344162292
Generation 17 - Current best internal CV score: 0.6520726344162292
Generation 18 - Current best internal CV score: 0.6520726344162292
G

In [6]:
print(acc)
print(f1)
print(neg_loss)

[0.6475903614457831, 0.6234939759036144, 0.6325301204819277, 0.6807228915662651, 0.6746987951807228]
[0.5993773785237396, 0.5375023527197441, 0.6079613095238096, 0.6548687331913934, 0.6408440686855803]
[]


In [8]:
# H2O Testing
h2o.init()
str_labels = ["c" + str(x) for x in labels]
combined = pd.concat([features, pd.DataFrame(str_labels, columns=["class"]).astype(str)], axis=1)
h2o_dataset = h2o.H2OFrame(combined)
h2o_dataset

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM Zulu11.2+3 (build 11.0.1+13-LTS, mixed mode)
  Starting server from C:\Users\Jamil\Anaconda3\envs\NLP\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Jamil\AppData\Local\Temp\tmpt2jlrru_
  JVM stdout: C:\Users\Jamil\AppData\Local\Temp\tmpt2jlrru_\h2o_Jamil_started_from_python.out
  JVM stderr: C:\Users\Jamil\AppData\Local\Temp\tmpt2jlrru_\h2o_Jamil_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,6 days
H2O_cluster_name:,H2O_from_python_Jamil_5f9k98
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.984 Gb
H2O_cluster_total_cores:,6
H2O_cluster_allowed_cores:,6


Parse progress: |█████████████████████████████████████████████████████████| 100%


date_in_days,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,publisher_independent,publisher_irishtimes,publisher_rte,class
1,0.0,0.0,0.0,0.172441,0.0,0,0,0,0.0823194,0.318479,0.0,0.0349418,0.0,0.0,0,0.0,0.0173897,0.0,0,0.0,0.0,0.0,0.253905,0.0,0.0232926,0.0,0.0,0.0,0.0,0.0856273,1,0,0,c0
1,0.0183763,0.197183,0.0,0.0,0.0,0,0,0,0.0801676,0.0,0.0,0.0,0.0353892,0.0,0,0.564642,0.0,0.0,0,0.0,0.0,0.0,0.0470004,0.044255,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,c2
1,0.0,0.0,0.0,0.0,0.0455377,0,0,0,0.352749,0.0840476,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0472732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0229077,0.0,0.44111,1,0,0,c0
1,0.0,0.0,0.0285712,0.0,0.075246,0,0,0,0.180741,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0380127,0.0123964,0.0,0.0715387,0.549619,0.0,0.0,0.0,0.0,0.0,0.0327991,1,0,0,c1
1,0.0545967,0.0684742,0.0,0.0,0.0,0,0,0,0.226768,0.0,0.0640619,0.0599077,0.0,0.0,0,0.106851,0.245072,0.0,0,0.0376349,0.0398107,0.0,0.0268288,0.0,0.0104615,0.0,0.0465236,0.0,0.0,0.0,1,0,0,c0
1,0.0,0.0,0.0,0.0,0.0,0,0,0,0.595213,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.111763,0.0,0.0555266,0.080454,0.0525433,0.0,0.0681247,0.01563,0.0,0.0,0.0149856,1,0,0,c1
1,0.0,0.0,0.0,0.0,0.053908,0,0,0,0.0,0.0,0.0,0.0,0.0,0.112804,0,0.145246,0.0,0.0,0,0.455874,0.0,0.0720247,0.08981,0.0,0.0,0.061805,0.0,0.0,0.0,0.0,0,1,0,c2
1,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.692612,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215381,0.0,0.0,0.0835987,0,1,0,c0
1,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.586376,0.0,0.0375144,0.0,0.0,0,0.0,0.0,0.0,0,0.0510444,0.0256627,0.0,0.273435,0.0,0.0,0.0,0.0158067,0.0,0.0,0.0,0,1,0,c2
1,0.0,0.0,0.0,0.0,0.0,0,0,0,0.114227,0.0,0.0659834,0.0,0.220251,0.0,0,0.0,0.0,0.156956,0,0.0167668,0.0,0.0,0.129265,0.0609422,0.0340686,0.0,0.0,0.165624,0.0150207,0.0,0,1,0,c1




In [9]:
x = h2o_dataset.columns[:-1]
y = h2o_dataset.columns[-1]

In [None]:
acc = []
f1 = []
counter = 0

for train_ind, val_ind in kf.split(h2o_dataset):
    print("> Iteration", counter)
    if counter != 0:
        train = h2o.H2OFrame(combined.iloc[train_ind])
        val = h2o.H2OFrame(combined.iloc[val_ind])
        val_labels = combined.iloc[val_ind]['class']

        aml = H2OAutoML(max_runtime_secs=3600, seed=1)
        aml.train(x=x,y=y, training_frame=train)
        pred = aml.leader.predict(val)
        pred = h2o.as_list(pred[:, 0])

        acc.append(accuracy_score(pred, val_labels))
        f1.append(f1_score(pred, val_labels, average='macro'))
        
    counter += 1
    

> Iteration 0
> Iteration 1
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |
11:19:39.435: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████

In [30]:
# SVM Testing
acc = []
f1 = []

for train_ind, val_ind in kf.split(features, labels):
    X_train, y_train = features.iloc[train_ind], labels[train_ind]
    X_val, y_val = features.iloc[val_ind], labels[val_ind]
    
    svm = SVC(kernel='linear')
    svm.fit(X_train, y_train)
    
    acc.append(SCORERS['accuracy'](svm, X_val, y_val))
    f1.append(SCORERS['f1_macro'](svm, X_val, y_val))

In [35]:
# Random Forest Testing
acc = []
f1 = []

for train_ind, val_ind in kf.split(features, labels):
    X_train, y_train = features.iloc[train_ind], labels[train_ind]
    X_val, y_val = features.iloc[val_ind], labels[val_ind]
    
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    
    acc.append(SCORERS['accuracy'](rf, X_val, y_val))
    f1.append(SCORERS['f1_macro'](rf, X_val, y_val))