In [2]:
import data_helpers
import os
import pandas as pd

# Toggle for what gets loaded from the per-category folders:
#   True  -> only files whose name contains the category, stored under that category;
#   False -> only the remaining files, grouped by the part of the file name
#            that precedes the first '-'.
load_main = True

main_cats = ['ActionName', 'Capability']
# Nested mapping: category (or sub-category) -> dataframe type -> DataFrame.
dataframes = {cat: {} for cat in main_cats} if load_main else {}

for cat in main_cats:
    path = os.path.join(os.getcwd(), "Dataframes", cat, "Processed")
    # os.listdir returns entries in arbitrary order; sorting makes the load
    # order (and the key order of `dataframes`) reproducible between runs.
    ls = sorted(os.listdir(path))
    for file in ls:
        # Keep the file only when "name contains the category" agrees with the toggle.
        if (cat in file) != load_main:
            continue

        df = pd.read_csv(os.path.join(path, file), encoding='utf-8')

        # Discard the "Unnamed: 0" / "Unnamed: 0.1" columns (presumably index
        # columns left over from earlier to_csv round-trips -- TODO confirm).
        # errors="ignore" keeps a file that lacks them from raising KeyError.
        df = df.drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1, errors="ignore")

        # File names are split on '-': the first part is the (sub-)category,
        # the second part is the dataframe type.
        name_parts = file.split('-')
        df_type = name_parts[1]
        if load_main:
            dataframes[cat][df_type] = df
        else:
            sub_cat = name_parts[0]
            dataframes.setdefault(sub_cat, {})[df_type] = df

if not load_main:
    # In sub-category mode the categories to iterate over are whatever was found on disk.
    main_cats = list(dataframes.keys())

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from clf_helpers import do_the_pipeline

def dummy(doc):
    """Identity function: return ``doc`` exactly as it was received.

    Used in this notebook as the ``preprocessor`` argument of the
    vectorizers, so that no extra transformation is applied to a document
    at that stage.

    Args:
        doc: The document to pass through (any object).

    Returns:
        The very same object that was passed in.
    """
    return doc

In [21]:
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import classification_report, confusion_matrix

# Experiment: word n-gram counts (1- to 4-grams) -> SMOTE -> logistic regression.
pipeline = imbPipeline(steps=[
    (
        'vectorizer',
        CountVectorizer(
            analyzer='word',
            ngram_range=(1, 4),
            binary=False,
            preprocessor=dummy,  # identity function: documents are handed over unchanged
        ),
    ),
    ('smt', SMOTE(random_state=42)),  # fixed seed so the resampling is repeatable
    (
        'clf',
        LogisticRegression(
            solver='liblinear',  # reportedly the best choice for small datasets
            class_weight='balanced',  # better results for the imbalanced Capability / NoCapability split
        ),
    ),
])

# Run the experiment for every category and print accuracy plus the classification report.
do_the_pipeline(
    pipeline,
    main_cats,
    dataframes,
    accuracy=True,
    report=True,
    top_features=False,
    neigh=False,
)


>>> ActionName:

# RELATIONS
Vectorizer: W słowniku znajduje się 43659 różnych słów
- Zbiór treningowy:
	ACCURACY: 99.552%
              precision    recall  f1-score   support

  ActionName       0.99      1.00      0.99      1154
NoActionName       1.00      0.99      1.00      2194

 avg / total       1.00      1.00      1.00      3348

- Zbiór testowy:
	ACCURACY: 76.7932%
              precision    recall  f1-score   support

  ActionName       0.65      0.66      0.66        80
NoActionName       0.83      0.82      0.82       157

 avg / total       0.77      0.77      0.77       237


>>> Capability:

# RELATIONS
Vectorizer: W słowniku znajduje się 43659 różnych słów
- Zbiór treningowy:
	ACCURACY: 99.4624%
              precision    recall  f1-score   support

  Capability       1.00      0.99      1.00      2817
NoCapability       0.97      1.00      0.98       531

 avg / total       0.99      0.99      0.99      3348

- Zbiór testowy:
	ACCURACY: 80.1688%
              precis

In [26]:
# Experiment: TF-IDF over word 1- to 5-grams (max_df=0.15) -> SMOTE -> logistic regression.
pipeline = imbPipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 5),
            max_df=0.15,
            preprocessor=dummy,  # identity function: documents are handed over unchanged
        ),
    ),
    ('smt', SMOTE(random_state=42)),  # fixed seed so the resampling is repeatable
    (
        'clf',
        LogisticRegression(
            solver='liblinear',  # reportedly the best choice for small datasets
            class_weight='balanced',  # better results for the imbalanced Capability / NoCapability split
        ),
    ),
])

# Run the experiment for every category and print accuracy plus the classification report.
do_the_pipeline(
    pipeline,
    main_cats,
    dataframes,
    accuracy=True,
    report=True,
    top_features=False,
    neigh=False,
)


>>> ActionName:

# RELATIONS
Vectorizer: W słowniku znajduje się 53614 różnych słów
- Zbiór treningowy:
	ACCURACY: 97.1027%
              precision    recall  f1-score   support

  ActionName       0.97      0.95      0.96      1154
NoActionName       0.97      0.98      0.98      2194

 avg / total       0.97      0.97      0.97      3348

- Zbiór testowy:
	ACCURACY: 76.3713%
              precision    recall  f1-score   support

  ActionName       0.64      0.69      0.66        80
NoActionName       0.83      0.80      0.82       157

 avg / total       0.77      0.76      0.77       237


>>> Capability:

# RELATIONS
Vectorizer: W słowniku znajduje się 53614 różnych słów
- Zbiór treningowy:
	ACCURACY: 98.2079%
              precision    recall  f1-score   support

  Capability       0.99      0.99      0.99      2817
NoCapability       0.93      0.96      0.94       531

 avg / total       0.98      0.98      0.98      3348

- Zbiór testowy:
	ACCURACY: 80.5907%
              preci

In [29]:
# Experiment: unigram word counts -> SMOTE -> multinomial naive Bayes.
pipeline = imbPipeline(steps=[
    (
        'vectorizer',
        CountVectorizer(
            analyzer='word',
            ngram_range=(1, 1),
            preprocessor=dummy,  # identity function: documents are handed over unchanged
        ),
    ),
    ('smt', SMOTE(random_state=42)),  # fixed seed so the resampling is repeatable
    ('clf', MultinomialNB()),
])

# Run the experiment for every category and print accuracy plus the classification report.
do_the_pipeline(
    pipeline,
    main_cats,
    dataframes,
    accuracy=True,
    report=True,
    top_features=False,
    neigh=False,
)


>>> ActionName:

# RELATIONS
Vectorizer: W słowniku znajduje się 2776 różnych słów
- Zbiór treningowy:
	ACCURACY: 85.693%
              precision    recall  f1-score   support

  ActionName       0.74      0.90      0.81      1154
NoActionName       0.94      0.83      0.88      2194

 avg / total       0.87      0.86      0.86      3348

- Zbiór testowy:
	ACCURACY: 76.3713%
              precision    recall  f1-score   support

  ActionName       0.62      0.79      0.69        80
NoActionName       0.87      0.75      0.81       157

 avg / total       0.79      0.76      0.77       237


>>> Capability:

# RELATIONS
Vectorizer: W słowniku znajduje się 2776 różnych słów
- Zbiór treningowy:
	ACCURACY: 88.0824%
              precision    recall  f1-score   support

  Capability       0.99      0.87      0.92      2817
NoCapability       0.58      0.93      0.71       531

 avg / total       0.92      0.88      0.89      3348

- Zbiór testowy:
	ACCURACY: 82.7004%
              precisio

In [30]:
# Experiment: TF-IDF over word 1- to 3-grams, vocabulary capped at 3000 features
# -> SMOTE -> multinomial naive Bayes.
pipeline = imbPipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            max_features=3000,
            preprocessor=dummy,  # identity function: documents are handed over unchanged
        ),
    ),
    ('smt', SMOTE(random_state=42)),  # fixed seed so the resampling is repeatable
    ('clf', MultinomialNB()),
])

# Run the experiment for every category and print accuracy plus the classification report.
do_the_pipeline(
    pipeline,
    main_cats,
    dataframes,
    accuracy=True,
    report=True,
    top_features=False,
    neigh=False,
)


>>> ActionName:

# RELATIONS
Vectorizer: W słowniku znajduje się 3000 różnych słów
- Zbiór treningowy:
	ACCURACY: 87.963%
              precision    recall  f1-score   support

  ActionName       0.78      0.90      0.84      1154
NoActionName       0.94      0.87      0.90      2194

 avg / total       0.89      0.88      0.88      3348

- Zbiór testowy:
	ACCURACY: 75.5274%
              precision    recall  f1-score   support

  ActionName       0.61      0.76      0.68        80
NoActionName       0.86      0.75      0.80       157

 avg / total       0.78      0.76      0.76       237


>>> Capability:

# RELATIONS
Vectorizer: W słowniku znajduje się 3000 różnych słów
- Zbiór treningowy:
	ACCURACY: 89.0382%
              precision    recall  f1-score   support

  Capability       0.99      0.88      0.93      2817
NoCapability       0.60      0.95      0.73       531

 avg / total       0.93      0.89      0.90      3348

- Zbiór testowy:
	ACCURACY: 80.5907%
              precisio