# Ταξινόμηση ιδιοτήτων-προτάσεων σε συστήματα οργάνων

In [27]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords') 
from nltk.corpus import stopwords
import string
import spacy
import el_core_news_md
nlp = el_core_news_md.load()
from greek_stemmer import GreekStemmer
stemmer = GreekStemmer()
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

total_stop_words = (nlp.Defaults.stop_words).union(set(stopwords.words('greek')))
total_stop_words = total_stop_words.union({'ή', 'μόνον'})
total_puncts = string.punctuation + '«»–“”\xa0‘•…●\uf0b7◗♥.'
total_stop_words = list(total_stop_words) + list(string.punctuation)
#total_stop_words += [stemming(word) for word in total_stop_words]
stop_words = set(total_stop_words)
tokenized_stop_words = nltk.word_tokenize(' '.join(total_stop_words))

class Tokenizer(object):
    def __init__(self):
        nltk.download('punkt', quiet=True, raise_on_error=True)
        self.stemmer = GreekStemmer()
        
    def _stem(self, token):
        if (token in stop_words):
            return token
        return self.stemmer.stem((removing_accents(token)).upper())
        
    def __call__(self, line):
        tokens = nltk.word_tokenize(line)
        tokens = filter_punctuation(list(tokens))
        tokens = group_numbers(tokens)
        tokens = (self._stem(token) for token in tokens)
        return tokens



def data_preparation(data_filename, target, test_size=0.25, stopwords=tokenized_stop_words, max_frequency=0.5, min_frequency=6, 
                     imbalanced=False):
    df_texts = pd.read_csv(data_filename, index_col=None)
    df_texts = df_texts.dropna(axis=0, subset = [target])
    df_texts[target] = df_texts[target].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(df_texts['Sentence'], df_texts[target], test_size=test_size, 
                                                        random_state=10)
    vectorizer = TfidfVectorizer(max_df=max_frequency, min_df=min_frequency, stop_words=stopwords, tokenizer=Tokenizer())
    X_train_tf_idf = vectorizer.fit_transform(X_train)
    X_test_tf_idf = vectorizer.transform(X_test)
    if imbalanced:
        ros = RandomOverSampler(random_state=0)
        X_train_tf_idf, y_train = ros.fit_sample(X_train_tf_idf, y_train)
    
    return X_train_tf_idf, X_test_tf_idf, y_train, y_test



def filter_punctuation(words):
    filtered_words = []
    for word in words:
        pun = []
        for letter in word:
            pun.append(letter in total_puncts)

        if not all(pun):
            word = word.strip(total_puncts)
            
            if (len(word)>3):
                for punct in total_puncts:
                    word = word.replace(punct, ' ')
            else:
                for punct in total_puncts:
                    word = word.replace(punct, '')
                    
            tokens = nltk.word_tokenize(word)
            
            for token in tokens:
                filtered_words.append(token)
                
    return filtered_words



def group_numbers(words):
    new_words = []
    for word in words:
        if re.fullmatch(r'[0-9]+', word) != None :
            if int(word) >= 1800 and int(word) <= 2020:
                word = '2000'
            else:
                word = '1'
        new_words.append(word)
    return new_words



def removing_accents(word):
    """
    Removes accents from a given word.
    
    Parameters
    ----------
    word : str
        the word from which we want to remove the accents

    Returns
    -------
    str
        the word without accents
    """
    return (word.replace('ά', 'α').replace('έ', 'ε').replace('ή', 'η').replace('ί', 'ι').replace('ό', 'ο').replace('ύ', 'υ')
            .replace('ώ', 'ω').replace('ϊ', 'ι').replace('ϋ', 'υ').replace('ΐ', 'ι').replace('ΰ', 'υ'))



def stemming(word):
    """
    Implements stemming for a given word.
    
    Parameters
    ----------
    word : str
        the word that is about to be stemmed

    Returns
    -------
    str
        the stem of the given word at uppercase letters
    """
    return stemmer.stem((removing_accents(word)).upper())



def classification(dataset_filename, target_column, classifier, test_size=0.25, stopwords=tokenized_stop_words, 
                   max_frequency=0.5, min_frequency=6, imbalanced=False):    
    X_train, X_test, y_train, y_test = data_preparation(dataset_filename, target_column, imbalanced=imbalanced)
    
    switcher = {'svc': SVC(), 'knn': KNeighborsClassifier(n_neighbors=20), 'naive_bayes': GaussianNB(), 
                'decision_tree': DecisionTreeClassifier(), 'random_forest': RandomForestClassifier()} 
    bin_clf = switcher.get(classifier)
    
    clf = OneVsRestClassifier(bin_clf).fit(X_train, y_train)
    preds = clf.predict(X_test)
    
    results = list(zip(X_test, y_test, preds))
    accuracy = accuracy_score(y_test, preds)
    return results, accuracy

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Πρώτα δοκιμάζουμε σύμφωνα με το απλούτσερο annotation

In [88]:
res1, accuracy1 = classification('../data/properties.csv', 'Class1', 'svc')

In [89]:
accuracy1

0.9920477137176938

Έπειτα, δοκιμάζουμε το πιο μεγάλο annotated dataset

In [90]:
res2, accuracy2 = classification('../data/properties.csv', 'Class2', 'svc')

In [91]:
accuracy2

0.8609098811938568

In [20]:
res3, accuracy3 = classification('../data/properties.csv', 'Class2', 'knn')

In [21]:
accuracy3

0.8232396406838598

In [22]:
res4, accuracy4 = classification('../data/properties.csv', 'Class2', 'decision_tree')

In [23]:
accuracy4

0.8777166038829325

In [28]:
res5, accuracy5 = classification('../data/properties.csv', 'Class2', 'random_forest')

In [29]:
accuracy5

0.9235004346566212