In [1]:
import os
import time
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
# Directories scanned by files_to_df for the tagged '<...><tagger>.csv' files
# (one directory for the training split, one for the test split).
TRAIN_FILES_DIR = '../Dane/wiki_train/'
TEST_FILES_DIR = '../Dane/wiki_test/'

In [10]:
def files_to_df(data_part, tagger, part_of_speech):
    """Collect the words of one part of speech from every tagged file in a directory.

    Every file in ``data_part`` whose name ends with ``tagger + '.csv'``
    produces one row. ``text`` is the space-separated list of the second
    comma-separated field of each line that ends with ``part_of_speech``
    (empty fields are skipped). ``label`` is the part of the file name
    before the first underscore.

    Parameters
    ----------
    data_part : str
        Directory containing the tagged files; a trailing path separator
        is optional.
    tagger : str
        Tagger name; selects files by the suffix ``tagger + '.csv'``.
    part_of_speech : str
        Suffix a line must end with for its word to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``label``, one row per matching file, ordered
        by sorted file name. Empty (but with both columns) when no file
        matches.

    Raises
    ------
    IndexError
        If a line ending with ``part_of_speech`` contains no comma.
    """
    file_suffix = tagger + '.csv'
    records = []

    for file in sorted(os.listdir(data_part)):
        if not file.endswith(file_suffix):
            continue

        words = []
        # NOTE(review): the file is read with the platform default encoding,
        # exactly as before; confirm the tagger output encoding and pass
        # `encoding=` explicitly if it can differ from the locale.
        with open(os.path.join(data_part, file), 'r') as f:
            # Stream the file instead of materialising every line up front.
            for line in f:
                line = line.replace('\n', '')
                if line.endswith(part_of_speech):
                    word = line.split(',')[1]
                    if word:
                        words.append(word)

        records.append({'text': ' '.join(words),
                        'label': file.split('_')[0]})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(records, columns=['text', 'label'])


def train_test_split(df_train, df_test, max_features=10000):
    """Turn the train and test frames into bag-of-words matrices and label lists.

    A ``CountVectorizer`` limited to ``max_features`` terms is fitted on the
    training texts only; the test texts are transformed with that same
    fitted vocabulary. Both matrices are returned as dense arrays.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames exposing ``text`` and ``label`` columns.
    max_features : int, default 10000
        Passed to ``CountVectorizer(max_features=...)``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` values are
        dense count arrays and the ``y_*`` values are lists of labels.
    """
    vectorizer = CountVectorizer(max_features=max_features)

    # The vocabulary is learned from the training documents ...
    train_counts = vectorizer.fit_transform(df_train.text.tolist())
    # ... and only re-used (never re-fitted) for the test documents.
    test_counts = vectorizer.transform(df_test.text.tolist())

    train_labels = df_train.label.tolist()
    test_labels = df_test.label.tolist()

    return train_counts.toarray(), test_counts.toarray(), train_labels, test_labels

In [11]:
# Experiment grid iterated by the classification loop.
# Tagger names: files_to_df selects files whose name ends with '<tagger>.csv'.
taggers = ['morphoDita', 'wcrft2', 'krnnt']
# Line suffixes used by files_to_df to pick which words to keep.
parts_of_speech = ['noun', 'adjective', 'verb']
# Vocabulary size limits handed to CountVectorizer(max_features=...).
max_features_list = [1000, 10000, 100000]

In [5]:
# Grid search over (max_features, tagger, part_of_speech): for each combination
# load the data, vectorise it, fit Gaussian Naive Bayes and record the accuracy
# together with the wall-clock duration of the whole step (loading included).
result_rows = []

for max_features in max_features_list:
    for tagger in taggers:
        for part_of_speech in parts_of_speech:
            print('Classification with params: tagger:', tagger, 
                  '- part of speech:', part_of_speech, 
                  '- max features:', max_features, '...')

            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments.
            start_timer = time.perf_counter()
            df_train = files_to_df(data_part=TRAIN_FILES_DIR,
                                   tagger=tagger,
                                   part_of_speech=part_of_speech)

            df_test = files_to_df(data_part=TEST_FILES_DIR,
                                  tagger=tagger,
                                  part_of_speech=part_of_speech)

            X_train, X_test, y_train, y_test = train_test_split(df_train=df_train,
                                                                df_test=df_test,
                                                                max_features=max_features)

            # Naive Bayes
            classifier = GaussianNB()
            classifier.fit(X_train, y_train)

            # Predict Class
            y_pred = classifier.predict(X_test)

            # Accuracy
            accuracy = accuracy_score(y_test, y_pred)

            exec_time = time.perf_counter() - start_timer

            result_rows.append({'max_features': max_features,
                                'tagger': tagger,
                                'part_of_speech': part_of_speech,
                                'accuracy': accuracy,
                                'time': exec_time})

# Build the results frame once: DataFrame.append was removed in pandas 2.0
# and copied the whole frame on every iteration.
df_res = pd.DataFrame(result_rows,
                      columns=['max_features', 'tagger', 'part_of_speech', 'accuracy', 'time'])

Classification with params: tagger: morphoDita - part of speech: noun - max features: 1000 ...
Classification with params: tagger: morphoDita - part of speech: adjective - max features: 1000 ...
Classification with params: tagger: morphoDita - part of speech: verb - max features: 1000 ...
Classification with params: tagger: wcrft2 - part of speech: noun - max features: 1000 ...
Classification with params: tagger: wcrft2 - part of speech: adjective - max features: 1000 ...
Classification with params: tagger: wcrft2 - part of speech: verb - max features: 1000 ...
Classification with params: tagger: krnnt - part of speech: noun - max features: 1000 ...
Classification with params: tagger: krnnt - part of speech: adjective - max features: 1000 ...
Classification with params: tagger: krnnt - part of speech: verb - max features: 1000 ...
Classification with params: tagger: morphoDita - part of speech: noun - max features: 10000 ...
Classification with params: tagger: morphoDita - part of spee

In [7]:
# Show the collected results table (one row per parameter combination).
df_res

Unnamed: 0,max_features,tagger,part_of_speech,accuracy,time
0,1000,morphoDita,noun,0.557738,22.851148
1,1000,morphoDita,adjective,0.453776,20.55483
2,1000,morphoDita,verb,0.278361,21.463108
3,1000,wcrft2,noun,0.544531,25.685656
4,1000,wcrft2,adjective,0.454453,21.767061
5,1000,wcrft2,verb,0.275652,22.459691
6,1000,krnnt,noun,0.560786,37.497144
7,1000,krnnt,adjective,0.444633,20.976613
8,1000,krnnt,verb,0.274297,20.357028
9,10000,morphoDita,noun,0.759228,34.240557


In [8]:
#df_res.to_csv('out/task_4_results.csv')