In [1]:
import os
import sys
import pickle
from inspect import signature

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.preprocessing import normalize

# Locate the project root relative to the current working directory.
# '__file__' is a plain string literal here (not the module attribute), so
# abspath() resolves it against the working directory; the two dirname()
# calls then strip that dummy file name and one more directory level.
get_dir = os.path.dirname
PROJ_ROOT = get_dir(get_dir(os.path.abspath('__file__')))
print(PROJ_ROOT)

# Expose <PROJ_ROOT>/src to the import system. The membership check keeps the
# cell idempotent: re-running it no longer stacks duplicate sys.path entries.
src_dir = os.path.join(PROJ_ROOT, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data import dataset
from model import model 

C:\Users\48519\Professional Stuff\various\machine_learning\mouse_disambiguation


In [2]:
# Path of the pickled data model; reused below when building the predictor.
data_model_path = os.path.join(dataset.DEFAULT_DATA_MODEL_DIRECTORY, 'data_model.pickle')

# NOTE(review): pickle.load can execute arbitrary code while deserializing --
# only point this at files produced by this project.
with open(data_model_path, 'rb') as f:
    dat_mod = pickle.load(f)

# Settings forwarded to the builder as `classificator_parameters`.
class_params = dict(solver='liblinear', class_weight='balanced', C=1.)

# Settings forwarded to the builder as `data_model_params`.
vectorizer_settings = dataset.DEFAULT_VECTORIZER_SETTINGS

pred_model = model.build_mmdisambiguator(
    data_model_params=vectorizer_settings,
    data_model_path=data_model_path,
    classificator_parameters=class_params,
)

In [3]:
# Training features: read <features dir>/train.npy into `data`.
FEATURES_PATH = dataset.DEFAULT_FEATURES_DIRECTORY
TRAIN_FEATURES_PATH = os.path.join(FEATURES_PATH, 'train.npy')
# allow_pickle=True lets numpy unpickle object arrays stored in the file;
# only use it on trusted files.
data = np.load(file=TRAIN_FEATURES_PATH, allow_pickle=True)

In [4]:
# Fit the classifier on the training matrix and show the returned report.
# The last column of `data` is passed as the class labels; every other column
# is treated as a feature.
train_matrix, train_labels = data[:, :-1], data[:, -1]

# Rescale each row (axis=1) to unit L2 norm before training.
features = normalize(train_matrix, norm='l2', axis=1)

report = pred_model.train(features=features, classes=train_labels, report=True)
print(report)

{'accuracy': 0.9953271028037384, 'precision': 0.9818181818181818, 'recall': 1.0, 'f1': 0.9908256880733944, 'confussion_matrics': array([[ 54,   0],
       [  1, 159]], dtype=int64)}


In [5]:
# Validation set: the feature matrix (.npy) and the matching raw sentences (.csv).
VALIDATION_FEATURES_PATH = os.path.join(FEATURES_PATH, 'validation.npy')
VALIDATION_TEXT_PATH = os.path.join(
    dataset.DEFAULT_PROCESSED_TEXT_DATA_DIRECTORY, 'validation.csv'
)
validaton = np.load(VALIDATION_FEATURES_PATH, allow_pickle=True)
validation_text = pd.read_csv(VALIDATION_TEXT_PATH, sep=';')

# Quick look at the first five rows of both sources.
print(validation_text.head())
print(validaton[:5])

# Same row-wise L2 scaling as applied to the training features
# (norm='l2', axis=1 are sklearn's defaults, spelled out here).
valid_matrix = validaton[:, :-1]
norm_valid = normalize(valid_matrix, norm='l2', axis=1)
predicted_valid = pred_model.predict(norm_valid, format='binary', threshold=0.5)
print(predicted_valid[:50])

# Compare the first prediction column with the label column of the matrix.
valid_report = pred_model.performance_report(predicted_valid[:, 0], validaton[:, -1])
print(valid_report)


                                                text   class
0  Cordless or wireless mouse transmit data via i...  device
1                                         LED mouse   device
2  Common terms rat and mouse are not taxonomical...  animal
3  They christened the device the mouse as early ...  device
4  The great hopping mouse Notomys robustus Mahon...  animal
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
[[0.         0.65517626 0.34482374]
 [0.         0.57429933 0.42570067]
 [1.         0.46395951 0.53604049]
 [0.         0.68661166 0.31338834]
 [1.         0.15981423 0.84018577]
 [0.         0.81034318 0.18965682]
 [0.         0.72451125 0.27548875]
 [1.         0.30147193 0.69852807]
 [0.         0.6148222  0.3851778 ]
 [0.         0.63460537 0.36539463]
 [1.         0.21502864 0.78497136]
 [1.         0.4069962  0.5930038 ]
 [1.         0.15535499 0.84464501]
 [1.         0.40914424 0.59085576]
 [0.

In [6]:
# Inspect individual validation predictions next to their source sentences.
# NOTE(review): predictions here use the raw `validaton[:, :-1]` matrix, while
# training and the previous cell use L2-normalized features -- confirm that
# this difference is intended.
raw_valid_features = validaton[:, :-1]

predicted_valid = pred_model.predict(raw_valid_features, format='binary', mode='prediction')
# Per-row feature totals, kept as a column vector.
print(raw_valid_features.sum(axis=1, keepdims=True))
print(validation_text.iloc[1, 0])

to_print = 1
predicted_classes = pred_model.predict(raw_valid_features, format='text')[:, 0]

# One tuple per sentence: (text, true class, prediction row, predicted class).
summary = list(zip(
    validation_text.iloc[:to_print, 0],
    validation_text.iloc[:to_print, 1],
    predicted_valid[:to_print],
    predicted_classes[:to_print],
))
for row in summary:
    print(row)

pd.set_option('display.max_rows', 4000)

# Hand-picked validation rows to look at in detail.
rows_to_inspect = [12, 24, 34, 35, 45, 50, 54, 57, 59, 62]
print(validation_text.iloc[rows_to_inspect, 0])
print(predicted_valid[rows_to_inspect])

[[2.3143496 ]
 [1.21297539]
 [2.09113269]
 [2.70095464]
 [2.52559653]
 [2.70052424]
 [3.33448363]
 [2.30546539]
 [1.98636226]
 [2.6931265 ]
 [2.61999151]
 [2.0829796 ]
 [1.        ]
 [1.17156832]
 [3.83905888]
 [2.98381656]
 [1.83622777]
 [3.12933433]
 [1.84033701]
 [2.54059189]
 [3.98706728]
 [3.02722812]
 [2.96749098]
 [1.20862171]
 [1.        ]
 [2.08181867]
 [2.52867165]
 [3.32912702]
 [2.65543581]
 [2.08733221]
 [2.52559653]
 [1.82876013]
 [2.33228677]
 [1.84251133]
 [1.92115235]
 [2.22837802]
 [2.31419766]
 [2.09624589]
 [1.8322512 ]
 [3.85813721]
 [2.23264795]
 [2.30979182]
 [3.51138277]
 [1.53897951]
 [3.79725461]
 [1.20394691]
 [4.4006421 ]
 [2.1523268 ]
 [2.75923455]
 [1.84375488]
 [1.81551773]
 [2.94532365]
 [3.68849723]
 [1.57280557]
 [1.30663589]
 [2.95197948]
 [3.38614468]
 [1.        ]
 [3.48249704]
 [1.19331519]
 [1.66873927]
 [2.98262503]
 [1.        ]
 [1.83565865]
 [2.08806185]
 [3.49089352]
 [3.32496566]
 [3.0404625 ]
 [3.53677097]
 [1.        ]
 [1.95134556]]
 LED 