In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

In [None]:
# Load the raw table from a CSV file; the path is relative to the working directory.
data = pd.read_csv("perfect_out2.csv")

In [None]:
# Keep only the rows whose prob_threshold equals 1.0.
keep_rows = data["prob_threshold"].eq(1.0)
data = data.loc[keep_rows]

In [None]:
from collections import Counter
def createIndexer(tags=None):
  """Assign each distinct tag name a consecutive integer index.

  Args:
    tags: Optional iterable of tag names. When omitted, the ``tag_name``
      column of the module-level ``data`` frame is used, which is what the
      zero-argument call has always done.

  Returns:
    A ``Counter`` used as a plain mapping from tag name to index. Indices
    start at 0 and follow the order in which tags first appear.

  Note:
    Because the mapping is a ``Counter``, looking up a tag that was never
    indexed yields 0 instead of raising ``KeyError``.
  """
  if tags is None:
    tags = data["tag_name"]

  indexer = Counter()
  for tag in tags:
    # setdefault leaves an existing entry untouched; for a new tag, the
    # current size of the mapping is exactly the next unused index
    indexer.setdefault(tag, len(indexer))

  return indexer
# Build the tag-name -> index mapping once, then show it as the cell output.
indexer = createIndexer()
indexer

In [None]:
# Features: the tag name and its probability; target: the tag response.
feature_columns = ["tag_name", "tag_prob"]
x_data = data[feature_columns]
y_data = data["tag_resp"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
def convert1Hot(data, tag_indexer=None):
  """One-hot encode the tag name and append the tag probability.

  Args:
    data: Two-column table (NumPy array, DataFrame, or nested list) whose
      column 0 holds the tag name and column 1 the tag probability.
    tag_indexer: Optional mapping from tag name to column index. Defaults to
      the module-level ``indexer``.

  Returns:
    A float array of shape ``(n_rows, len(tag_indexer) + 1)``. The first
    ``len(tag_indexer)`` columns are the one-hot encoding of the tag name and
    the last column is the probability.
  """
  if tag_indexer is None:
    tag_indexer = indexer

  # object dtype keeps the tag strings and the probabilities side by side
  rows = np.asarray(data, dtype=object)
  n_rows = rows.shape[0]

  new_data = np.zeros((n_rows, len(tag_indexer) + 1))
  if n_rows == 0:
    # nothing to encode; also avoids 2-D indexing into an empty 1-D array
    return new_data

  # last column: the probability of each row
  new_data[:, -1] = rows[:, 1].astype(float)

  # one-hot part: switch on the column that belongs to each row's tag
  tag_columns = [tag_indexer[tag] for tag in rows[:, 0]]
  new_data[np.arange(n_rows), tag_columns] = 1

  return new_data

In [None]:
# The estimators below need purely numeric features, so expand the tag name
# into one-hot columns (convert1Hot indexes a NumPy array, hence to_numpy()).
x_train = convert1Hot(x_train.to_numpy())
x_test = convert1Hot(x_test.to_numpy())

# Keep the targets 1-D: scikit-learn estimators expect y of shape (n_samples,)
# and emit a DataConversionWarning when handed an (n, 1) column vector.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [None]:
from sklearn import tree

# Fit a decision tree with default settings, then predict labels for the test split.
clf = tree.DecisionTreeClassifier().fit(x_train, y_train)
y_pred = clf.predict(x_test)


In [None]:
# log_loss scores predicted probabilities; hard labels from predict() would be
# read as 0/1 probabilities, so use the tree's predict_proba output instead.
log_loss(np.ravel(y_test), clf.predict_proba(x_test), labels=clf.classes_)

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn import svm
clf = svm.SVC()
#clf.fit(X_train.to_numpy().reshape(-1,1), y_train)
clf.fit(x_train, y_train)
y_pred= clf.predict(x_test)

In [None]:
log_loss(y_test,y_pred)

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression with default settings, then predict labels for the test split.
logisticRegr = LogisticRegression().fit(x_train, y_train)
y_pred = logisticRegr.predict(x_test)

In [None]:
# log_loss scores predicted probabilities; hard labels from predict() would be
# read as 0/1 probabilities, so use the model's predict_proba output instead.
log_loss(np.ravel(y_test), logisticRegr.predict_proba(x_test), labels=logisticRegr.classes_)

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes, then predict labels for the test split.
gnb = GaussianNB()
gnb.fit(x_train, y_train)
# fit() hands back the estimator itself, so clf and gnb name the same fitted model
clf = gnb
y_pred = clf.predict(x_test)

In [None]:
# log_loss scores predicted probabilities; hard labels from predict() would be
# read as 0/1 probabilities, so use the model's predict_proba output instead.
log_loss(np.ravel(y_test), clf.predict_proba(x_test), labels=clf.classes_)

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier, then predict labels for the test split.
neigh = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred = neigh.predict(x_test)

In [None]:
# log_loss scores predicted probabilities; hard labels from predict() would be
# read as 0/1 probabilities, so use the model's predict_proba output instead.
log_loss(np.ravel(y_test), neigh.predict_proba(x_test), labels=neigh.classes_)

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Fit a gradient-boosted classifier and plot its feature importances manually.
from numpy import loadtxt
from xgboost import XGBClassifier
from matplotlib import pyplot

# fit the model on the training data; the split variable is the lowercase
# x_train (there is no X_train in this notebook)
model = XGBClassifier()
model.fit(x_train, y_train)

# feature importance, one value per input column
importances = model.feature_importances_
print(importances)

# plot one bar per feature column
pyplot.bar(range(len(importances)), importances)
pyplot.xlabel("feature index")
pyplot.ylabel("importance")
pyplot.title("XGBoost feature importances")
pyplot.show()

In [None]:
from xgboost import plot_importance

# Draw xgboost's built-in importance chart for the fitted model.
importance_ax = plot_importance(model)
pyplot.show()