In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
import csv
import seaborn as sns
from sklearn_evaluation import plot
from sklearn import preprocessing

# classifier
import sklearn
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import f1_score

# Import data

In [None]:
# Read the test file row by row; the first CSV field of every row is a
# space-separated record, which is split into its individual tokens.
with open("testing_set.txt", "r") as handle:
    test = [row[0].split(" ") for row in csv.reader(handle)]

In [None]:
# Read the training file the same way: the first CSV field of every row is
# split on single spaces into its tokens.
with open("training_set.txt", "r") as handle:
    train = [row[0].split(" ") for row in csv.reader(handle)]

# The third token of each training record is parsed as the integer label.
label = [int(record[2]) for record in train]

In [None]:
# Load the train and test feature matrices from text files as float64 arrays.
train_features, test_features = [
    np.loadtxt(path, dtype=np.float64)
    for path in ('train_features.txt', 'test_features.txt')
]

In [None]:
# Remove the last and the fourth-from-last feature columns from both matrices.
dropped_columns = [-1, -4]
new_features_train = np.delete(train_features, dropped_columns, axis=1)
new_features_test = np.delete(test_features, dropped_columns, axis=1)

In [None]:
# Standardise every feature column to zero mean and unit variance.
# The scaler is fitted on the training matrix only and the very same
# transformation is applied to the test matrix. Scaling the two matrices
# independently (each with its own mean/std) would put them in different
# feature spaces, so a model trained on one would see shifted inputs on
# the other. The training result is identical to preprocessing.scale().
feature_scaler = preprocessing.StandardScaler().fit(new_features_train)
train_features_scaled = feature_scaler.transform(new_features_train)
test_features_scaled = feature_scaler.transform(new_features_test)

In [None]:
# Hold out 20% of the labelled data for validation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    train_features_scaled,
    label,
    test_size=0.20,
    random_state=42,
)

# Classifier

### Feature Importance

In [None]:
# Fit a random forest on the training split so that its per-feature
# importances can be inspected.
# `rfc` was never defined in this notebook (NameError on a fresh kernel);
# the class imported in the first cell is RandomForestClassifier.
# random_state matches the seed used for the train/validation split so the
# importances are reproducible across runs.
classifier = RandomForestClassifier(n_jobs=1, n_estimators=700, criterion="gini", min_samples_split=10,
                                    min_samples_leaf=2, max_features="sqrt", max_depth=10,
                                    random_state=42)
classifier.fit(X_train, y_train)

# (column index, importance) pairs. Zipping against X_train itself would pair
# the importances with the first few data *rows*, not with the features.
feature_importance = list(enumerate(classifier.feature_importances_))

# (row position, predicted label) pairs for the validation split. The
# predictions are made on X_test, so they are indexed by their own length
# rather than by the length of the unrelated `test` set, and materialised as
# a list so they can be iterated more than once.
predictions_classifier = list(enumerate(classifier.predict(X_test)))

In [None]:
# Human-readable labels for the feature columns, one entry per column, in the
# order in which they are handed to the importance plot.
feature_names = [
    'Overlapping words in titles',
    'Temporal distance between papers',
    'Number of common authors',
    'Overlapping words in journal',
    'Overlapping words in abstract',
    'Cosine similarity of abstract',
    'Cosine similarity of author',
    'Cosine similarity of journal',
    'Cosine similarity of title',
    'Common neighbours',
    'Preferential attachment',
    'Jaccard similarity',
    'Adamic Adar similarity',
    'Pagerank from source',
    'Pagerank from target',
]

In [None]:
# Plot the fitted forest's feature importances on an A4-landscape-sized
# ggplot figure.
my_dpi = 96
plt.style.use('ggplot')
plt.rcParams.update({"figure.figsize": (11.69, 8.27)})

plot.feature_importances(classifier, feature_names=feature_names)

importance_axes = plt.gca()
importance_axes.set_ylabel('Feature Importance Score (%)')
# Hide minor tick labels on the x axis and rotate the tick labels so the
# long feature names stay legible.
importance_axes.xaxis.set_minor_formatter(ticker.NullFormatter())
plt.xticks(rotation=90)
# Reserve extra room below the axes for the rotated labels.
plt.gcf().subplots_adjust(bottom=0.4)
plt.show()

### Classifier Comparison

In [None]:
# Candidate models for the comparison below. Every estimator that has a
# stochastic component receives the same fixed seed as the train/validation
# split (42) so that the reported scores are reproducible on re-run.
# KNeighborsClassifier, GaussianNB and LinearDiscriminantAnalysis take no
# random_state parameter.
classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    # Tuned random forest, kept alongside the default-parameter one above.
    RandomForestClassifier(n_jobs=1, n_estimators=500, criterion="entropy", max_features="log2", max_depth=10,
                           random_state=42),
    ExtraTreesClassifier(random_state=42),
    BaggingClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    MLPClassifier(random_state=42)]

In [None]:
# Fit every candidate on the training split, score it on the held-out
# validation split and collect one summary row per classifier in `log`.
log_cols = ["Classifier", "Accuracy", "F1 Score", "Log Loss"]
log_rows = []
name_counts = {}

for clf_number, clf in enumerate(classifiers, start=1):
    clf.fit(X_train, y_train)

    # Two entries of the same class (e.g. a default and a tuned random
    # forest) would otherwise share one label and be merged into a single
    # bar when plotting by "Classifier"; repeated names get a suffix.
    base_name = clf.__class__.__name__
    name_counts[base_name] = name_counts.get(base_name, 0) + 1
    if name_counts[base_name] == 1:
        name = base_name
    else:
        name = "{} ({})".format(base_name, name_counts[base_name])

    print(clf_number, ". ", name, 'result: ')
    val_pred = clf.predict(X_test)
    # The value reported and plotted as "Accuracy" is now really the
    # accuracy (it used to be the F1 score under the wrong label); the F1
    # score is kept as its own column.
    acc = accuracy_score(y_test, val_pred)
    f1 = f1_score(y_test, val_pred)
    print("Accuracy: {:.4%}".format(acc))
    print("F1 Score: {:.4%}".format(f1))

    val_proba = clf.predict_proba(X_test)
    ll = log_loss(y_test, val_proba)
    print("Log Loss: {}".format(ll))

    log_rows.append([name, acc * 100, f1 * 100, ll])

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
log = pd.DataFrame(log_rows, columns=log_cols)

In [None]:
# Horizontal bar chart of the "Accuracy" column per classifier, saved as PDF.
sns.set_color_codes("muted")
accuracy_axes = sns.barplot(x='Accuracy', y='Classifier', data=log, color="g")

accuracy_axes.set_xlabel('Accuracy %')
accuracy_axes.set_title('Classifier Accuracy')
accuracy_axes.figure.savefig("clf-accuracy.pdf")
plt.show()

In [None]:
# Horizontal bar chart of the "Log Loss" column per classifier, saved as PDF.
sns.set_color_codes("muted")
loss_axes = sns.barplot(x='Log Loss', y='Classifier', data=log, color="g")

loss_axes.set_xlabel('Log Loss')
loss_axes.set_title('Classifier Log Loss')
loss_axes.figure.savefig("clf-loss.pdf")
plt.show()

### Final Submission

In [None]:
# MLPClassifier
# Train the final model on the complete (scaled) training set and predict the
# labels of the test set. The network's weight initialisation and batch
# shuffling are random, so a fixed random_state (the same seed as the
# train/validation split) keeps the submission reproducible across runs.
model = MLPClassifier(random_state=42)
model.fit(train_features_scaled, label)
pred_test = model.predict(test_features_scaled)

In [None]:
# Convert every predicted label to a plain Python int.
pred = list(map(int, pred_test))

In [None]:
# Write the predictions to MLP.csv: the row index is written as the "id"
# column and the single prediction column under the header "category".
submission = pd.DataFrame(pred)
submission.to_csv("MLP.csv", header=["category"], index=True, index_label="id")

In [None]:
# Load the train and test shortest-path CSV files from the Features directory.
shortest_path_files = (
    'Features/train_shortest_paths.csv',
    'Features/test_shortest_paths.csv',
)
train_shortest_paths, test_shortest_paths = map(pd.read_csv, shortest_path_files)