## General imports

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer

# Preparing training dataset

In [3]:
dataset_path = './dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,code,knowledgeArea,category,bigKnowledgeArea,Artículos publicados sin aprobar,Libros publicados sin aprobar,Capítulos de libro publicados sin aprobar,Documentos de trabajo sin aprobar,Otra publicación divulgativa sin aprobar,Otros artículos publicados sin aprobar,...,Eventos Artísticos,Talleres de Creación,Asesorías al Programa Ondas,Curso de Corta Duración Dictados,Trabajos dirigidos/turorías,Jurado/Comisiones evaluadoras de trabajo de grado,Participación en comités de evaluación,Demás trabajos,Proyectos,"Producción en arte, arquitectura y diseño"
0,COL0165935,"Ingenierías Eléctrica, Electrónica e Informática",B,Ingeniería y Tecnología,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,24,0
1,COL0128369,Derecho,C,Ciencias Sociales,5,0,11,0,17,6,...,0,0,0,0,7,0,0,0,5,0
2,COL0176652,Ciencias de la Educación,C,Ciencias Sociales,0,0,0,0,0,0,...,0,0,0,0,31,0,0,0,4,0
3,COL0160634,Medicina Clínica,B,Ciencias Médicas y de la Salud,0,0,3,0,0,0,...,0,0,0,0,5,0,0,0,2,0
4,COL0082494,Medicina Clínica,B,Ciencias Médicas y de la Salud,20,0,19,0,1,0,...,0,0,0,0,25,0,0,0,69,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5198 entries, 0 to 5197
Columns: 102 entries, code to Producción en arte, arquitectura y diseño
dtypes: int64(98), object(4)
memory usage: 4.0+ MB


## Missing values

No missing values this time

## Categorical features

Now we are going to replace categorical features for numerical values

In [6]:
categorical_fields = ['knowledgeArea', 'bigKnowledgeArea', 'category']
for field in categorical_fields:
    df[field] = df[field].astype('category')
    df[field] = df[field].cat.codes
df.head()

Unnamed: 0,code,knowledgeArea,category,bigKnowledgeArea,Artículos publicados sin aprobar,Libros publicados sin aprobar,Capítulos de libro publicados sin aprobar,Documentos de trabajo sin aprobar,Otra publicación divulgativa sin aprobar,Otros artículos publicados sin aprobar,...,Eventos Artísticos,Talleres de Creación,Asesorías al Programa Ondas,Curso de Corta Duración Dictados,Trabajos dirigidos/turorías,Jurado/Comisiones evaluadoras de trabajo de grado,Participación en comités de evaluación,Demás trabajos,Proyectos,"Producción en arte, arquitectura y diseño"
0,COL0165935,27,2,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,24,0
1,COL0128369,16,3,3,5,0,11,0,17,6,...,0,0,0,0,7,0,0,0,5,0
2,COL0176652,12,3,3,0,0,0,0,0,0,...,0,0,0,0,31,0,0,0,4,0
3,COL0160634,30,2,1,0,0,3,0,0,0,...,0,0,0,0,5,0,0,0,2,0
4,COL0082494,30,2,1,20,0,19,0,1,0,...,0,0,0,0,25,0,0,0,69,0


# Splitting the dataset

In [8]:
features = df.drop(columns=['code', 'category'])
labels = df['category']
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5)

In [81]:
y_train.head()

1884    1
2205    2
3283    2
1510    3
1111    3
Name: category, dtype: int8

In [9]:
scaler = StandardScaler().fit(X_train)


# Training the models

## Simple Decision tree

In [14]:
clf = tree.DecisionTreeClassifier()
X_train = scaler.transform(X_train)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [15]:
print(list(zip(X_train.columns, clf.feature_importances_)))

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [84]:
import graphviz 
dot_data = tree.export_graphviz(clf, out_file=None, 
                                feature_names=X_train.columns, 
                                class_names=['No Survived', 'survived'], 
                                filled=True, rounded=True,  
                                special_characters=True) 
graph = graphviz.Source(dot_data) 
graph

IndexError: list index out of range

In [85]:
test_score = clf.score(X_train, y_train)
print('Decision tree training score:', round(test_score * 100, 4))

Decision tree training score: 100.0


In [86]:
val_score = clf.score(X_val, y_val)
print('Score on validation set', round(val_score * 100, 4))

Score on validation set 38.4615


## Random Forest

In [16]:
from sklearn import ensemble

In [17]:
clf = ensemble.RandomForestClassifier(n_estimators=50, max_depth=10)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
acc = clf.score(X_train, y_train)
print('Random forest training score:', round(acc * 100, 4))

Random forest training score: 71.3854


In [19]:
val_score = clf.score(X_val, y_val)
print('Score on validation set', round(val_score * 100, 4))

Score on validation set 13.2051


## Logistic regression

In [22]:
from sklearn import linear_model

In [27]:
lb = LabelBinarizer()
lb.fit(y_train)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [32]:
clf = linear_model.LogisticRegression(multi_class='multinomial')
clf.fit(X_train, y_train)

ValueError: Solver liblinear does not support a multinomial backend.

In [57]:
acc = clf.score(X_train, y_train)
print('Logistic regression training score', round(acc * 100, 4))

Logistic regression training score 49.945


In [58]:
val_score = clf.score(X_val, y_val)
print('Score on validation set', round(val_score * 100, 4))

Score on validation set 50.8974


## Support vector machine

[sklearn flow chart](http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)  chart recommends to use Support Vector Machines

In [90]:
from sklearn import svm

clf = svm.LinearSVC()
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [91]:
acc = clf.score(X_train, y_train)
print('SVM training score:', round(acc * 100, 4))

SVM training score: 42.4959


In [92]:
val_score = clf.score(X_val, y_val)
print('Score on validation set', round(val_score * 100, 4))

Score on validation set 38.7179
