### Import Packages

In [10]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SpanishStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from nltk.corpus import stopwords
from scipy.sparse import hstack

### Load Data

In [7]:
train_data = pd.read_excel('train_universidad.xlsx',sheetname=1)
test_data = pd.read_excel('test_universidad.xlsx')

### Get Columns

In [8]:
train_data.columns

Index(['COD_ENCUESTADO', 'Nombre Campus', 'NIVEL ACTUAL', 'Clave de carrera',
       'Ciclo', 'COMENTARIO', 'IND_GEA', 'IND_DELEGADO',
       'CANT_CURSOS_MATRICU_SIN_INGLES', 'UOD_depostista_ind_deportista',
       'NPS'],
      dtype='object')

### First 5 rows

In [9]:
train_data.head()

Unnamed: 0,COD_ENCUESTADO,Nombre Campus,NIVEL ACTUAL,Clave de carrera,Ciclo,COMENTARIO,IND_GEA,IND_DELEGADO,CANT_CURSOS_MATRICU_SIN_INGLES,UOD_depostista_ind_deportista,NPS
0,13501,1,AC,3,2,Me gusta la u es paja bro y soy pito,,Delegado,6.0,,3
1,23622,3,AC,25,1,El metodo de blended no le hace bien a todos,,Delegado,5.0,,3
2,8354,4,AC,31,1,"Los profesores, sus métodos de enseñanza bes ...",,,5.0,,4
3,17745,4,AC,28,6,Porque posee gran mayoría de profesores espec...,GEA,,6.0,,3
4,10867,3,AC,34,1,La pencion,,,6.0,,3


### Append Data

In [11]:
all_data = train_data.append(test_data)

### Adding 'Comentario' length

In [12]:
all_data['COMENTARIO_LEN'] =all_data['COMENTARIO'].str.len()

### Columns Sets

In [13]:
data_cols= ['NIVEL ACTUAL',
       'Ciclo', 'COMENTARIO', 'COMENTARIO_LEN', 'IND_GEA', 'IND_DELEGADO',
       'CANT_CURSOS_MATRICU_SIN_INGLES', 'UOD_depostista_ind_deportista']

model_cols= ['NIVEL ACTUAL',
       'Ciclo', 'IND_GEA', 'IND_DELEGADO',
       'CANT_CURSOS_MATRICU_SIN_INGLES', 'UOD_depostista_ind_deportista']

model_log_cols= ['NIVEL ACTUAL',
       'Ciclo', 'COMENTARIO_LEN','IND_GEA', 'IND_DELEGADO',
       'CANT_CURSOS_MATRICU_SIN_INGLES', 'UOD_depostista_ind_deportista']

### Cleaning Data

In [14]:
all_data['IND_GEA'] = all_data['IND_GEA'].map({'IND_GEA':1}).fillna(0)
all_data['IND_DELEGADO'] = all_data['IND_DELEGADO'].map({'Delegado':1}).fillna(0)
all_data['UOD_depostista_ind_deportista'] = all_data['UOD_depostista_ind_deportista'].map({'Deportista':1}).fillna(0)
all_data['CANT_CURSOS_MATRICU_SIN_INGLES'] = all_data['CANT_CURSOS_MATRICU_SIN_INGLES'].fillna(4)

### Encoding 'Nivel Actual'

In [15]:
le = LabelEncoder()

In [16]:
all_data['NIVEL ACTUAL'] = le.fit_transform(all_data['NIVEL ACTUAL'])

In [17]:
all_data = all_data.reset_index(drop=True)

### Removing stopwords, punctuation, stemming.

In [18]:
stop = stopwords.words('spanish')
stemmer = SpanishStemmer()

In [19]:
pretable = dict.fromkeys(string.punctuation)
table = str.maketrans(pretable)

In [20]:
all_data['COMENTARIO'] = all_data['COMENTARIO'].apply(lambda x: ' '.join([word.translate(table) for word in x.split() if word not in stop]))

In [21]:
all_data['COMENTARIO'] = all_data['COMENTARIO'].str.\
    replace('enseñansa','enseñanza').\
    replace('pencion','pension')

In [22]:
all_data['COMENTARIO'] = all_data['COMENTARIO'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split() if word not in stop]))

In [23]:
all_data.head()

Unnamed: 0,CANT_CURSOS_MATRICU_SIN_INGLES,COD_ENCUESTADO,COMENTARIO,Ciclo,Clave de carrera,IND_DELEGADO,IND_GEA,NIVEL ACTUAL,NPS,Nombre Campus,UOD_depostista_ind_deportista,COMENTARIO_LEN
0,6.0,13501,me gust u paj bro pit,2,3,1.0,0.0,0,3.0,1,0.0,37
1,5.0,23622,el metod blend hac bien,1,25,1.0,0.0,0,3.0,3,0.0,45
2,5.0,8354,los profesor metod enseñ bes buen el uso tecno...,1,31,0.0,0.0,0,4.0,4,0.0,168
3,6.0,17745,porqu pose gran mayor profesor especializ enseñ,6,28,0.0,0.0,0,3.0,4,0.0,73
4,6.0,10867,la pencion,1,34,0.0,0.0,0,3.0,3,0.0,11


### Split Data

In [24]:
X = all_data.loc[:19999,data_cols]
y = all_data.loc[:19999,'NPS']
X_final = all_data.loc[20000:,data_cols]

In [25]:
x_train,x_test, y_train, y_test = train_test_split(X,y, test_size=0.1,random_state=1)

### Vectorize 'Comentarios'

In [26]:
vec = TfidfVectorizer(ngram_range=(1,3), min_df=0.001, max_df=0.6,strip_accents='unicode')

In [27]:
vec

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=0.001,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [28]:
vec.fit(x_train['COMENTARIO'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=0.001,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [29]:
vec.get_feature_names()

['10',
 '100',
 '11',
 'abastec',
 'abren',
 'abrir',
 'acab',
 'academ',
 'acced',
 'acces',
 'accesibil',
 'acept',
 'acerc',
 'acondicion',
 'acord',
 'acredit',
 'acredit wasc',
 'activ',
 'actual',
 'actualiz',
 'acuerd',
 'adecu',
 'adem',
 'ademas',
 'adicional',
 'administr',
 'administracion',
 'admision',
 'adquir',
 'afect',
 'agrad',
 'agrand',
 'agreg',
 'ahi',
 'ahor',
 'air',
 'air acondicion',
 'al',
 'algui',
 'algun',
 'algun curs',
 'algun profesor',
 'almorz',
 'almuerz',
 'alt',
 'alta',
 'alta calid',
 'altas',
 'alto',
 'alto nivel',
 'altos',
 'alumn',
 'alumn aprend',
 'alumn buen',
 'alumn campus',
 'alumn deb',
 'alumn epe',
 'alumn hac',
 'alumn mal',
 'alumn mas',
 'alumn matricul',
 'alumn mejor',
 'alumn pesim',
 'alumn profesor',
 'alumn pued',
 'alumn sed',
 'alumn sol',
 'alumn univers',
 'ambient',
 'ambient estudi',
 'ambit',
 'amig',
 'ampli',
 'ampliacion',
 'ano',
 'anos',
 'anterior',
 'antigu',
 'apart',
 'apertur',
 'aplic',
 'apoy',
 'apoy alu

In [30]:
x_train_tokens = vec.transform(x_train['COMENTARIO'])

In [31]:
x_test_tokens = vec.transform(x_test['COMENTARIO'])

### Stacking together tokens and categorical features

In [32]:
full_x_train = hstack((x_train[model_cols].as_matrix(),x_train_tokens))
full_x_test = hstack((x_test[model_cols].as_matrix(),x_test_tokens))

### Voting Classifier

In [33]:
clf_log2 = LogisticRegression(C= 1, class_weight= None, solver= 'newton-cg', random_state=1)
clf_xgb = XGBClassifier( objective='multi:softprob', scale_pos_weight=1, 
                        max_depth= 9,gamma=0.3, colsample_bytree= 0.9, subsample= 0.8,seed=27)
clf_nb2 = OneVsRestClassifier(MultinomialNB())

In [36]:
clf_voting = VotingClassifier(estimators=[('lr',clf_log2),('xgb', clf_xgb),('nb',clf_nb2)], voting='soft')

In [37]:
clf_voting.fit(full_x_train,y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)), ('xgb', XGBClassifi...assifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          n_jobs=1))],
         n_jobs=1, voting='soft', weights=None)

In [39]:
predict_clf_voting = clf_voting.predict(full_x_test)

In [42]:
print('accuracy: %s'  % accuracy_score(predict_clf_voting,y_test))
print('log_loss: %s'  % log_loss(y_test, clf_voting.predict_proba(full_x_test)))

accuracy: 0.7145
log_loss: 0.735936697571


### Results for submission

In [43]:
x_final_tokens = vec.transform(X_final['COMENTARIO'])

In [44]:
final_x_test = hstack((X_final[model_cols].as_matrix(),x_final_tokens))

In [45]:
final_predict = clf_voting.predict_proba(final_x_test)

In [46]:
final_cod = all_data.loc[20000:,'COD_ENCUESTADO'].copy().reset_index(drop=True)

In [47]:
final_predict_df = pd.concat([final_cod,pd.DataFrame(final_predict,columns = ['NPS1','NPS2','NPS3','NPS4'])],axis=1)

In [49]:
final_predict_df.to_csv('submission.csv',index=False)