Task: Predict topics from Blog Authorship Corpus

Columns description:  

* sign = categorical variable - you have to predict that for test dataset
* text  = blogpost text

Most of the reviews are in English, but there are a few in other languages.

Metric: categorical cross-entropy  (logloss)  
baseline: 1.922

In [1]:
from catboost import CatBoostClassifier
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [2]:
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

In [3]:
test.head()

Unnamed: 0.1,Unnamed: 0,sign,text
0,0,Aquarius,... I'll be right there waiti...
1,1,Sagittarius,"Munday Not such a bad monday, as mond..."
2,2,Aquarius,urlLink This Katie on Brandon's lap...
3,3,Virgo,Dammit SIAC really really sux...
4,4,Leo,So I was at church Sunday and I was hav...


In [4]:
df_train = train[["sign","text"]]
df_test = test[["sign","text"]]

In [5]:
df_train = df_train.rename(columns={'sign': 'target'})
df_test = df_test.rename(columns={'sign': 'target'})

In [6]:
df_train.head()

Unnamed: 0,target,text
0,Virgo,... rndm acronym I made up :...
1,Pisces,I am so pumped about my party tomorrow....
2,Sagittarius,Friday! Maybe it's the longer dayligh...
3,Leo,urlLink The story of the blue...
4,Aries,"'In August, the historic Shugboroug..."


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [11]:
df_train.target.unique()

array(['Virgo', 'Pisces', 'Sagittarius', 'Leo', 'Aries', 'Libra',
       'Capricorn', 'Scorpio', 'Gemini', 'Cancer', 'Aquarius', 'Taurus'], dtype=object)

In [12]:
vec = TfidfVectorizer(lowercase=True, analyzer='word', ngram_range=(1,1), norm=None, use_idf=False, binary=True)
X = vec.fit_transform(df_train.text)
print('feature matrix shape', X.shape)


# encode class labels
label_enc = LabelEncoder().fit(df_train.target)

y_train = label_enc.transform(df_train.target)

label_enc.classes_

feature matrix shape (9600, 57572)


array(['Aquarius', 'Aries', 'Cancer', 'Capricorn', 'Gemini', 'Leo',
       'Libra', 'Pisces', 'Sagittarius', 'Scorpio', 'Taurus', 'Virgo'], dtype=object)

In [35]:

# fit our prediction model
model = MLPClassifier(alpha=1)

model.fit(X, y_train)





MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [37]:
print('train', metrics.log_loss(y_train, model.predict_proba(X)))

# perfomance on test dataset
X_test = vec.transform(df_test.text)
y_pred = model.predict(X_test)
y_test = label_enc.transform(df_test.target)
print('test', metrics.log_loss(y_test, model.predict_proba(X_test)))

train 0.56924753333
test 1.73311294999


In [40]:
print('train', metrics.log_loss(y_train, model.predict_proba(X)))
print('train', metrics.log_loss(y_test, model.predict_proba(X_test)))

train 0.56924753333
train 1.73311294999
