In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
# test set
# dev.json is read as JSON Lines: every line is parsed on its own and becomes
# one row of the frame. The `with` block closes the file handle as soon as all
# lines are consumed, and the explicit encoding keeps the multilingual text
# from depending on the platform default.
with open('dev.json', encoding='utf-8') as dev_file:
    test = pd.concat([pd.Series(json.loads(line)) for line in dev_file], axis=1).T

In [3]:
# data set
# train.json is read as JSON Lines: every line is parsed on its own and becomes
# one row of the frame. The `with` block closes the file handle as soon as all
# lines are consumed, and the explicit encoding keeps the multilingual text
# from depending on the platform default.
with open('train.json', encoding='utf-8') as train_file:
    data = pd.concat([pd.Series(json.loads(line)) for line in train_file], axis=1).T

In [None]:
# slicing
# `.ix` is deprecated and has been removed from pandas. On the integer row
# index built by the loading cell it behaved as a label-based slice, so `.loc`
# is the direct replacement: it keeps the rows labelled 0 through 3702
# (inclusive on both ends).
data = data.loc[:3702]

In [4]:
# Keep only the tweet text and its language label for now; every other
# column is dropped from both frames.
kept_columns = ['text', 'lang']
data = data[kept_columns]
test = test[kept_columns]

In [5]:
# shuffle dataset
# A dedicated, seeded generator makes the row order identical on every
# Restart & Run All, so order-sensitive models further down give repeatable
# scores. The global NumPy random state is left untouched.
shuffle_rng = np.random.RandomState(42)
data = data.reindex(shuffle_rng.permutation(data.index))

### Document Cleaning
- Remove links
- Remove usernames
- Remove hashtags
- Remove smilies/emojis (TODO: not implemented yet)

In [36]:
# links, usernames and hashtags - no improvement??

import re


def strip_twitter_tokens(text):
    """Return `text` with links, @-mentions and #-hashtags removed.

    The three substitutions run one after another, in this order, and each
    deletes the marker together with every non-whitespace character that
    follows it. Surrounding whitespace is left as it is.
    """
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\S+", "", text)
    return re.sub(r"#\S+", "", text)


# Replace the whole column at once instead of writing cell by cell through
# chained indexing (`frame['text'][i] = ...`). The chained form raises
# SettingWithCopy warnings, is silently ignored under pandas copy-on-write,
# and only works when the row labels happen to be exactly 0..len-1.
test = test.assign(text=test['text'].apply(strip_twitter_tokens))
data = data.assign(text=data['text'].apply(strip_twitter_tokens))

In [37]:
# Spot-check one cleaned tweet: the row labelled 6 of the test set.
test.loc[6, 'text']

'ja der sinndeslebens liee sich auf twitter übrigens auf globaler ebene sehr gut diskutieren meaningoflife'

### Extract Features

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training text and turn every document into a
# row of token counts; the shape is shown as (documents, vocabulary size).
train_documents = data['text'].values
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(train_documents)
counts.shape

(37022, 1039132)

### Classify

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Language labels, in the same row order as the count matrix.
targets = data['lang'].values

# Multinomial Naive Bayes fitted on the raw token counts.
classifier = MultinomialNB()
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
# A handful of hand-picked strings used to sanity-check the classifier.
examples = [
    '안녕 안녕 サム', '안녕하세요', 'test', 'francais',
    'fuck', 'dick', 'trump', 'hi',
    'sorry', 'lmao', 'french', 'こんにちは',
]

# Reuse the vocabulary learned from the training text, then predict.
example_counts = count_vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['ko', 'ko', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'ja'], 
      dtype='<U2')

### Pipelining
A `Pipeline` merges `feature extraction` and `classification` into a single estimator, so both steps are fitted and applied with one call.

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level counts of 1- to 3-grams feeding a multinomial Naive Bayes
# model, bundled so that raw text can be passed straight to fit/predict.
char_ngram_counts = CountVectorizer(ngram_range=(1, 3), analyzer=u'char')
pipeline = Pipeline([
    ('vectorizer', char_ngram_counts),
    ('classifier', MultinomialNB()),
])

# fit() returns the fitted pipeline, so the example predictions can be
# requested in the same expression.
pipeline.fit(data['text'].values, data['lang'].values).predict(examples)

array(['ko', 'ko', 'fr', 'fr', 'de', 'en', 'es', 'en', 'en', 'en', 'en',
       'ja'], 
      dtype='<U2')

### Evaluation

In [13]:
# Plain arrays of documents and labels for the training and test sets.
train_text, train_y = data['text'].values, data['lang'].values
test_text, test_y = test['text'].values, test['lang'].values

In [21]:
# Refit the current pipeline on the training set and report test accuracy:
# the fraction of test documents whose predicted label equals the true one.
pipeline.fit(train_text, train_y)
predictions = pipeline.predict(test_text)
(predictions == test_y).mean()

0.73779527559055114

In [52]:
# Same evaluation as the previous cell, re-executed after `pipeline` was
# redefined (see the execution counts): refit, predict, report test accuracy.
pipeline.fit(train_text, train_y)
predictions = pipeline.predict(test_text)
(predictions == test_y).mean()

0.79988751406074243

In [57]:
# Accuracy on the data the pipeline was fitted on, for comparison with the
# test accuracy above.
training_accuracy = pipeline.score(train_text, train_y)
print("Training score: {0:.1f}%".format(training_accuracy * 100))

Training score: 96.3%


In [53]:
# Class-probability estimates from the fitted pipeline, one row per test
# document; the thresholding cell below compares each row's maximum against
# a cut-off.
classifier_confidence = pipeline.predict_proba(test_text)

### Attempt at thresholding to block out "unk"

In [54]:
threshold = 0.7

accepted_confidence = []
test_pred = []

# Widen the dtype first: the default <U2 holds only two characters, so the
# three-character label 'unk' would not fit. Note the array becomes bytes.
predictions = predictions.astype('|S3')

# Relabel every document whose highest class probability is below the
# threshold as unknown.
low_confidence = classifier_confidence.max(axis=1) < threshold
predictions[low_confidence] = 'unk'

In [70]:
# Probability row of test document 10, inspected as an example.
classifier_confidence[10]

array([  8.52585183e-300,   0.00000000e+000,   2.36931854e-038,
         1.00000000e+000,   2.90737689e-045,   5.11873832e-302,
         2.69191605e-055,   7.82552822e-245,   0.00000000e+000,
         1.59865007e-082,   1.12133698e-246,   5.13654961e-243,
         2.06879445e-285,   0.00000000e+000,   1.76953483e-015,
         6.47347596e-285,   4.05826666e-280,   6.47005857e-284,
         0.00000000e+000,   3.75940966e-168])

In [71]:
# True label of test document 10, to compare against its prediction.
test_y[10]

'unk'

In [72]:
# Predicted label of test document 10; it displays as bytes because the
# predictions array was cast to '|S3' in the thresholding cell.
predictions[10]

b'en'

In [55]:
# Count the documents whose thresholded prediction matches the true label.
# The predictions are bytes at this point, so each one is decoded before it
# is compared with the text label.
correct = sum(
    1
    for position, predicted in enumerate(predictions)
    if predicted.decode('utf-8') == test_y[position]
)

In [56]:
# Accuracy after the attempt to map low-confidence predictions to 'unk':
# matching documents divided by the number of predictions.
thresholded_accuracy = correct / len(predictions)
thresholded_accuracy

0.805511811023622

### Attempt with SVM

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss, L2 penalty) trained with stochastic gradient descent
# on word counts. `n_iter` was removed from SGDClassifier; `max_iter=5` with
# `tol=None` keeps the same fixed five passes over the data by disabling the
# tolerance-based early stop.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SGDClassifier(loss='hinge', penalty='l2',
                                 alpha=1e-3, max_iter=5, tol=None,
                                 random_state=42))
])

# Fit on the training set and report accuracy on the test set.
pipeline.fit(train_text, train_y)
predictions = pipeline.predict(test_text)
np.mean(predictions == test_y)

### Naive Bayes Attempt With tf & idf

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Term-frequency scaling only (idf switched off) of the word-count matrix.
tf_transformer = TfidfTransformer(use_idf=False).fit(counts)
X_train_tf = tf_transformer.transform(counts)
X_train_tf.shape

In [None]:
# Full tf-idf weighting of the same word-count matrix.
tfidf_transformer = TfidfTransformer().fit(counts)
X_train_tfidf = tfidf_transformer.transform(counts)
X_train_tfidf.shape

In [None]:
### TF - Naive Bayes

# Fit Naive Bayes on the term-frequency features of the training set.
classifier = MultinomialNB().fit(X_train_tf, targets)

# The test text goes through the already-fitted vectorizer and transformer,
# so it is encoded with the training vocabulary and scaling.
X_new_counts = count_vectorizer.transform(test_text)
X_new_tf = tf_transformer.transform(X_new_counts)

predictions = classifier.predict(X_new_tf)

# Test accuracy: fraction of predictions equal to the true label.
np.mean(predictions == test_y)

In [None]:
### TFIDF - Naive Bayes

# Fit Naive Bayes on the tf-idf features of the training set.
classifier = MultinomialNB().fit(X_train_tfidf, targets)

# The test text goes through the already-fitted vectorizer and transformer,
# so it is encoded with the training vocabulary and idf weights.
X_new_counts = count_vectorizer.transform(test_text)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predictions = classifier.predict(X_new_tfidf)

# Test accuracy: fraction of predictions equal to the true label.
np.mean(predictions == test_y)

### SVM Attempt with TF & TfIDF

In [None]:
# Linear SVM (hinge loss, L2 penalty) on the term-frequency features.
# `n_iter` was removed from SGDClassifier; `max_iter=5` with `tol=None` keeps
# the same fixed five passes by disabling the tolerance-based early stop.
classifier = SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, max_iter=5, tol=None,
                           random_state=42).fit(X_train_tf, targets)

# Encode the test text with the already-fitted vectorizer and transformer.
X_new_counts = count_vectorizer.transform(test_text)
X_new_tf = tf_transformer.transform(X_new_counts)

predictions = classifier.predict(X_new_tf)

# Test accuracy: fraction of predictions equal to the true label.
np.mean(predictions == test_y)

In [None]:
# Linear SVM (hinge loss, L2 penalty) on the tf-idf features.
# `n_iter` was removed from SGDClassifier; `max_iter=5` with `tol=None` keeps
# the same fixed five passes by disabling the tolerance-based early stop.
classifier = SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, max_iter=5, tol=None,
                           random_state=42).fit(X_train_tfidf, targets)

# Encode the test text with the already-fitted vectorizer and transformer.
X_new_counts = count_vectorizer.transform(test_text)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predictions = classifier.predict(X_new_tfidf)

# Test accuracy: fraction of predictions equal to the true label.
np.mean(predictions == test_y)