In [1]:
import json
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

### Load in datasets

In [2]:
# dataset (training)
data = pd.concat([pd.Series(json.loads(line)) for line in open('train.json')], axis=1).T

In [3]:
# testset
test = pd.concat([pd.Series(json.loads(line)) for line in open('dev.json')], axis=1).T

In [4]:
# excludes other columns for now
data = data[['text', 'lang']]
test = test[['text', 'lang']]

In [5]:
# shuffle dataset
data = data.reindex(np.random.permutation(data.index))

In [6]:
train_text = data['text'].values
train_y = data['lang'].values

test_text = test['text'].values
test_y = test['lang'].values

### Preprocess data

In [None]:
import re

smilies_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    smilies_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @ mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hashtags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# re.VERBOSE allows for spaces to be ignored
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
smilies_re = re.compile(r'^'+smilies_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

# will return a list
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [None]:
for i in range(0, len(test)):
    # @ mentions
    test['text'][i] = re.sub(r'(?:@[\w_]+)', "", test['text'][3]).lstrip().rstrip()
    # urls
    test['text'][i] = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', "", test['text'][3]).lstrip().rstrip()
    # numbers
    test['text'][i] = re.sub(r'(?:(?:\d+,?)+(?:\.?\d+)?)', "", test['text'][i])

### Extract Features

In [7]:
# pipeline_main will be the first to simply get the predictions
# and confidence scores initially.
# this pipeline makes use of uni-grams and bi-grams with bagging of words
# primarily useful SO FAR with the Latin alphabet (a,b,c,..,z)
# handles en, fr, etc.
pipeline_main = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', MultinomialNB())
])

# uni - tri grams with bagging of characters
# TRIAL - using for asian languages, JP, KO, CN
pipeline_sea = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,3), analyzer=u'char')),
    ('classifier', MultinomialNB())
])

### Evaluation

In [8]:
# main
pipeline_main.fit(train_text, train_y)
predictions_main = pipeline_main.predict(test_text)

In [74]:
main_confidence = pipeline_main.predict_proba(test_text)

threshold = 0.9

new_text = []
new_y = []
new_test = []
new_test_y = []

index = 0
correct = 0
above = 0
for i in main_confidence:
    if (i.max() > threshold):
        above += 1
        if (predictions_main[index] == test_y[index]):
            # true positives
            correct += 1
    else:
        # try to find false negatives
        new_text.append(train_text[index])
        new_y.append(train_y[index])
        new_test.append(test_text[index])
        new_test_y.append(test_y[index])
    index += 1

#correct/above

In [101]:
#pipeline_sea.fit(new_text, new_y)
pipeline_sea.fit(train_text, train_y)
predictions_sea = pipeline_main.predict(new_test)

In [102]:
np.mean(predictions_sea == new_test_y)

0.20062695924764889

In [103]:
sea_confidence = pipeline_sea.predict_proba(new_test)

In [129]:
sea_confidence[4].max()

1.0

In [130]:
predictions_sea[4]

'en'

In [131]:
new_test_y[4]

'ar'

In [126]:
new_test[3]

'Весна!!!!'

In [108]:
test_text[17]

'Весна!!!!'

In [49]:
test_y[15]

'mr'

In [64]:
predictions_main[2960]

'en'

In [56]:
main_confidence[1445].max()

0.99779630566192523

In [132]:
test_text[2960]

'#emissiondemonenfance Scoubidou scooby-doo show generique fr http://t.co/m9IMMhO via @youtube\n'