Here, we will investigate how good a score can be achieved using character 9-grams.

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import sys
import random
import math
import gc
from scipy.stats import norm
from nltk import FreqDist, ngrams, sent_tokenize, word_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn import svm
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, ParameterGrid, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
import _pickle as pickle

def save_object(obj, filename):
    """Pickle `obj` into the file `filename`.

    @obj: the object to serialise.
    @filename: path of the target file; it is opened in 'wb' mode, so an existing file is replaced.
    """
    with open(filename, 'wb') as sink:
        writer = pickle.Pickler(sink)
        # Fast mode switches off the pickler's memo. That avoids the bookkeeping for large
        # structures, but it must not be used on self-referential objects.
        writer.fast = True
        writer.dump(obj)
        
def load_object(filename):
    """Return the object stored in the pickle file `filename`.

    @filename: path of a pickle file, e.g. one written by save_object.

    NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

# Load the data

In [2]:
def char_distribution(m, lowerfreqlimit, training, LANGUAGES, labels=None):
    """Calculate the char m grams distribution, i.e. count character k-grams (k = 1..m) per language.

    @m: consider k-grams up to and including m for characters.
    @lowerfreqlimit: number below which we consider words misspellings, odd words out or unique.
        It is used for pruning: a k-gram with k > 1 is only counted if the counts of its two
        (k-1)-subgrams (first character dropped, last character dropped) add up to more than
        2*lowerfreqlimit within the same language.
    @training: training data to retrieve the language distribution from, as a pandas Series of
        texts (any object whose .items() yields (key, text) pairs works).
    @LANGUAGES: languages based on which we classify. Either native languages or
        "non-native"/"native" divide is possible.
    @labels: optional iterable with the language of every text, in the same order as `training`.
        Each text is then counted towards its own language. If omitted, every text is counted
        towards the last entry of LANGUAGES; this is the historical behaviour and is only
        meaningful when LANGUAGES holds a single class, so a warning is issued otherwise.
    @return: dict mapping language -> {k: FreqDist}, the FreqDist being keyed by k-tuples of
        characters.
    @raises ValueError: if `labels` does not match `training` in length, contains a language that
        is not in LANGUAGES, or if there are texts to count but LANGUAGES is empty.
    """
    import warnings

    # LANGUAGES may be a numpy array (e.g. from Series.unique()); a list is safe to truth-test.
    languages = list(LANGUAGES)
    char_dist = {
        language: {k: FreqDist() for k in range(1, m + 1)}
        for language in languages
    }

    # Series.iteritems() was removed in pandas 2.0; .items() is the equivalent spelling.
    texts = [text for _, text in training.items()]

    if labels is None:
        if texts and not languages:
            raise ValueError("LANGUAGES must not be empty when there are texts to count")
        if len(languages) > 1:
            warnings.warn(
                "char_distribution called without labels: every text is counted towards {!r}; "
                "pass labels= to obtain per-language counts".format(languages[-1])
            )
        text_languages = [languages[-1]] * len(texts) if texts else []
    else:
        text_languages = list(labels)
        if len(text_languages) != len(texts):
            raise ValueError(
                "labels has {} entries but training has {} texts".format(
                    len(text_languages), len(texts)
                )
            )
        unknown = set(text_languages) - set(char_dist)
        if unknown:
            raise ValueError(
                "labels contains languages missing from LANGUAGES: {}".format(
                    sorted(unknown, key=str)
                )
            )

    # Sentence splitting does not depend on k, so do it once instead of once per pass.
    corpus = [
        (language, sent_tokenize(text))
        for language, text in zip(text_languages, texts)
    ]

    threshold = 2 * lowerfreqlimit
    for k in range(1, m + 1):
        for language, sentences in corpus:
            counts = char_dist[language][k]
            shorter = char_dist[language][k - 1] if k > 1 else None
            for sentence in sentences:
                # Note, for any gram, there exist 2 subgrams: all but the first and all but the
                # last element. Only count the gram if the total count of these subgrams exceeds
                # the lower limit. This prevents an unnecessary combinatorial explosion.
                for gram in ngrams(sentence, k):
                    if k == 1 or shorter.get(gram[1:], 0) + shorter.get(gram[:-1], 0) > threshold:
                        counts[gram] += 1

        print("Completed counting all {}-grams".format(k))

    return char_dist

In [None]:
# Load the tab-separated training and development sets.
# error_bad_lines was deprecated in pandas 1.3 and removed in 2.0. on_bad_lines="warn" keeps the
# effect of the former error_bad_lines=False: malformed rows are dropped and reported as warnings.
training = pd.read_csv("python_data/train", sep="\t", on_bad_lines="warn", encoding="utf-8")
validation = pd.read_csv("python_data/development", sep="\t", on_bad_lines="warn", encoding="utf-8")

In [None]:
# Count character k-grams up to length 10 with a pruning limit of 20, then persist the result.
# NOTE(review): the introduction speaks of 9-grams while m is 10 here - confirm which is intended.
# NOTE(review): only the texts and the set of distinct languages are passed, so the call carries
# no per-text language information - verify the per-language counts before relying on them.
char_dis = char_distribution(
    m=10,
    lowerfreqlimit=20,
    training=training.text_clean,
    LANGUAGES=training.native_lang.unique(),
)
save_object(obj=char_dis, filename="trained_char_dis_10")