In [93]:
import pandas as pd
import tensorflow as tf
import re
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [94]:
# Load data first

df = pd.read_csv('lang_data.csv', header=0, delimiter=',')
df

Unnamed: 0,text,language
0,Ship shape and Bristol fashion,English
1,Know the ropes,English
2,Graveyard shift,English
3,Milk of human kindness,English
4,Touch with a barge-pole - Wouldn't,English
...,...,...
2834,Daar’s ‘n geurtjie aan.,Afrikaans
2835,Men's evil manners live in brass; their virtue...,English
2836,Go-faster,English
2837,Red tape,English


In [95]:
# Clean empty text

df = df[df.text.notnull()]
df

Unnamed: 0,text,language
0,Ship shape and Bristol fashion,English
1,Know the ropes,English
2,Graveyard shift,English
3,Milk of human kindness,English
4,Touch with a barge-pole - Wouldn't,English
...,...,...
2834,Daar’s ‘n geurtjie aan.,Afrikaans
2835,Men's evil manners live in brass; their virtue...,English
2836,Go-faster,English
2837,Red tape,English


In [104]:
# Make binary separation of languages and clean everything with null

def label_lang(row):
    language_str = row['language']
    if language_str == 'English':
        return 0
    if language_str == 'Afrikaans':
        return 1
    if language_str == 'Nederlands':
        return 2

    return None

df['language_bin'] = df.apply(lambda row: label_lang(row) , axis=1)
df = df[df.language_bin.notnull()]
df

Unnamed: 0,text,language,language_bin,text_preproc
0,Ship shape and Bristol fashion,English,0,ship shape and bristol fashion
1,Know the ropes,English,0,know the ropes
2,Graveyard shift,English,0,graveyard shift
3,Milk of human kindness,English,0,milk of human kindness
4,Touch with a barge-pole - Wouldn't,English,0,touch with barge pole wouldn t
...,...,...,...,...
2834,Daar’s ‘n geurtjie aan.,Afrikaans,1,daar n geurtjie aan
2835,Men's evil manners live in brass; their virtue...,English,0,men evil manners live in brass their virtues w...
2836,Go-faster,English,0,go faster
2837,Red tape,English,0,red tape


In [105]:
# Make text preprocessing and clean everything with null 

def preprocess_text(row):
    result_text = row['text']
    
    # Remove all the special characters
    result_text = re.sub(r'\W', ' ', result_text)
    
    # remove all single characters
    result_text = re.sub(r'\s+[a-zA-Z]\s+', ' ', result_text)
    
    # Remove single characters from the start
    result_text = re.sub(r'\^[a-zA-Z]\s+', ' ', result_text) 
    
    # Substituting multiple spaces with single space
    result_text = re.sub(r'\s+', ' ', result_text, flags=re.I)
    
    # Removing prefixed 'b'
    result_text = re.sub(r'^b\s+', '', result_text)
    
    # Converting to Lowercase
    result_text = result_text.lower()
    
    return result_text

df['text_preproc'] = df.apply(lambda row: preprocess_text(row) , axis=1)
df =  df[df.text_preproc.notnull()]
df

Unnamed: 0,text,language,language_bin,text_preproc
0,Ship shape and Bristol fashion,English,0,ship shape and bristol fashion
1,Know the ropes,English,0,know the ropes
2,Graveyard shift,English,0,graveyard shift
3,Milk of human kindness,English,0,milk of human kindness
4,Touch with a barge-pole - Wouldn't,English,0,touch with barge pole wouldn t
...,...,...,...,...
2834,Daar’s ‘n geurtjie aan.,Afrikaans,1,daar n geurtjie aan
2835,Men's evil manners live in brass; their virtue...,English,0,men evil manners live in brass their virtues w...
2836,Go-faster,English,0,go faster
2837,Red tape,English,0,red tape


In [106]:
# Transform string to number representation

tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)
words_bin = tfidfconverter.fit_transform(df['text_preproc'].to_numpy()).toarray()
categories = df['language_bin'].to_numpy()

In [107]:
# Split data

X_train, X_test, y_train, y_test = train_test_split(words_bin, categories, test_size=0.2, random_state=0)

In [108]:
# Train data using RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [109]:
# Predict data

y_pred = classifier.predict(X_test)

In [110]:
# Get results

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[408   3   0]
 [  4 126   0]
 [  3   2   7]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       411
           1       0.96      0.97      0.97       130
           2       1.00      0.58      0.74        12

    accuracy                           0.98       553
   macro avg       0.98      0.85      0.90       553
weighted avg       0.98      0.98      0.98       553

0.9783001808318263


In [111]:
# As we can see, we got about 98% accuracy. Perfect!