In [2]:
import numpy as np
import string
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score,precision_score,recall_score,classification_report
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE
from numpy import genfromtxt
import csv
from random import sample
import re

In [3]:
# Read the CSV file of phrases and their language labels.
# The path is relative, so lang_data.csv must be in the working directory.
lang_data = pd.read_csv("lang_data.csv")

In [4]:
# Display the first 10 observations (matches the rendered output below;
# the previous call asked for 20 rows while documenting 10).
lang_data.head(10)

Unnamed: 0,text,language
0,Ship shape and Bristol fashion,English
1,Know the ropes,English
2,Graveyard shift,English
3,Milk of human kindness,English
4,Touch with a barge-pole - Wouldn't,English
5,Sy kan altyd my battery natpiepie.,Afrikaans
6,When the shit hits the fan,English
7,,Afrikaans
8,Egg on,English
9,Drag race,English


In [5]:
# Summarise the dataset. `text` has 2761 non-null values against 2839 for
# `language`, i.e. 78 rows have a missing (NaN) text.
lang_data.describe()

Unnamed: 0,text,language
count,2761,2839
unique,2752,3
top,The law is an ass,English
freq,2,2077


In [6]:
# Summarise the text column per language to see how the classes are balanced
# (English dominates; Nederlands has only 67 phrases).
lang_data.groupby('language').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Afrikaans,639,638,So maklik soos brood en botter.,2
English,2055,2047,As different as chalk and cheese,2
Nederlands,67,67,Gedane zaken nemen geen keer.,1


In [7]:
# Drop every row that contains a NaN in any column (here: the rows whose
# "text" is missing); 2761 phrases remain in the cleaned dataset.
lang_data1=lang_data.dropna(axis=0)


In [8]:
# The NaN-free frame holds 2761 phrases, 9 of which are duplicates
# (2752 unique texts). Keep only the first occurrence of each text.
# The original row labels are preserved (the index is not reset), which
# later label-based lookups rely on.
# (A bare describe() call used to sit here; it was not the cell's last
# expression, so its result was never displayed and has been dropped.)
lang_data1 = lang_data1.drop_duplicates(subset='text')

In [9]:
## Sanity check after removing NaNs and duplicates: the text count should now
## equal the number of unique texts (2752).
lang_data1.describe()

Unnamed: 0,text,language
count,2752,2752
unique,2752,3
top,Against the grain,English
freq,1,2047


In [19]:
#function to clean dataset by removing punctuation marks (tokenization)
def text_process(mess):
    """Tokenise a phrase into words with punctuation stripped.

    Every character found in ``string.punctuation`` is removed (not replaced
    by a space, so "barge-pole" becomes "bargepole"), and the remaining text
    is split on whitespace. Case is left untouched.

    Parameters
    ----------
    mess : str
        The raw phrase.

    Returns
    -------
    list of str
        The cleaned tokens, in order of appearance; empty if nothing is left.
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, mess)
    return ''.join(kept_chars).split()


In [20]:
## Build the bag-of-words vocabulary with CountVectorizer: each phrase is
## tokenised by text_process and the vectorizer is fitted on all cleaned texts.
bow_transformer=CountVectorizer(analyzer=text_process).fit(lang_data1['text'])

In [21]:
## Size of the learned vocabulary (5157 distinct tokens on this dataset;
## tokens are case-sensitive because text_process does not lowercase).
print(len(bow_transformer.vocabulary_))


5157


In [53]:
# Peek at a single phrase: the row labelled 3 (the fourth row of the file).
# The lookup is by index label, not by position.
lang4 = lang_data1.loc[3, 'text']
print(lang4)

Milk of human kindness


In [56]:
# Vectorise that single phrase. The printed sparse matrix lists one
# (row, vocabulary index) pair with its count per token found.
bow4=bow_transformer.transform([lang4])
print(bow4)

  (0, 968)	1
  (0, 3103)	1
  (0, 3238)	1
  (0, 3764)	1


In [62]:
# Map vocabulary index 968 (seen in the vector above) back to its token.
bow_transformer.get_feature_names()[968]

'Milk'

In [22]:
## Vectorise the entire cleaned dataset with the fitted CountVectorizer;
## the result is a 2752 x 5157 sparse count matrix.
lang_bow=bow_transformer.transform(lang_data1['text'])

In [23]:
## Fit a tf-idf transformer on the bag-of-words counts so that tokens can be
## re-weighted by how informative they are across phrases.
tfidf_transform=TfidfTransformer().fit(lang_bow)


In [75]:
## Check the tf-idf weights of the single example phrase vectorised earlier (bow4).
tdidf4=tfidf_transform.transform(bow4)
print(tdidf4)

  (0, 3764)	0.24981968765
  (0, 3238)	0.568222529394
  (0, 3103)	0.540218879567
  (0, 968)	0.568222529394


In [24]:
## Convert the bag-of-words counts of the whole dataset to a tf-idf matrix.
lang_tfidf=tfidf_transform.transform(lang_bow)

In [25]:
## Oversample the imbalanced dataset using SMOTE.
## SMOTE is stochastic; a fixed random_state makes the resampled data (and
## everything computed from it) reproducible across Restart & Run All.
oversample_model = SMOTE(random_state=42)

In [27]:
# Resample the tf-idf features and language labels with the SMOTE model.
# NOTE(review): resampling is applied to the full dataset before the
# train/test split, so the held-out set contains resampled data and the
# reported scores are likely optimistic. Consider splitting first and
# resampling only the training portion.
X_res,Y_res=oversample_model.fit_sample(lang_tfidf,lang_data1['language'])


2047

In [39]:
## Create train and test datasets (67% / 33%).
## A fixed random_state makes the split, and therefore the reported metrics,
## reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, Y_res, test_size=0.33, random_state=42
)


In [40]:
# Train a multinomial Naive Bayes classifier on the training features.
lang_model=MultinomialNB().fit(X_train,y_train)

In [41]:
# Predict the language of every held-out sample.
lang_pred=lang_model.predict(X_test)

In [42]:
## create a pipeline of processing and classification steps
## NOTE(review): left disabled. As written, the pipeline vectorises raw text
## itself, whereas X_train/X_test in this notebook are already tf-idf
## features, so it would have to be fitted on the raw phrases instead.
#pipeline=Pipeline([
#        ('bow',CountVectorizer(analyzer=text_process)),
#        ('tfidf',TfidfTransformer()),
#        ('classifier',MultinomialNB())
#    ])

In [43]:
#pipeline.fit(X_train,y_train)

In [44]:
#lang_pred=pipeline.predict(X_test)

In [45]:
# Per-language precision, recall and F1 on the held-out set.
# NOTE(review): the held-out set is drawn from the resampled data (X_res), so
# these figures should be confirmed on phrases untouched by oversampling.
print(classification_report(y_test,lang_pred))

             precision    recall  f1-score   support

  Afrikaans       1.00      1.00      1.00       662
    English       1.00      0.99      1.00       659
 Nederlands       0.99      1.00      1.00       706

avg / total       1.00      1.00      1.00      2027



In [46]:
# Persist the fitted classifier to disk.
# NOTE(review): only the classifier is saved; bow_transformer and
# tfidf_transform are not, so they must be rebuilt or saved separately before
# the stored model can score new text.
filename = 'Language_model.sav'
joblib.dump(lang_model, filename)

# Export the classifier's coef_ array to CSV for inspection; the frame is
# built from the bare array, so rows and columns get default integer labels.
df = pd.DataFrame(lang_model.coef_)
df.to_csv("Modelcoefficients.csv")
