<a href="https://colab.research.google.com/github/joereuben/language-identification/blob/main/Multinomial_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,classification_report

In [26]:
def file2sentences(filename):
   """Read a UTF-8 text file and split it into a list of cleaned sentences.

   ``?``, ``!``, ``…`` and line breaks are treated as sentence terminators
   (they are normalised to ``.`` before splitting). Guillemets, ``„``,
   straight double quotes, colons and semicolons are removed. Runs of spaces
   are collapsed to a single space, every sentence is stripped of surrounding
   whitespace, and empty sentences are discarded.

   Parameters
   ----------
   filename : str or path-like
      Path of the text file to read.

   Returns
   -------
   list of str
      The non-empty sentences, in the order they appear in the file.

   Raises
   ------
   OSError
      If the file cannot be opened.
   UnicodeDecodeError
      If the file content is not valid UTF-8.
   """
   with open(filename, "r", encoding="utf-8") as f:
      txt = f.read()

   # Every sentence terminator becomes a plain full stop. An ASCII ellipsis
   # ("...") needs no special case: the empty fragments it produces when
   # splitting are dropped below.
   for terminator in ("?", "!", "…", "\n"):
      txt = txt.replace(terminator, ".")

   # Punctuation that carries no information for the sentences is removed.
   for noise in ("»", "«", ":", ";", "\"", "„"):
      txt = txt.replace(noise, "")

   # Collapse runs of spaces of any length. This runs after the removals
   # above because deleting a character that sat between two spaces creates
   # a new double space, and a single replace pass would leave runs of three
   # or more spaces only partially collapsed.
   while "  " in txt:
      txt = txt.replace("  ", " ")

   stripped = (fragment.strip() for fragment in txt.split("."))
   return [sentence for sentence in stripped if sentence]

In [27]:
# One list of sentences per language, each read from its own text file.
italian = file2sentences("il fratello maggiore sta guardando.txt")
english = file2sentences("big brother is watching.txt")
german = file2sentences("großer bruder schaut zu.txt")
portuguese = file2sentences("irmão mais velho está assistindo.txt")

# Samples and labels are built in the same language order so they stay aligned.
X = np.array([*italian, *english, *german, *portuguese])
y = np.array(
   ['it'] * len(italian)
   + ['en'] * len(english)
   + ['de'] * len(german)
   + ['pt'] * len(portuguese)
)


In [28]:
# Hold out 33% of the sentences for evaluation; the seed is fixed so the split
# is reproducible. NOTE(review): the split is not stratified by language.
X_train, X_test, y_train, y_test = train_test_split(
   X,
   y,
   test_size=0.33,
   random_state=42,
)

In [29]:
# Character-bigram counts feed a multinomial naive Bayes classifier.
cnt = CountVectorizer(analyzer='char', ngram_range=(2, 2))

pipeline = Pipeline(
   steps=[
      ('vectorizer', cnt),
      ('model', MultinomialNB()),
   ]
)

In [30]:
# Fit on the training sentences, then label the held-out sentences
# (Pipeline.fit returns the fitted pipeline, so the calls can be chained).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [31]:
# Confusion matrix of true vs. predicted labels on the held-out sentences.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 8,  0,  0],
       [ 0, 11,  0],
       [ 0,  0,  8]])

In [32]:
# Per-language precision, recall and F1 on the held-out sentences.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          de       1.00      1.00      1.00         8
          en       1.00      1.00      1.00        11
          it       1.00      1.00      1.00         8

    accuracy                           1.00        27
   macro avg       1.00      1.00      1.00        27
weighted avg       1.00      1.00      1.00        27



In [37]:
# Maps the label codes used in `y` to human-readable language names.
languages = {
   "en": "English",
   "it": "Italian",
   "de": "German",
   "pt": "Portuguese",
}

In [34]:
# Classify an English sample paragraph (the text keeps its original line breaks).
lang = pipeline.predict([
   "Braverman lived in France for two years, as an Erasmus \n"
   "Programme student and then as an Entente Cordiale Scholar, where she completed \n"
   "a master's degree in European and French law at Panthéon-Sorbonne University. "
])

print("This text is in", languages[lang[0]])

This text is in English


In [35]:
# Classify a German sample sentence.
lang = pipeline.predict(
   [
      "Braverman lebte zwei Jahre in Frankreich, als Studentin im Erasmus-Programm und dann als Stipendiatin der Entente Cordiale, wo sie einen Master-Abschluss in europäischem und französischem Recht an der Universität Panthéon-Sorbonne absolvierte"
   ]
)
print("This text is in", languages[lang[0]])

This text is in German


In [38]:
# Classify a Portuguese sample sentence.
lang = pipeline.predict(
   [
      "Braverman morou na França por dois anos, como aluna do Programa Erasmus e depois como Entente Cordiale Scholar, onde completou um mestrado em direito europeu e francês na Universidade Panthéon-Sorbonne"
   ]
)
print("This text is in", languages[lang[0]])

This text is in Portuguese
