In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gitanjalinambiar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gitanjalinambiar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#import the libraries
import time
import pandas as pd
import numpy as np
import json
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
#load csv file
data = pd.read_csv("sentences.csv")
print(data.head(10))

   id lan_code                sentence
0   1      cmn                  我們試試看！
1   2      cmn                 我该去睡觉了。
2   3      cmn                 你在干什麼啊？
3   4      cmn                  這是什麼啊？
4   5      cmn  今天是６月１８号，也是Muiriel的生日！
5   6      cmn           生日快乐，Muiriel！
6   7      cmn          Muiriel现在20岁了。
7   8      cmn           密码是"Muiriel"。
8   9      cmn                我很快就會回來。
9  10      cmn                   我不知道。


In [5]:
# Load JSON reference
with open('lan_to_language.json') as json_file:
    replacement_data = json.load(json_file)

In [6]:
# Replace values in the second column based on JSON reference
column_to_replace = 'lan_code'  # Change this to the actual name of your second column
for key, value in replacement_data.items():
    data.loc[data[column_to_replace] == key, column_to_replace] = value

In [7]:
# Save the modified DataFrame back to CSV
data.to_csv('modified_sentence.csv', index=False)

In [8]:
##cleaning
# removing punctuations
column_to_clean='sentence'
data[column_to_clean] = data[column_to_clean].str.translate(str.maketrans("", "", string.punctuation))

In [9]:
#removing numbers
data[column_to_clean] = data[column_to_clean].astype(str).str.replace('\d+', '', regex=True)

In [10]:
#removing special characters
data[column_to_clean] = data[column_to_clean].str.replace('[^a-zA-Z\s]', '', regex=True)

In [11]:
#lemmatization
lemmatizer = WordNetLemmatizer()
data[column_to_clean] = data[column_to_clean].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [12]:
#removing null values
data = data.dropna(subset=[column_to_clean])

In [13]:
data = pd.read_csv("modified_sentence.csv")

In [14]:
print(data.head(10))

   id          lan_code                sentence
0   1  Mandarin Chinese                  我們試試看！
1   2  Mandarin Chinese                 我该去睡觉了。
2   3  Mandarin Chinese                 你在干什麼啊？
3   4  Mandarin Chinese                  這是什麼啊？
4   5  Mandarin Chinese  今天是６月１８号，也是Muiriel的生日！
5   6  Mandarin Chinese           生日快乐，Muiriel！
6   7  Mandarin Chinese          Muiriel现在20岁了。
7   8  Mandarin Chinese           密码是"Muiriel"。
8   9  Mandarin Chinese                我很快就會回來。
9  10  Mandarin Chinese                   我不知道。


In [15]:
#sum() gives the count of all the null values
#count() gives the count non-null values
data.isnull().sum()

id          0
lan_code    0
sentence    0
dtype: int64

In [16]:
data.drop_duplicates('sentence')

Unnamed: 0,id,lan_code,sentence
0,1,Mandarin Chinese,我們試試看！
1,2,Mandarin Chinese,我该去睡觉了。
2,3,Mandarin Chinese,你在干什麼啊？
3,4,Mandarin Chinese,這是什麼啊？
4,5,Mandarin Chinese,今天是６月１８号，也是Muiriel的生日！
...,...,...,...
10341807,10794524,Spanish,Quiero este libro por favor.
10341808,10794525,Spanish,Los han hecho huir.
10341809,10794526,Spanish,Los botaron.
10341810,10794527,Spanish,Los hicieron correr.


In [17]:
data["lan_code"].value_counts()

lan_code
English             1586621
Russian              909951
Italian              805104
Turkish              717897
Esperanto            685643
                     ...   
Southern Haida            1
Rendille                  1
Louisiana Creole          1
Nyunga                    1
Cuyonon                   1
Name: count, Length: 404, dtype: int64

In [18]:
x=np.array(data["sentence"])
y=np.array(data["lan_code"])

cv=CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33,random_state=42) 
         

In [19]:
start_time = time.time()

In [20]:
model=PassiveAggressiveClassifier(max_iter=1000, random_state=42)
model.fit(X_train,y_train)
predictions=model.predict(X_test)
accuracy=accuracy_score(y_test,predictions)

In [21]:
accuracy

0.9641505298584915

In [22]:
end_time = time.time()

In [23]:
total_time=end_time-start_time
print(total_time)

3495.5179409980774


In [28]:
user = input("Enter a Text: ")
data = cv.transform([user]).toarray()
output = model.predict(data)
print(output)

Enter a Text: 안녕하세요. 잘 지내죠
['Korean']
