In [1]:
from google.colab import drive
drive.mount('gdrive/')

Mounted at gdrive/


In [2]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.10.4-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4393198 sha256=e7e7fadfb9fb3317e8cce17d7c9bbb63270f2d7658a1cc6e60879dd9e3c5c811
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.4


In [39]:
import pandas
df = pandas.read_csv("/content/gdrive/MyDrive/Colab Notebooks/datasets/4_genetic_sequencing.csv")

from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

import fasttext
ft_model = fasttext.load_model('/content/gdrive/MyDrive/Colab Notebooks/additional_files/lid.176.bin')

def encode_text(text):
    text = text.replace('\n', '')
    return ft_model.get_sentence_vector(text)

train_df['text_encoded'] = train_df['seq'].apply(lambda x: encode_text(x))
test_df['text_encoded'] = test_df['seq'].apply(lambda x: encode_text(x))

import numpy as np
x_train = np.vstack(train_df['text_encoded'].values)
x_test = np.vstack(test_df['text_encoded'].values)
y_train = train_df['id']
y_test = test_df['id']

language_labels = df['id'].unique()
label_to_id = {label: i for i, label in enumerate(language_labels)}

y_train = y_train.apply(lambda x: label_to_id[x])
y_test = y_test.apply(lambda x: label_to_id[x])



#### 1. Using `Multinomial NB`

In [40]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(x_train.clip(min=0), y_train)

y_pred_MNB = MNB.predict(x_test.clip(min=0))

from sklearn.metrics import accuracy_score
print(f"MNB & FastText: {accuracy_score(y_test, y_pred_MNB) * 100 :.2f}%")

MNB & FastText: 48.74%


#### 2. Using `SVM`

In [41]:
from sklearn.svm import SVC
SVM = SVC(kernel='linear')
SVM.fit(x_train, y_train)

y_pred_SVM = SVM.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"SVM & FastText: {accuracy_score(y_test, y_pred_SVM) * 100 :.2f}%")

SVM & FastText: 64.47%


#### 3. Using `Random Forest`

In [42]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF.fit(x_train.clip(min=0), y_train)

y_pred_RF = RF.predict(x_test.clip(min=0))

from sklearn.metrics import accuracy_score
print(f"Random Forest & FastText: {accuracy_score(y_test, y_pred_RF) * 100 :.2f}%")

Random Forest & FastText: 62.36%


#### 4. Using `Gradient Boosting Classifier`

In [43]:
from sklearn.ensemble import GradientBoostingClassifier
GB = GradientBoostingClassifier()
GB.fit(x_train, y_train)

y_pred_GB = GB.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"Gradient Boosting Classsifier & FastText: {accuracy_score(y_test, y_pred_GB) * 100 :.2f}%")

Gradient Boosting Classsifier & FastText: 63.20%


#### 5. Using `Decision Tree Classifier`

In [44]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(x_train, y_train)

y_pred_DT = DT.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"Decision Tree Classsifier & FastText: {accuracy_score(y_test, y_pred_DT) * 100 :.2f}%")

Decision Tree Classsifier & FastText: 55.62%


#### 6. Using `K-Nearest Neighbors Classifier`

In [45]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier()
KNN.fit(x_train, y_train)

y_pred_KNN = KNN.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"K-Nearest Neighbors Classifier & FastText: {accuracy_score(y_test, y_pred_KNN) * 100 :.2f}%")

K-Nearest Neighbors Classifier & FastText: 61.66%


#### 7. Using `AdaBoost Classifier`

In [46]:
from sklearn.ensemble import AdaBoostClassifier
ADA = AdaBoostClassifier()
ADA.fit(x_train, y_train)

y_pred_ADA = ADA.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"AdaBoost Classifier & FastText: {accuracy_score(y_test, y_pred_ADA) * 100 :.2f}%")

AdaBoost Classifier & FastText: 60.53%


#### 8. Using `Logistic Regression`

In [47]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(x_train, y_train)

y_pred_LR = LR.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"Logistic Regression & FastText: {accuracy_score(y_test, y_pred_LR) * 100 :.2f}%")

Logistic Regression & FastText: 63.76%


#### 9. Using `Extra Trees Classifier`

In [50]:
from sklearn.ensemble import ExtraTreesClassifier
ET = ExtraTreesClassifier()
ET.fit(x_train, y_train)

y_pred_ET = ET.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"Extra Trees Classifier & TF-IDF: {accuracy_score(y_test, y_pred_ET) * 100 :.2f}%")

Extra Trees Classifier & TF-IDF: 65.03%


#### 10. Using `Gaussian Process Regressor`

In [51]:
from sklearn.gaussian_process import GaussianProcessRegressor
GPR = GaussianProcessRegressor()
GPR.fit(x_train, y_train)

y_pred_GPR = GPR.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"Gaussian Process Regressor & TF-IDF: {accuracy_score(y_test, y_pred_GPR.round()) * 100 :.2f}%")

Gaussian Process Regressor & TF-IDF: 22.19%


#### 11. Using `Ridge Classifier`

In [52]:
from sklearn.linear_model import RidgeClassifier
RR = RidgeClassifier()
RR.fit(x_train, y_train)

y_pred_RR = RR.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"Ridge Classifier & TF-IDF: {accuracy_score(y_test, y_pred_RR) * 100:.2f}%")

Ridge Classifier & TF-IDF: 64.04%


#### 12. Using `Elastic Net Regression`

In [53]:
from sklearn.linear_model import ElasticNet
EN = ElasticNet()
EN.fit(x_train, y_train)

y_pred_EN = EN.predict(x_test).round()

print(f"Elastic Net Regression & TF-IDF: {accuracy_score(y_test, y_pred_EN) * 100 :.2f}%")

Elastic Net Regression & TF-IDF: 45.22%


#### 13. Using `Multilayer Perceptron`

In [54]:
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier()
MLP.fit(x_train, y_train)

y_pred_MLP = MLP.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"Multilayer Perceptron & TF-IDF: {accuracy_score(y_test, y_pred_MLP) * 100 :.2f}%")

Multilayer Perceptron & TF-IDF: 64.04%




#### 14. Using `Lasso Regressor`

In [55]:
from sklearn.linear_model import Lasso
LASSO = Lasso()
LASSO.fit(x_train, y_train)

y_pred_LASSO = LASSO.predict(x_test)
y_pred_LASSO = [round(val) for val in y_pred_LASSO]

from sklearn.metrics import accuracy_score
print(f"Lasso Regression & TF-IDF: {accuracy_score(y_test, y_pred_LASSO) * 100 :.2f}%")

Lasso Regression & TF-IDF: 45.22%


#### 15. Using `XG Boost`

In [56]:
import xgboost as xgb
XGB = xgb.XGBClassifier()
XGB.fit(x_train, y_train)

y_pred_XGB = XGB.predict(x_test)

from sklearn.metrics import accuracy_score
print(f"XG Boost & TF-IDF: {accuracy_score(y_test, y_pred_XGB) * 100 :.2f}%")

XG Boost & TF-IDF: 64.19%


#### 16. Using `CNN Classifier`

In [None]:
import tensorflow as tf

CNN = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(x_train.shape[1], 1)),
    tf.keras.layers.MaxPooling1D(5),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(len(language_labels), activation='softmax')
])

CNN.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

x_train_cnn = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
x_test_cnn = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))

history = CNN.fit(x_train_cnn, y_train, epochs=100, batch_size=32, validation_data=(x_test_cnn, y_test))

test_loss, test_acc = CNN.evaluate(x_test_cnn, y_test)

print(f"CNN & FastText: {test_acc * 100:.2f}%")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78