In [3]:
from google.colab import drive
# Mount Google Drive at an absolute path so the mount point lines up with the
# "/content/gdrive/..." paths used when files are read later in the notebook,
# independent of the current working directory.
drive.mount("/content/gdrive")

Mounted at gdrive/


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from gensim.models import KeyedVectors


# Load the GloVe word embeddings into a {token: float32 vector} dictionary
glove_model = {}
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale (GloVe text files are distributed as UTF-8).
with open("/content/gdrive/MyDrive/Colab Notebooks/additional_files/glove.6B.100d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if len(values) < 2:
            # Blank (or token-only) line: there is no vector to store.
            continue
        # First field is the token; the remaining fields are its components.
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        glove_model[word] = vector

# Read the genetic-sequencing CSV from the mounted Drive folder
df = pd.read_csv(
    "/content/gdrive/MyDrive/Colab Notebooks/datasets/4_genetic_sequencing.csv"
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable. Inputs come from the "seq" column, targets from the "id" column.
x_train, x_test, y_train, y_test = train_test_split(
    df["seq"],
    df["id"],
    random_state=42,
    test_size=0.2,
)

# Encode the text using the GloVe model
def encode_text(text, embeddings=None, dim=None):
    """Return the average embedding of the whitespace-separated tokens in ``text``.

    Tokens that are not present in ``embeddings`` are ignored. When no token
    is found, a zero vector is returned instead.

    Parameters
    ----------
    text : str
        Whitespace-delimited text to encode.
    embeddings : mapping of str to 1-D array-like, optional
        Token-to-vector lookup. Defaults to the module-level ``glove_model``.
    dim : int, optional
        Length of the zero vector returned when no token matches. Defaults to
        the length of the first vector in ``embeddings`` (or 100 when the
        mapping is empty), so the fallback always agrees with the loaded
        embedding size instead of relying on a hard-coded constant.

    Returns
    -------
    numpy.ndarray
        1-D float32 vector: the element-wise mean of the matched vectors, or
        zeros of length ``dim`` when nothing matched.
    """
    if embeddings is None:
        embeddings = glove_model
    if dim is None:
        dim = len(next(iter(embeddings.values()))) if embeddings else 100

    # NOTE(review): tokens are looked up exactly as written (no case folding);
    # confirm that this matches the casing of the embedding vocabulary.
    matched = [embeddings[token] for token in text.split() if token in embeddings]

    if not matched:
        # float32 keeps the fallback consistent with the averaged vectors so
        # stacking the rows does not silently mix dtypes.
        return np.zeros(dim, dtype="float32")
    return np.mean(matched, axis=0).astype("float32", copy=False)

# Dense GloVe features: one averaged-embedding row per input text
x_train_encoded = np.vstack([encode_text(sample) for sample in x_train])
x_test_encoded = np.vstack([encode_text(sample) for sample in x_test])

# Bag-of-words counts; the vocabulary is learned from the training split only
# and then reused unchanged for the test split
vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

# Map the target values to integer class indices (fitted on the training
# targets, then applied to the test targets)
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

# Final design matrices: embedding columns first, then the densified counts
x_train_combined = np.concatenate(
    (x_train_encoded, x_train_vectorized.toarray()), axis=1
)
x_test_combined = np.concatenate(
    (x_test_encoded, x_test_vectorized.toarray()), axis=1
)

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# MultinomialNB rejects negative feature values at fit time, and averaged
# embedding components can be negative, so negatives are clipped to zero.
# The same clipping is applied to the test matrix: previously only the
# training data was clipped, so the model was scored on features it had
# never been fitted on.
MNB = MultinomialNB()
MNB.fit(x_train_combined.clip(min=0), y_train_encoded)

y_pred_MNB = MNB.predict(x_test_combined.clip(min=0))

# Test accuracy as a percentage with two decimals
print(f"MNB & GloVe: {accuracy_score(y_test_encoded, y_pred_MNB) * 100 :.2f}%")

MNB & GloVe: 45.22%


In [7]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Linear-kernel support vector classifier on the combined feature matrix
SVM = SVC(kernel='linear').fit(x_train_combined, y_train_encoded)
y_pred_SVM = SVM.predict(x_test_combined)

# Test accuracy as a percentage with two decimals
print(f"SVM & GloVe: {accuracy_score(y_test_encoded, y_pred_SVM) * 100 :.2f}%")

SVM & GloVe: 45.22%


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forests are stochastic; the seed is fixed (same value as the one
# passed to train_test_split) so re-running the notebook reproduces the score.
RF = RandomForestClassifier(random_state=42)
RF.fit(x_train_combined, y_train_encoded)

y_pred_RF = RF.predict(x_test_combined)

# Test accuracy as a percentage with two decimals
print(f"Random Forest & GloVe: {accuracy_score(y_test_encoded, y_pred_RF) * 100 :.2f}%")

Random Forest & GloVe: 45.22%


In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Fit on the same (unclipped) feature matrix that is used for prediction.
# Previously the training data was clipped to non-negative values while the
# test data was not, so the model was scored on differently-preprocessed
# inputs; tree ensembles do not need non-negative features. The seed is fixed
# (same value as the train/test split) so repeated runs give the same model.
GB = GradientBoostingClassifier(random_state=42)
GB.fit(x_train_combined, y_train_encoded)

y_pred_GB = GB.predict(x_test_combined)

# NOTE: the original author flagged this cell as slow ("taking too much time").
print(f"Gradient Boosting Classsifier & GloVe: {accuracy_score(y_test_encoded, y_pred_GB) * 100 :.2f}%")

Gradient Boosting Classsifier & GloVe: 45.22%


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# The seed is fixed (same value as the train/test split) so the fitted tree,
# and therefore the reported accuracy, is identical across re-runs.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(x_train_combined, y_train_encoded)

y_pred_DT = DT.predict(x_test_combined)

# NOTE(review): this line previously carried an inline "81.77%" annotation
# whose provenance is not recorded; confirm which run it refers to or drop it.
print(f"Decision Tree Classsifier & GloVe: {accuracy_score(y_test_encoded, y_pred_DT) * 100 :.2f}%")

Decision Tree Classsifier & GloVe: 45.22%


In [11]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with the library's default settings
KNN = KNeighborsClassifier().fit(x_train_combined, y_train_encoded)
y_pred_KNN = KNN.predict(x_test_combined)

# NOTE(review): this line previously carried an inline "76.26%" annotation
# whose provenance is not recorded; confirm which run it refers to or drop it.
print(f"K-Nearest Neighbors Classifier & GloVe: {accuracy_score(y_test_encoded, y_pred_KNN) * 100 :.2f}%")

K-Nearest Neighbors Classifier & GloVe: 54.78%


In [12]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# The seed is fixed (same value as the train/test split) so the boosted
# ensemble is reproducible across re-runs.
ADA = AdaBoostClassifier(random_state=42)
ADA.fit(x_train_combined, y_train_encoded)

y_pred_ADA = ADA.predict(x_test_combined)

# NOTE(review): this line previously carried an inline "47.00%" annotation
# whose provenance is not recorded; confirm which run it refers to or drop it.
print(f"AdaBoost Classifier & GloVe: {accuracy_score(y_test_encoded, y_pred_ADA) * 100 :.2f}%")

AdaBoost Classifier & GloVe: 45.22%


In [13]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings
LR = LogisticRegression().fit(x_train_combined, y_train_encoded)
y_pred_LR = LR.predict(x_test_combined)

# NOTE(review): this line previously carried an inline "94.54%" annotation
# whose provenance is not recorded; confirm which run it refers to or drop it.
print(f"Logistic Regression & GloVe: {accuracy_score(y_test_encoded, y_pred_LR) * 100 :.2f}%")

Logistic Regression & GloVe: 45.22%


In [14]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

# Extra-trees ensembles are stochastic; the seed is fixed (same value as the
# train/test split) so re-running the notebook reproduces the score.
ET = ExtraTreesClassifier(random_state=42)
ET.fit(x_train_combined, y_train_encoded)

y_pred_ET = ET.predict(x_test_combined)

# NOTE(review): this line previously carried an inline "92.21%" annotation
# whose provenance is not recorded; confirm which run it refers to or drop it.
print(f"Extra Trees Classifier & GloVe: {accuracy_score(y_test_encoded, y_pred_ET) * 100 :.2f}%")

Extra Trees Classifier & GloVe: 45.22%


In [15]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import accuracy_score

# Fit on the same (unclipped) feature matrix that is used for prediction.
# Previously the training data was clipped to non-negative values while the
# test data was not, so the model was scored on differently-preprocessed
# inputs.
GPR = GaussianProcessRegressor()
GPR.fit(x_train_combined, y_train_encoded)

y_pred_GPR = GPR.predict(x_test_combined)

# This is a regressor, so its continuous outputs are rounded before being
# compared with the integer-encoded labels.
# NOTE: the original author flagged this cell as slow ("taking too much time").
print(f"Gaussian Process Regressor & GloVe: {accuracy_score(y_test_encoded, y_pred_GPR.round()) * 100 :.2f}%")

Gaussian Process Regressor & GloVe: 45.22%


In [16]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import RidgeClassifier

# Ridge classifier with the library's default settings
RR = RidgeClassifier().fit(x_train_combined, y_train_encoded)
y_pred_RR = RR.predict(x_test_combined)

# NOTE(review): this line previously carried an inline "93.23" annotation
# whose provenance is not recorded; confirm which run it refers to or drop it.
print(f"Ridge Classifier & GloVe: {accuracy_score(y_test_encoded, y_pred_RR) * 100:.2f}%")

Ridge Classifier & GloVe: 45.22%


In [17]:
from sklearn.linear_model import ElasticNet

# Elastic-net regression fitted against the integer-encoded labels
EN = ElasticNet().fit(x_train_combined, y_train_encoded)

# The regressor returns continuous values; round them so they can be compared
# with the encoded class labels
y_pred_EN = EN.predict(x_test_combined).round()

# NOTE(review): this line previously carried an inline "5.85%" annotation
# whose provenance is not recorded; confirm which run it refers to or drop it.
# accuracy_score is expected to be in scope from an earlier cell.
print(f"Elastic Net Regression & GloVe: {accuracy_score(y_test_encoded, y_pred_EN) * 100 :.2f}%")

Elastic Net Regression & GloVe: 45.22%


In [18]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Neural-network training is stochastic; the seed is fixed (same value as the
# train/test split) so re-running the notebook reproduces the score.
MLP = MLPClassifier(random_state=42)
MLP.fit(x_train_combined, y_train_encoded)

y_pred_MLP = MLP.predict(x_test_combined)

# NOTE(review): this line previously carried an inline "95.26%" annotation
# whose provenance is not recorded; confirm which run it refers to or drop it.
print(f"Multilayer Perceptron & GloVe: {accuracy_score(y_test_encoded, y_pred_MLP) * 100 :.2f}%")

Multilayer Perceptron & GloVe: 45.22%


In [19]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Lasso

# Lasso regression fitted against the integer-encoded labels
LASSO = Lasso().fit(x_train_combined, y_train_encoded)

# The regressor returns continuous values; each one is passed through Python's
# round() so the predictions can be compared with the encoded class labels
y_pred_LASSO = list(map(round, LASSO.predict(x_test_combined)))

# NOTE(review): this line previously carried an inline "6.53%" annotation
# whose provenance is not recorded; confirm which run it refers to or drop it.
print(f"Lasso Regression & GloVe: {accuracy_score(y_test_encoded, y_pred_LASSO) * 100 :.2f}%")

Lasso Regression & GloVe: 45.22%


In [20]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Gradient-boosted trees from XGBoost with the library's default settings
XGB = xgb.XGBClassifier().fit(x_train_combined, y_train_encoded)
y_pred_XGB = XGB.predict(x_test_combined)

# Test accuracy as a percentage with two decimals
print(f"XG Boost & GloVe: {accuracy_score(y_test_encoded, y_pred_XGB) * 100 :.2f}%")

XG Boost & GloVe: 45.22%
