In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer


In [3]:
# Read the training and evaluation splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [4]:
# Module-level NLP helpers, constructed once and looked up by preprocess_text.
# NOTE(review): WordNet is an English lexicon — confirm the lemmatizer has any
# effect on Arabic stems.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("arabic")

# Define a function for text preprocessing
def preprocess_text(text):
    """Tokenize a comment, then stem and lemmatize each token.

    Parameters
    ----------
    text : str or missing value
        Raw comment text. ``None`` / NaN (what pandas produces for an empty
        CSV cell) yields an empty token list instead of raising; any other
        non-string value is converted with ``str`` before tokenizing.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    if not isinstance(text, str):
        # word_tokenize only accepts strings; a single missing cell would
        # otherwise abort the whole Series.apply call.
        if text is None or pd.isna(text):
            return []
        text = str(text)

    tokens = nltk.word_tokenize(text)

    # Stem first, then lemmatize the stem (same order as before, done per token).
    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]

# Turn every raw comment into a list of processed tokens (one list per row).
train_comments = train["comment"].apply(preprocess_text)
test_comments = test["comment"].apply(preprocess_text)

In [5]:
# Fit Word2Vec embeddings on the tokenized training comments only, so that
# train and test comments are later embedded with the same vectors.
# NOTE(review): gensim training with workers > 1 is not bit-for-bit
# reproducible between runs — confirm that is acceptable here.
word2vec_model = Word2Vec(
    sentences=train_comments,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep every token, even those seen only once
    sg=1,             # 1 selects the skip-gram training algorithm
    workers=4,
)


In [6]:
def get_text_representation(comment):
    """Embed a tokenized comment as the mean of its known word vectors.

    Parameters
    ----------
    comment : iterable of str
        Tokens of one comment.

    Returns
    -------
    numpy.ndarray
        Array of length ``word2vec_model.vector_size``. Tokens missing from
        the model vocabulary are skipped; if no token is known the result is
        the all-zero vector.
    """
    keyed_vectors = word2vec_model.wv
    known_tokens = [token for token in comment if token in keyed_vectors]

    embedding = np.zeros(word2vec_model.vector_size)
    if not known_tokens:
        return embedding

    # Accumulate in place, token by token, then divide by the token count.
    for token in known_tokens:
        embedding += keyed_vectors[token]
    embedding /= len(known_tokens)

    return embedding


In [7]:
# Embed every comment; stacking the per-comment vectors gives one row per comment.
train_representations = np.array(list(map(get_text_representation, train_comments)))
test_representations = np.array(list(map(get_text_representation, test_comments)))

# Feature matrices and target labels consumed by the classifiers.
X_train, X_test = train_representations, test_representations
y_train, y_test = train["label"], test["label"]

In [1]:

# Random forest with 200 trees; fit() returns the estimator, so build and train in one step.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train, y_train)

# Predict the test labels and print the per-class precision / recall / F1 report.
predictions = rf_classifier.predict(X_test)
print(classification_report(y_test, predictions))

In [8]:
# Linear-kernel support vector classifier; fit() returns the estimator itself.
svc_classifier = SVC(C=1.0, kernel="linear", random_state=42).fit(X_train, y_train)

# Predict the test labels and print the per-class precision / recall / F1 report.
predictions = svc_classifier.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.59      0.56      0.58      1270
           1       0.66      0.68      0.67      1561

    accuracy                           0.63      2831
   macro avg       0.62      0.62      0.62      2831
weighted avg       0.63      0.63      0.63      2831



In [3]:
# Logistic-regression classifier trained with stochastic gradient descent.
# The loss is spelled "log_loss": scikit-learn deprecated the old name "log"
# in 1.1 and removed it in 1.3, where it raises an invalid-parameter error.
sgd_classifier = SGDClassifier(loss="log_loss", alpha=0.0001, max_iter=1000, random_state=42)
sgd_classifier.fit(X_train, y_train)

# Predict on test set
predictions = sgd_classifier.predict(X_test)

# Evaluate model: per-class precision / recall / F1
print(classification_report(y_test, predictions))