# SVM W2V

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from gensim.models import Word2Vec
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# Read the labelled training corpus. The code below depends on the CSV having
# a 'Comment' column (raw text) and a 'Label' column (class label).
data = pd.read_csv('/content/modified_hatespeech.csv')
comments, labels = data['Comment'], data['Label']

# Tokenise each comment: lowercase it, then split on whitespace.
# Punctuation and stop words are left untouched.
tokenized_comments = list(map(lambda text: text.lower().split(), comments))

# Fit a fresh Word2Vec model on the tokenised comments (every row of the CSV,
# i.e. before any train/test split is made).
word2vec_model = Word2Vec(
    sentences=tokenized_comments,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
# Function to transform a sentence into a mean vector
def comment_to_mean_vector(comment):
    """Return the mean Word2Vec embedding of a comment.

    The comment is lowercased and split on whitespace; tokens missing from the
    vocabulary of the module-level ``word2vec_model`` are skipped.

    Args:
        comment: Raw comment text (must support ``.lower()``).

    Returns:
        A 1-D array of length ``word2vec_model.vector_size``: the element-wise
        mean of the known tokens' vectors, or all zeros when no token is known.
    """
    known_vectors = []
    for token in comment.lower().split():
        if token in word2vec_model.wv:
            known_vectors.append(word2vec_model.wv[token])
    if not known_vectors:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every comment as the mean of its word vectors -> one feature row each.
X = np.array([comment_to_mean_vector(comment) for comment in comments])

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=42)

# Standardise the embedding features, then fit a linear-kernel SVM on them.
svm_classifier = make_pipeline(StandardScaler(), SVC(kernel='linear'))
svm_classifier.fit(X_train, y_train)

# Predict on the held-out split. The metrics below need `predictions` to be
# defined in this cell; otherwise a fresh kernel raises NameError and a used
# kernel would silently score whatever `predictions` another cell left behind.
predictions = svm_classifier.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy Score:", accuracy)

# Per-class precision / recall / F1.
report = classification_report(y_test, predictions)
print("\nClassification Report:\n", report)
print("Model trained successfully!")


Accuracy Score: 0.2830188679245283

Classification Report:
               precision    recall  f1-score   support

 BodyShaming       0.00      0.00      0.00         5
    Misogyny       0.37      0.37      0.37        19
  RapeThreat       0.32      0.35      0.33        17
      Sexism       0.15      0.17      0.16        12

    accuracy                           0.28        53
   macro avg       0.21      0.22      0.22        53
weighted avg       0.27      0.28      0.28        53

Model trained successfully!


In [None]:
import pickle

# Pickle both fitted artefacts into the current working directory.
# NOTE(review): the W2V RF section further down also pickles its own Word2Vec
# model to 'word2vec_model.pkl', so running it replaces the file written here.
artefacts = (
    ('word2vec_model.pkl', word2vec_model),
    ('svm_classifier.pkl', svm_classifier),
)
for pkl_path, fitted in artefacts:
    with open(pkl_path, 'wb') as file:
        pickle.dump(fitted, file)

print("Models saved successfully using pickle!")


Models saved successfully using pickle!


In [None]:
import pandas as pd
import numpy as np
from joblib import load
from gensim.models import Word2Vec

# Load the Word2Vec model and the SVM pipeline from the same relative paths the
# save cell wrote them to, so the round trip does not depend on the working
# directory happening to be /content.
# NOTE: unpickling executes code embedded in the file -- only load files that
# this notebook produced.
word2vec_model = load('word2vec_model.pkl')
svm_classifier = load('svm_classifier.pkl')

# Comments to classify; only the 'Comment' column is read.
test_data = pd.read_csv('/content/df.csv')
test_comments = test_data['Comment']

# Function to transform comments into mean vectors
def comment_to_mean_vector(comment, word2vec_model):
    """Return the mean Word2Vec embedding of a comment.

    The comment is lowercased and split on whitespace; tokens that are not in
    ``word2vec_model.wv`` are skipped.

    Args:
        comment: Raw comment text. Non-string values (e.g. the float NaN that
            pandas produces for an empty CSV cell, or None) are treated as a
            comment with no tokens instead of raising AttributeError.
        word2vec_model: Object exposing ``wv`` (supports ``in`` and ``[]`` by
            token) and ``vector_size``.

    Returns:
        A 1-D array of length ``word2vec_model.vector_size``: the element-wise
        mean of the known tokens' vectors, or all zeros when there are none.
    """
    # Missing/blank cells arrive as non-strings; they carry no tokens.
    if not isinstance(comment, str):
        return np.zeros(word2vec_model.vector_size)

    keyed_vectors = word2vec_model.wv
    word_vectors = [keyed_vectors[word] for word in comment.lower().split() if word in keyed_vectors]
    if not word_vectors:
        # Nothing in vocabulary: fall back to the zero vector so the feature
        # matrix keeps a fixed width.
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word_vectors, axis=0)

# Embed each test comment with the loaded Word2Vec model -> feature matrix.
X_test = np.array(
    list(map(lambda text: comment_to_mean_vector(text, word2vec_model), test_comments))
)

# Classify the embedded comments with the loaded SVM pipeline.
predictions = svm_classifier.predict(X_test)

# Show the predicted label of every test row.
print("Predictions for the test data:", predictions)


Predictions for the test data: ['Misogyny' 'Sexism' 'Sexism' 'Misogyny' 'Sexism' 'Sexism' 'Sexism'
 'Misogyny' 'Misogyny' 'Misogyny' 'Sexism' 'Misogyny' 'Misogyny'
 'Misogyny' 'Sexism' 'Sexism' 'BodyShaming' 'Sexism' 'Sexism' 'Misogyny'
 'RapeThreat' 'Sexism' 'Misogyny' 'Misogyny' 'Sexism' 'Misogyny' 'Sexism'
 'Misogyny' 'Sexism' 'Misogyny' 'Sexism' 'Misogyny' 'Misogyny' 'Sexism'
 'Misogyny' 'Sexism' 'Misogyny' 'RapeThreat' 'Misogyny' 'Misogyny'
 'BodyShaming' 'RapeThreat' 'BodyShaming' 'Misogyny' 'Misogyny' 'Misogyny'
 'Misogyny' 'BodyShaming' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny'
 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny'
 'Misogyny' 'Misogyny' 'RapeThreat' 'Misogyny' 'Misogyny' 'RapeThreat'
 'Misogyny' 'Sexism' 'Misogyny' 'BodyShaming' 'Misogyny' 'RapeThreat'
 'Misogyny' 'Misogyny' 'RapeThreat' 'BodyShaming' 'Misogyny' 'Misogyny'
 'Misogyny' 'Sexism' 'Misogyny' 'Misogyny' 'BodyShaming' 'Misogyny'
 'Misogyny' 'Misogyny' 'RapeThreat' 'Misogyny' 'Misogyny'

In [None]:
import pandas as pd

# Attach the predicted label of every row as a 'Predicted_Labels' column.
# Relies on `test_data` and `predictions` produced by the prediction cell.
test_data['Predicted_Labels'] = predictions

# Export the labelled frame; index=False leaves the row index out of the file.
test_data.to_csv(path_or_buf='labeled_test_data.csv', index=False)

print("The test data with predicted labels has been saved to 'labeled_test_data.csv'.")


The test data with predicted labels has been saved to 'labeled_test_data.csv'.


# W2V RF

In [None]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

# Read the labelled training corpus ('Comment' holds the text, 'Label' the class).
train_data = pd.read_csv('/content/modified_hatespeech.csv')

# Tokenise by lowercasing and splitting on whitespace; the token lists are kept
# next to the raw text in a new 'processed_comments' column.
train_data['processed_comments'] = train_data['Comment'].map(
    lambda text: text.lower().split()
)

# Fit Word2Vec embeddings on the token lists of every row.
w2v_model = Word2Vec(
    sentences=train_data['processed_comments'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Function to transform each comment into a vector by averaging its word vectors
def document_vector(doc):
    """Average the Word2Vec vectors of the tokens in ``doc``.

    Args:
        doc: Iterable of tokens.

    Returns:
        The element-wise mean of the vectors of the tokens found in the
        vocabulary of the module-level ``w2v_model``; an all-zero vector of
        length ``w2v_model.vector_size`` when none of the tokens is known.
    """
    vocabulary = w2v_model.wv.key_to_index
    known_vectors = []
    for token in doc:
        if token in vocabulary:
            known_vectors.append(w2v_model.wv[token])
    if not known_vectors:
        return np.zeros(w2v_model.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged embedding per row, stored in a 'doc_vector' column.
train_data['doc_vector'] = train_data['processed_comments'].map(document_vector)

# Features: list of per-document vectors. Target: the 'Label' column.
X = [vector for vector in train_data['doc_vector']]
y = train_data['Label']

# Hold out 25% of the rows for validation (fixed seed for a stable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit a 100-tree random forest on the training split.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Score the held-out split and print per-class precision / recall / F1.
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

 BodyShaming       0.00      0.00      0.00         5
    Misogyny       0.35      0.37      0.36        19
  RapeThreat       0.37      0.59      0.45        17
      Sexism       0.17      0.08      0.11        12

    accuracy                           0.34        53
   macro avg       0.22      0.26      0.23        53
weighted avg       0.28      0.34      0.30        53



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pickle

# Persist the fitted random forest so the prediction cell can reload it.
with open('random_forest_model.pkl', 'wb') as rf_out:
    pickle.dump(clf, rf_out)

# Persist the Word2Vec model the forest was trained with.
# NOTE(review): 'word2vec_model.pkl' is the same filename the SVM W2V section
# pickled its own Word2Vec model to, so this write replaces that file.
with open('word2vec_model.pkl', 'wb') as w2v_out:
    pickle.dump(w2v_model, w2v_out)


In [None]:
import pandas as pd
import numpy as np
import pickle

# Restore the random forest and its Word2Vec model from the pickles written by
# the save cell.
with open('random_forest_model.pkl', 'rb') as rf_in:
    clf = pickle.load(rf_in)

with open('word2vec_model.pkl', 'rb') as w2v_in:
    w2v_model = pickle.load(w2v_in)

# Comments to classify.
test_data = pd.read_csv('/content/df.csv')

# Same tokenisation as at training time: lowercase, then whitespace split.
test_data['processed_comments'] = test_data['Comment'].map(
    lambda text: text.lower().split()
)

# Convert comments in the test set to vectors
def document_vector(doc):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of ``doc``.

    ``doc`` is an iterable of tokens; tokens that are not keys of
    ``w2v_model.wv.key_to_index`` (module-level model) are dropped. When no
    token survives, a zero vector of length ``w2v_model.vector_size`` is
    returned instead.
    """
    in_vocab = filter(w2v_model.wv.key_to_index.__contains__, doc)
    vectors = [w2v_model.wv[token] for token in in_vocab]
    return np.mean(vectors, axis=0) if vectors else np.zeros(w2v_model.vector_size)

# One averaged embedding per test comment.
test_data['doc_vector'] = test_data['processed_comments'].map(document_vector)

# The classifier takes a list of per-document vectors.
X_test = [vector for vector in test_data['doc_vector']]

# Predicted label for every test row.
predictions = clf.predict(X_test)

# The test file carries no ground-truth labels, so just show the predictions.
print(predictions)


['Sexism' 'Sexism' 'Misogyny' 'RapeThreat' 'Sexism' 'Sexism' 'Sexism'
 'Misogyny' 'RapeThreat' 'RapeThreat' 'Sexism' 'Misogyny' 'RapeThreat'
 'RapeThreat' 'Sexism' 'Sexism' 'Misogyny' 'Sexism' 'Sexism' 'RapeThreat'
 'Misogyny' 'Sexism' 'Misogyny' 'RapeThreat' 'Sexism' 'RapeThreat'
 'Sexism' 'RapeThreat' 'Misogyny' 'RapeThreat' 'Misogyny' 'RapeThreat'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'Misogyny' 'RapeThreat'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'Misogyny' 'RapeThreat' 'Misogyny'
 'Misogyny' 'RapeThreat' 'RapeThreat' 'RapeThreat' 'Misogyny' 'Misogyny'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'RapeThreat' 'RapeThreat'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'Misogyny' 'RapeThreat'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'RapeThreat' 'RapeThreat'
 'Misogyny' 'Misogyny' 'RapeThreat' 'Misogyny' 'RapeThreat' 'Sexism'
 'RapeThreat' 'Misogyny' 'RapeThreat' 'Sexism' 'RapeThreat' 'RapeThreat'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'Misogyny' 'Misogyny' 'RapeThreat'
 'Misogyny' 'RapeThrea

In [None]:
import pandas as pd

# Single source of truth for the output file, so the confirmation message
# always names the file that is actually written.
output_path = 'w2v_rf.csv'

# Attach the predicted label of every row as a 'Predicted_Labels' column.
# Relies on `test_data` and `predictions` produced by the prediction cell.
test_data['Predicted_Labels'] = predictions

# Export the labelled frame without the row index.
test_data.to_csv(output_path, index=False)

print(f"The test data with predicted labels has been saved to '{output_path}'.")


The test data with predicted labels has been saved to 'labeled_test_data.csv'.


# TFIDF SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Read the labelled training corpus: 'Comment' holds the text, 'Label' the class.
data = pd.read_csv('/content/modified_hatespeech.csv')
comments, labels = data['Comment'], data['Label']

# Hold out 25% of the raw comments for evaluation (fixed seed for a stable split).
X_train, X_test, y_train, y_test = train_test_split(
    comments, labels, test_size=0.25, random_state=42
)

# TF-IDF features: lowercased text, English stop words removed, vocabulary
# capped at 1000 terms.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=1000,
)

# Vectoriser + linear-kernel SVM in one pipeline, so the TF-IDF step is fitted
# on the training split only.
svm_pipeline = make_pipeline(tfidf_vectorizer, SVC(kernel='linear'))
svm_pipeline.fit(X_train, y_train)

# Score the held-out split and print per-class precision / recall / F1.
predictions = svm_pipeline.predict(X_test)
print(classification_report(y_test, predictions))

print("SVM model trained successfully!")


              precision    recall  f1-score   support

 BodyShaming       0.00      0.00      0.00         5
    Misogyny       0.46      0.32      0.37        19
  RapeThreat       0.47      0.82      0.60        17
      Sexism       0.50      0.42      0.45        12

    accuracy                           0.47        53
   macro avg       0.36      0.39      0.36        53
weighted avg       0.43      0.47      0.43        53

SVM model trained successfully!


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pickle

# Pickle the whole fitted pipeline (TF-IDF vectoriser + SVM) as one file.
with open('svm_pipeline.pkl', 'wb') as pkl_out:
    pickle.dump(svm_pipeline, pkl_out)

print("Model saved successfully using pickle!")


Model saved successfully using pickle!


In [None]:
# Round-trip check: read the pickled pipeline back into `loaded_svm_pipeline`.
# Uses the `pickle` module imported by the save cell.
with open('svm_pipeline.pkl', 'rb') as pkl_in:
    loaded_svm_pipeline = pickle.load(pkl_in)

In [None]:
import pandas as pd
import numpy as np
from joblib import load

# Load the fitted TF-IDF + SVM pipeline from the same relative path the save
# cell wrote it to, so the round trip does not depend on the working directory
# happening to be /content.
# NOTE: unpickling executes code embedded in the file -- only load files that
# this notebook produced.
svm_pipeline = load('svm_pipeline.pkl')

# Comments to classify; only the 'Comment' column is read.
test_data = pd.read_csv('/content/df.csv')
test_comments = test_data['Comment']

# The pipeline vectorises the raw text itself, so the comments go in as-is.
predictions = svm_pipeline.predict(test_comments)

# Show the predicted label of every test row.
print("Predictions for the test data:", predictions)


Predictions for the test data: ['Misogyny' 'Sexism' 'Sexism' 'RapeThreat' 'Sexism' 'Sexism' 'Sexism'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'Sexism' 'Misogyny' 'RapeThreat'
 'RapeThreat' 'Sexism' 'Sexism' 'RapeThreat' 'Sexism' 'Sexism'
 'RapeThreat' 'RapeThreat' 'Sexism' 'RapeThreat' 'RapeThreat' 'Sexism'
 'RapeThreat' 'Sexism' 'RapeThreat' 'RapeThreat' 'RapeThreat' 'Sexism'
 'RapeThreat' 'RapeThreat' 'Sexism' 'Misogyny' 'Sexism' 'RapeThreat'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'Sexism' 'RapeThreat' 'RapeThreat'
 'Misogyny' 'RapeThreat' 'RapeThreat' 'Misogyny' 'RapeThreat' 'Misogyny'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'RapeThreat' 'RapeThreat'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'Misogyny' 'RapeThreat'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'RapeThreat' 'Sexism' 'Misogyny'
 'Sexism' 'RapeThreat' 'RapeThreat' 'RapeThreat' 'RapeThreat' 'RapeThreat'
 'Misogyny' 'Sexism' 'RapeThreat' 'RapeThreat' 'RapeThreat' 'RapeThreat'
 'RapeThreat' 'RapeThreat' 'RapeThreat' 'BodyShaming'

In [None]:
import pandas as pd

# Single source of truth for the output file, so the confirmation message
# always names the file that is actually written.
output_path = 'svm_tfidf1.csv'

# Attach the predicted label of every row as a 'Predicted_Labels' column.
# Relies on `test_data` and `predictions` produced by the prediction cell.
test_data['Predicted_Labels'] = predictions

# Export the labelled frame without the row index.
test_data.to_csv(output_path, index=False)

print(f"The test data with predicted labels has been saved to '{output_path}'.")


The test data with predicted labels has been saved to 'labeled_test_data.csv'.


# TFIDF RF

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Read the labelled training corpus: 'Comment' holds the text, 'Label' the class.
data = pd.read_csv('/content/modified_hatespeech.csv')
comments, labels = data['Comment'], data['Label']

# Hold out 25% of the raw comments for evaluation (fixed seed for a stable split).
X_train, X_test, y_train, y_test = train_test_split(
    comments, labels, test_size=0.25, random_state=42
)

# TF-IDF features: lowercased text, English stop words removed, vocabulary
# capped at 1000 terms.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=1000,
)

# Vectoriser + 100-tree random forest in one pipeline, so the TF-IDF step is
# fitted on the training split only.
rf_pipeline = make_pipeline(
    tfidf_vectorizer,
    RandomForestClassifier(n_estimators=100, random_state=42),
)
rf_pipeline.fit(X_train, y_train)

# Score the held-out split and print per-class precision / recall / F1.
predictions = rf_pipeline.predict(X_test)
print(classification_report(y_test, predictions))

print("Random Forest model trained successfully!")


              precision    recall  f1-score   support

 BodyShaming       0.00      0.00      0.00         5
    Misogyny       0.41      0.68      0.51        19
  RapeThreat       0.64      0.41      0.50        17
      Sexism       0.60      0.50      0.55        12

    accuracy                           0.49        53
   macro avg       0.41      0.40      0.39        53
weighted avg       0.49      0.49      0.47        53

Random Forest model trained successfully!


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pickle

# Pickle the whole fitted pipeline (TF-IDF vectoriser + random forest) as one file.
with open('random_forest_pipeline.pkl', 'wb') as pkl_out:
    pickle.dump(rf_pipeline, pkl_out)

print("Model saved successfully using pickle!")


Model saved successfully using pickle!


In [None]:
import pandas as pd
import numpy as np
import pickle

# Load the fitted TF-IDF + random-forest pipeline from the same relative path
# the save cell wrote it to, so the round trip does not depend on the working
# directory happening to be /content.
# NOTE: unpickling executes code embedded in the file -- only load files that
# this notebook produced.
with open('random_forest_pipeline.pkl', 'rb') as file:
    rf_pipeline = pickle.load(file)

# Comments to classify; only the 'Comment' column is read.
test_data = pd.read_csv('/content/df.csv')
test_comments = test_data['Comment']

# The pipeline vectorises the raw text itself, so the comments go in as-is.
predictions = rf_pipeline.predict(test_comments)

# Show the predicted label of every test row.
print("Predictions for the test data:", predictions)


Predictions for the test data: ['Misogyny' 'Sexism' 'Sexism' 'Misogyny' 'Sexism' 'Sexism' 'Sexism'
 'Misogyny' 'Misogyny' 'Misogyny' 'Sexism' 'Misogyny' 'Misogyny'
 'Misogyny' 'Sexism' 'Sexism' 'RapeThreat' 'Sexism' 'Sexism' 'Misogyny'
 'Misogyny' 'Sexism' 'Misogyny' 'Misogyny' 'Sexism' 'Misogyny' 'Sexism'
 'Misogyny' 'RapeThreat' 'Misogyny' 'Sexism' 'Misogyny' 'Misogyny'
 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny'
 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny'
 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny'
 'Misogyny' 'Misogyny' 'RapeThreat' 'Misogyny' 'Misogyny' 'Misogyny'
 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny'
 'Sexism' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Sexism'
 'Misogyny' 'Misogyny' 'Sexism' 'RapeThreat' 'Misogyny' 'Misogyny'
 'Misogyny' 'RapeThreat' 'Misogyny' 'Sexism' 'BodyShaming' 'Misogyny'
 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny' 'Misogyny'
 'Misogyny

In [None]:
import pandas as pd

# Single source of truth for the output file, so the confirmation message
# always names the file that is actually written.
output_path = 'rf_tfidf.csv'

# Attach the predicted label of every row as a 'Predicted_Labels' column.
# Relies on `test_data` and `predictions` produced by the prediction cell.
test_data['Predicted_Labels'] = predictions

# Export the labelled frame without the row index.
test_data.to_csv(output_path, index=False)

print(f"The test data with predicted labels has been saved to '{output_path}'.")


The test data with predicted labels has been saved to 'labeled_test_data.csv'.
