# 1.Loading Dataset And Data manipulation

In [4]:
import pandas as pd

In [5]:
csv_file_path='offensive.csv'

In [6]:
df=pd.read_csv(csv_file_path)

In [7]:
print('\n 10 rows of dataset')
print(df.head(5))#returns first five rows of datset


 10 rows of dataset
                                              tweet          class
0                              prayer brings peace!  not_offensive
1                 You’re a subhuman piece of filth!      offensive
2  i respect your opinion, though i don’t share it!  not_offensive
3           i’ll make you wish you were never born.      offensive
4                      god’s love is unconditional!  not_offensive


In [8]:
df.info()#returns the info of datset its total no index and columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6942 entries, 0 to 6941
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   6942 non-null   object
 1   class   6940 non-null   object
dtypes: object(2)
memory usage: 108.6+ KB


In [9]:
print(df.columns.to_list())#name of columns

['tweet', 'class']


In [None]:
content_label='tweet'
category_label='class'

print(df[category_label].value_counts())

blog_contents = df[content_label].tolist()
blog_category = df[category_label].tolist()

print(f"\nLoaded {len(blog_contents)} tweets and {len(blog_category)} labels.")



class
offensive         4113
not_offensive     2823
 not_offensive       3
label                1
Name: count, dtype: int64

Loaded 6942 blog contents and 6942 labels.


In [11]:
missing_values_count = df.isnull().sum()#checks the empty column
print("\nNumber of missing values per column:")
print(missing_values_count)


Number of missing values per column:
tweet    0
class    2
dtype: int64


In [12]:
print('Shape of the dataframe before dropping missing values', df.shape)
#removes the missing values NaN or null
# df_subset will only drop rows where 'A' column has a NaN value,inplace=True, the operation modifies the original DataFrame directly instead of returning a new one.
df.dropna(subset=['tweet'], inplace=True)
df.dropna(subset=['class'], inplace=True)
values_after = df.isnull().sum()
print(values_after)
print('Shape of the dataframe after dropping missing values', df.shape)

Shape of the dataframe before dropping missing values (6942, 2)
tweet    0
class    0
dtype: int64
Shape of the dataframe after dropping missing values (6940, 2)


In [13]:
print(df.info)

<bound method DataFrame.info of                                                  tweet          class
0                                 prayer brings peace!  not_offensive
1                    You’re a subhuman piece of filth!      offensive
2     i respect your opinion, though i don’t share it!  not_offensive
3              i’ll make you wish you were never born.      offensive
4                         god’s love is unconditional!  not_offensive
...                                                ...            ...
6937      I have no strong negative feelings about you  not_offensive
6938                                I'm not your enemy  not_offensive
6939           I don't feel any resentment towards you  not_offensive
6940        My emotions aren't those of hatred for you  not_offensive
6941              I'm not holding anything against you  not_offensive

[6940 rows x 2 columns]>


In [14]:
print(df['tweet'].head(10))

0                                prayer brings peace!
1                   You’re a subhuman piece of filth!
2    i respect your opinion, though i don’t share it!
3             i’ll make you wish you were never born.
4                        god’s love is unconditional!
5             People like you should be exterminated!
6                 you’re a vile, disgusting creature.
7                        god’s love is unconditional.
8                                I’ll ruin your life.
9    I’m not a fan, but I understand why you like it!
Name: tweet, dtype: object


### Test Train Split

In [15]:

random_state=42
test_size=0.2

df_shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
split_idx = int(len(df_shuffled) * (1 - test_size))
    
train_df = df_shuffled.iloc[:split_idx]
test_df = df_shuffled.iloc[split_idx:]

X_train_raw = train_df['tweet'].tolist()
y_train_str = train_df['class'].tolist()
X_test_raw = test_df['tweet'].tolist()
y_test_str = test_df['class'].tolist()

print(f"Training set size: {len(X_train_raw)} documents")
print(f"Testing set size: {len(X_test_raw)} documents")

Training set size: 5552 documents
Testing set size: 1388 documents


# 2. Logistic Regression Implementation from Scratch

In [None]:
import numpy as np
import re
import string

def handle_negations(words):
    """
    Finds negation words and prefixes the following word with 'not_'.
    """
    negation_words = {"not", "no", "n't", "never"}
    processed_words = []
    i = 0
    while i < len(words):
        word = words[i]
        if word in negation_words or re.search(r'\w+n\'t$', word):
            if i + 1 < len(words):
                processed_words.append('not_' + words[i+1])
                i += 2
            else:
                processed_words.append(word)
                i += 1
        else:
            processed_words.append(word)
            i += 1
    return processed_words

def custom_tfidf_vectorizer(tweets):
    """
    Converts a list of text documents to a TF-IDF matrix with negation handling.
    """
    # 1. Build vocabulary and get term frequencies
    vocabulary = {}
    doc_word_counts = []
    punctuation_to_remove = string.punctuation.replace("'", "")
    for tweet in tweets:
        processed_text = tweet.lower().translate(str.maketrans('', '', punctuation_to_remove))
        words = handle_negations(processed_text.split())
        doc_counts = {}
        for word in words:
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)
            doc_counts[word] = doc_counts.get(word, 0) + 1
        doc_word_counts.append(doc_counts)

    # 2. Calculate Inverse Document Frequency (IDF)
    num_documents = len(tweets)
    idf = {}
    for word, idx in vocabulary.items():
        doc_count = sum(1 for doc in doc_word_counts if word in doc)
        # Use a smoothed version to avoid division by zero
        idf[word] = np.log((1 + num_documents) / (1 + doc_count)) + 1

    # 3. Create TF-IDF feature matrix
    X = np.zeros((num_documents, len(vocabulary)))
    for i, doc_counts in enumerate(doc_word_counts):
        for word, count in doc_counts.items():
            tf = count / sum(doc_counts.values())
            tfidf = tf * idf[word]
            X[i, vocabulary[word]] = tfidf

    return X, vocabulary, idf

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def train_logistic_regression(X, y, learning_rate=0.1, n_iterations=1000):
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0

    # Gradient descent
    for i in range(n_iterations):
        # Calculate linear model and predictions
        linear_model = np.dot(X, w) + b
        y_predicted = sigmoid(linear_model)

        # Compute gradients
        dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
        db = (1 / n_samples) * np.sum(y_predicted - y)

        # Update parameters
        w -= learning_rate * dw
        b -= learning_rate * db

        # Calculate and display the loss every 100 iterations
        if i % 100 == 0:
            # Binary Cross-Entropy (Logistic) Loss
            loss = -np.mean(y * np.log(y_predicted) + (1 - y) * np.log(1 - y_predicted))
            print(f"Iteration {i}/{n_iterations}, Loss: {loss:.4f}")
    
    return w, b

def predict_logistic_regression(X, w, b):
    linear_model = np.dot(X, w) + b
    y_predicted = sigmoid(linear_model)
    y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
    return np.array(y_predicted_cls)


# 3. Data Transformation and Training

In [17]:
# Convert string labels to numerical (0 for not_offensive, 1 for offensive)
y_train = np.array([1 if c.strip() == 'offensive' else 0 for c in y_train_str])
y_test = np.array([1 if c.strip() == 'offensive' else 0 for c in y_test_str])

# Preprocess and vectorize text data using TF-IDF
X_train, vocabulary, idf_scores = custom_tfidf_vectorizer(X_train_raw)

# Handle class imbalance with oversampling
from collections import Counter

def oversample(X, y):
    """
    A simple random oversampling implementation for imbalanced classes.
    It duplicates samples from the minority class to match the majority class size.
    """
    counts = Counter(y)
    minority_class = min(counts, key=counts.get)
    majority_class = max(counts, key=counts.get)
    
    minority_indices = np.where(y == minority_class)[0]
    majority_indices = np.where(y == majority_class)[0]
    
    num_minority_samples = len(minority_indices)
    num_majority_samples = len(majority_indices)
    
    # Calculate how many samples to add to the minority class
    num_to_add = num_majority_samples - num_minority_samples
    
    # Randomly select samples from the minority class to duplicate
    resample_indices = np.random.choice(minority_indices, size=num_to_add, replace=True)
    
    # Create the resampled feature matrix and labels
    X_resampled = np.vstack([X, X[resample_indices]])
    y_resampled = np.concatenate([y, y[resample_indices]])
    
    # Shuffle the resampled data to mix the classes
    shuffler = np.random.permutation(len(y_resampled))
    X_resampled = X_resampled[shuffler]
    y_resampled = y_resampled[shuffler]
    
    return X_resampled, y_resampled

X_train_resampled, y_train_resampled = oversample(X_train, y_train)

print("Class distribution before oversampling:", Counter(y_train))
print("Class distribution after oversampling:", Counter(y_train_resampled))

# Ensure test data has the same features and TF-IDF scores as training data
def map_test_to_tfidf(test_raw, vocab, idf_scores):
    X = np.zeros((len(test_raw), len(vocab)))
    punctuation_to_remove = string.punctuation.replace("'", "")
    for i, tweet in enumerate(test_raw):
        processed_text = tweet.lower().translate(str.maketrans('', '', punctuation_to_remove))
        words = handle_negations(processed_text.split())
        doc_word_counts = {}
        for word in words:
            if word in vocab:
                doc_word_counts[word] = doc_word_counts.get(word, 0) + 1
        
        if sum(doc_word_counts.values()) > 0:
            for word, count in doc_word_counts.items():
                tf = count / sum(doc_word_counts.values())
                tfidf = tf * idf_scores.get(word, 0)
                X[i, vocab[word]] = tfidf
    return X

X_test = map_test_to_tfidf(X_test_raw, vocabulary, idf_scores)

print(f"Training data shape after oversampling: {X_train_resampled.shape}")
print(f"Testing data shape: {X_test.shape}")

# Train the Logistic Regression model on the resampled data and get weights/bias
print("\nStarting model training...")
weights, bias = train_logistic_regression(X_train_resampled, y_train_resampled, learning_rate=0.1, n_iterations=1000)
print("Training complete.")


Class distribution before oversampling: Counter({np.int64(1): 3303, np.int64(0): 2249})
Class distribution after oversampling: Counter({np.int64(0): 3303, np.int64(1): 3303})
Training data shape after oversampling: (6606, 1016)
Testing data shape: (1388, 1016)

Starting model training...
Iteration 0/1000, Loss: 0.6931
Iteration 100/1000, Loss: 0.5607
Iteration 200/1000, Loss: 0.4711
Iteration 300/1000, Loss: 0.4077
Iteration 400/1000, Loss: 0.3607
Iteration 500/1000, Loss: 0.3245
Iteration 600/1000, Loss: 0.2958
Iteration 700/1000, Loss: 0.2723
Iteration 800/1000, Loss: 0.2529
Iteration 900/1000, Loss: 0.2364
Training complete.


# 4. Prediction and Evaluation

In [18]:
from collections import Counter

def accuracy_score(y_true, y_pred):
    """Calculates the accuracy of the predictions."""
    return np.sum(y_true == y_pred) / len(y_true)

# Make predictions on the test set using the trained weights and bias
y_pred = predict_logistic_regression(X_test, weights, bias)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the custom Logistic Regression model: {accuracy:.2f}")


Accuracy of the custom Logistic Regression model: 0.98


## 5. Saving and Loading the Model Parameters

In [None]:
import pickle

def save_model_params(weights, bias, vocabulary, idf_scores, filename='logistic_regression_tfidf.pkl'):
    """Save the trained model parameters and TF-IDF data to a file."""
    with open(filename, 'wb') as f:
        pickle.dump({
            'weights': weights,
            'bias': bias,
            'vocabulary': vocabulary,
            'idf_scores': idf_scores
        }, f)
    print(f'Model parameters saved to {filename}')

def load_model_params(filename='logistic_regression_tfidf.pkl'):
    """Load the trained model parameters from a file."""
    with open(filename, 'rb') as f:
        params = pickle.load(f)
    print(f'Model parameters loaded from {filename}')
    return params['weights'], params['bias'], params['vocabulary'], params['idf_scores']

# Save the weights, bias, vocabulary, and idf_scores returned from the training function
save_model_params(weights, bias, vocabulary, idf_scores)

# Load the parameters and print their shapes to verify
weights, bias, loaded_vocab, loaded_idf = load_model_params()
print("\nLoaded weights shape:", weights.shape)
print("Loaded bias:", bias)
print("Loaded vocabulary size:", len(loaded_vocab))
print("Loaded IDF scores size:", len(loaded_idf))


Model parameters saved to logistic_regression.pkl
Model parameters loaded from logistic_regression_tfidf.pkl

Loaded weights shape: (1016,)
Loaded bias: -0.13166330297141485
Loaded vocabulary size: 1016
Loaded IDF scores size: 1016


In [20]:
def classification_report_manual(y_true, y_pred, target_names):
    report = []
    for i, label in enumerate(target_names):
        tp = np.sum((y_true == i) & (y_pred == i))
        fp = np.sum((y_true != i) & (y_pred == i))
        fn = np.sum((y_true == i) & (y_pred != i))
        support = np.sum(y_true == i)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        report.append((label, precision, recall, f1, support))
    # Macro average
    macro_precision = np.mean([r[1] for r in report])
    macro_recall = np.mean([r[2] for r in report])
    macro_f1 = np.mean([r[3] for r in report])
    macro_support = np.sum([r[4] for r in report])
    # Compute accuracy for this report
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    print(f"Overall Accuracy: {accuracy:.4f}\n")
    print("Classification Report:")
    print("Category         Precision    Recall  F1-Score   Support")
    print("--------------------------------------------------------")
    for row in report:
        print(f"{row[0]:<18} {row[1]:>9.2f} {row[2]:>9.2f} {row[3]:>9.2f} {row[4]:>9}")
    print("--------------------------------------------------------")
    print(f"Macro Average    {macro_precision:>9.2f} {macro_recall:>9.2f} {macro_f1:>9.2f} {macro_support:>9}")


target_names = ['not_offensive', 'offensive']
classification_report_manual(y_test, y_pred, target_names)

Overall Accuracy: 0.9820

Classification Report:
Category         Precision    Recall  F1-Score   Support
--------------------------------------------------------
not_offensive           0.98      0.97      0.98       578
offensive               0.98      0.99      0.98       810
--------------------------------------------------------
Macro Average         0.98      0.98      0.98      1388
