In [None]:
!pip install datasets
!pip install nltk

In [None]:
import datasets 
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the 'binary' configuration of the Measuring Hate Speech dataset.
dataset = datasets.load_dataset(
    'ucberkeley-dlab/measuring-hate-speech',
    'binary',
)

# Work with the 'train' split as a pandas DataFrame.
df = dataset['train'].to_pandas()

# Last expression of the cell: summary statistics rendered as the cell output.
df.describe()



  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,hatespeech,hate_speech_score,infitms,outfitms,annotator_severity,std_err,annotator_infitms,annotator_outfitms,hypothesis,annotator_age
count,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,...,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135451.0
mean,23530.416138,5567.097812,1.281352,2.954307,2.828875,2.56331,2.278638,2.698575,1.846211,1.052045,...,0.744733,-0.567428,1.034322,1.001052,-0.018817,0.300588,1.007158,1.011841,0.014589,37.910772
std,12387.194125,3230.508937,1.023542,1.231552,1.309548,1.38983,1.370876,0.8985,1.402372,1.345706,...,0.93226,2.380003,0.496867,0.791943,0.487261,0.23638,0.269876,0.675863,0.613006,11.641276
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-8.34,0.1,0.07,-1.82,0.02,0.39,0.28,-1.578693,18.0
25%,18148.0,2719.0,0.0,2.0,2.0,2.0,1.0,2.0,1.0,0.0,...,0.0,-2.33,0.71,0.56,-0.38,0.03,0.81,0.67,-0.341008,29.0
50%,20052.0,5602.5,1.0,3.0,3.0,3.0,3.0,3.0,2.0,0.0,...,0.0,-0.34,0.96,0.83,-0.02,0.34,0.97,0.85,0.110405,35.0
75%,32038.25,8363.0,2.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,...,2.0,1.41,1.3,1.22,0.35,0.42,1.17,1.13,0.449555,45.0
max,50070.0,11142.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,2.0,6.3,5.9,9.0,1.36,1.9,2.01,9.0,0.987511,81.0


In [None]:
def clean_data(df, col, clean_col):
    """Add a normalised, stemmed version of a text column to ``df``.

    Each value of ``df[col]`` is lower-cased, every character outside
    ``a-z``/``A-Z`` is replaced by a space, English stop words are dropped,
    and the remaining tokens are Snowball-stemmed and joined with single
    spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the raw text. It is modified in place:
        ``clean_col`` is added (or overwritten).
    col : str
        Name of the column holding the raw text.
    clean_col : str
        Name of the column that receives the cleaned text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Values that are not strings (e.g. ``None`` / NaN for missing comments)
    are mapped to the empty string instead of raising ``AttributeError``.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    # Compile once instead of re-parsing the pattern for every row.
    non_letters = re.compile('[^a-zA-Z]')

    def _normalise(value):
        # Missing or non-text entries have no .lower(); treat them as empty text.
        if not isinstance(value, str):
            return ''
        # split() without arguments trims and collapses whitespace runs, so no
        # separate strip / space-collapsing pass is needed.
        tokens = non_letters.sub(' ', value.lower()).split()
        # Stop words are filtered on the raw token, before stemming.
        return ' '.join(stemmer.stem(tok) for tok in tokens if tok not in stop_words)

    # Single pass over the column rather than four chained .apply calls.
    df[clean_col] = df[col].apply(_normalise)

    return df

In [None]:
# Derive the stemmed, stop-word-free 'clean_text' column from the raw 'text' column.
df = clean_data(df, col='text', clean_col='clean_text')

In [None]:
def _evaluate_split(pipeline, X, y):
    """Return ``(mse, r2)`` for ``pipeline``'s predictions on ``X`` scored against ``y``."""
    y_pred = pipeline.predict(X)
    return mean_squared_error(y, y_pred), r2_score(y, y_pred)


def find_best_regression_model(df, early_stopping_rounds=5, random_state=42,
                               text_col='text', target_col='hate_speech_score',
                               model_path='best_regression_model.joblib'):
    """Fit TF-IDF + linear regressors, keep the best one and save it to disk.

    The data is split 64 % / 16 % / 20 % into train / validation / test.
    Ridge, ordinary least squares and Lasso are each fitted on the training
    split behind a 1000-feature ``TfidfVectorizer``. The model with the lowest
    validation MSE is selected; test metrics are reported for information
    only and never drive the selection.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``text_col`` and ``target_col``.
    early_stopping_rounds : int, default 5
        Accepted for backward compatibility and ignored. None of the
        candidate estimators is trained incrementally, so refitting the same
        pipeline on the same data reproduces the same scores; each model is
        therefore fitted exactly once.
    random_state : int, default 42
        Seed used for both data splits and for the estimators that accept one.
    text_col : str, default 'text'
        Column fed to the vectoriser. Pass a preprocessed column such as
        ``'clean_text'`` to train on cleaned text instead of the raw comments.
    target_col : str, default 'hate_speech_score'
        Column holding the regression target.
    model_path : str, default 'best_regression_model.joblib'
        Where the winning fitted pipeline is written with ``joblib.dump``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline of the selected model (also saved to ``model_path``).
    """
    # Imported up front so a missing dependency fails before any training is done.
    from joblib import dump

    # Both splits honour `random_state`, so the seed argument controls the whole run.
    X_train, X_test, y_train, y_test = train_test_split(
        df[text_col], df[target_col], test_size=0.2, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=random_state)

    models = [
        ('ridge', Ridge(alpha=0.5, random_state=random_state, tol=0.001)),
        ('linear', LinearRegression()),
        ('lasso', Lasso(alpha=0.5, random_state=random_state, tol=0.001))
    ]

    best = None  # metrics and fitted pipeline of the best model seen so far

    for name, model in models:
        # A fresh vectoriser per model keeps every saved pipeline self-contained.
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=1000)),
            (name, model)
        ])
        pipeline.fit(X_train, y_train)

        mse_train, r_squared_train = _evaluate_split(pipeline, X_train, y_train)
        mse_val, r_squared_val = _evaluate_split(pipeline, X_val, y_val)
        mse_test, r_squared_test = _evaluate_split(pipeline, X_test, y_test)

        print("Model: ", name)
        print("Training MSE: ", mse_train)
        print("Training R squared: {}".format(r_squared_train))
        print("Validation MSE: ", mse_val)
        print("Validation R squared: {}".format(r_squared_val))
        print("Test MSE: ", mse_test)
        print("R squared: {}".format(r_squared_test))
        print("\n")

        # Select on the validation split only; choosing on test MSE would make
        # the reported test score an optimistic estimate.
        if best is None or mse_val < best['mse_val']:
            best = {
                'name': name,
                'pipeline': pipeline,
                'mse_val': mse_val,
                'mse_test': mse_test,
                'r_squared_test': r_squared_test,
            }

    # Report the winner's own metrics, not those of the last model in the loop.
    print("Best model: ", best['name'])
    print("Validation MSE: ", best['mse_val'])
    print("Test MSE: ", best['mse_test'])
    print("Test R squared: {}".format(best['r_squared_test']))

    dump(best['pipeline'], model_path)
    return best['pipeline']

In [None]:
# Fit and compare the candidate regressors on `df`; the winning pipeline is
# also written to 'best_regression_model.joblib' as a side effect.
testing = find_best_regression_model(df)

Model:  ridge
Round  1
Training MSE:  1.4577634066574034
Training R squared: 0.7436166764899279
Validation MSE:  1.5056167304077988
Validation R squared: 0.7344463822454461
Model:  ridge
Round  2
Training MSE:  1.4577634066574034
Training R squared: 0.7436166764899279
Validation MSE:  1.5056167304077988
Validation R squared: 0.7344463822454461
Early stopping round  2
Model:  ridge
Test MSE:  1.489034678381882
R squared: 0.7336687436093056


Model:  linear
Round  1
Training MSE:  1.4554183929322493
Training R squared: 0.7440291044667766
Validation MSE:  1.5059781317620036
Validation R squared: 0.7343826399695188
Model:  linear
Round  2
Training MSE:  1.4554183929322493
Training R squared: 0.7440291044667766
Validation MSE:  1.5059781317620036
Validation R squared: 0.7343826399695188
Early stopping round  2
Model:  linear
Test MSE:  1.4910839786247498
R squared: 0.7333022023082004


Model:  lasso
Round  1
Training MSE:  5.685874520618479
Training R squared: 0.0
Validation MSE:  5.6702428

In [None]:
# Reload the pipeline that find_best_regression_model saved to disk.
# NOTE: joblib.load unpickles the file, so only load artefacts you produced
# yourself or otherwise trust.
from joblib import load
loaded_pipeline = load('best_regression_model.joblib')