# Gender Based Violence Classification

Scratch work notebook

In [35]:
# Imports
import pandas as pd
import numpy as np
from scipy.sparse import vstack

import string
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Global settings
pd.set_option('display.max_colwidth', None)

### Load Data

In [3]:
# Read csv
train_df = pd.read_csv('../data/Train.csv')
test_df = pd.read_csv('../data/Test.csv')
extra_df = pd.read_csv('../data/ExtraTweets.csv')

### Data Cleaning

In [4]:
neutral_df = extra_df.loc[extra_df['sentiment'] == 'neutral']
neutral_df.rename(columns={'text':'tweet'}, inplace=True)
neutral_df.dropna(inplace=True)

In [5]:
def preprocess_string(text: string):
    # Punctuation
    text = re.sub(r'[^\w\s]','',text)
    #text = ''.join([char for char in text if char not in string.punctuation])
    # Numbers
    text = ''.join([char for char in text if not char.isdigit()])
    # Lowercase
    text = text.lower()
    # Tokenize
    text = text.split()
    # Stopwords
    stop_words = stopwords.words('english')
    text = [word for word in text if word not in stop_words]
    # Stemming
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text]
    # Join
    text = ' '.join(text)
    
    return text

def preprocess_df(df: pd.DataFrame):
    df['preprocessed_tweet'] = df['tweet'].apply(lambda x: preprocess_string(x))
    df['preprocessed_tweet'] = df['tweet'].apply(lambda x: preprocess_string(x))
    return df

In [6]:
train_df = preprocess_df(train_df)
test_df = preprocess_df(test_df)
neutral_df = preprocess_df(neutral_df)

In [7]:
train_df['preprocessed_tweet'].head()

0                                                                  dream got rape last night guy work actual guy smoke hous much tryna sexual wasnt even like want smoke
1                                                                                                          thought word rape mean sex told saw dog rape eachoth like wtf
2                                                                                                                        talk rape men molest jail nother charg say word
3                              sexual abus year age one believ rape bro friend classroom told one caus would believ bro found friend brag wrong person hard come forward
4    chessi prout better tell truth sell owen labri hide whoever els dna underwear said never said rape chang chessi rape violat white femal privileg allow platform lie
Name: preprocessed_tweet, dtype: object

### Feature Engineering

In [8]:
#TF-IDF
tfidf_vectorizer = TfidfVectorizer(min_df=0.01)

# Combine data in training and extra set to train TFIDF
tfidf_train_data = pd.concat([train_df['preprocessed_tweet'], neutral_df['preprocessed_tweet']])

# Fit TFIDF on data
tfidf_trained = tfidf_vectorizer.fit(tfidf_train_data)

# Transform columns
tfidf_train = tfidf_trained.transform(train_df['preprocessed_tweet'])
tfidf_test = tfidf_trained.transform(test_df['preprocessed_tweet'])
tfidf_extra = tfidf_trained.transform(neutral_df['preprocessed_tweet'])

In [9]:
tfidf_extra.shape[0]

11117

In [10]:
# Label encode target
le = LabelEncoder()
train_df['encoded_type'] = le.fit_transform(train_df['type'])
le.classes_.tolist()

['Harmful_Traditional_practice',
 'Physical_violence',
 'economic_violence',
 'emotional_violence',
 'sexual_violence']

In [27]:
# Create final matrices for model usage
X_train = vstack((tfidf_train, tfidf_extra)).toarray()
#X_train = tfidf_train
y_train = train_df['encoded_type'].to_list()
y_train.extend(np.ones(tfidf_extra.shape[0])*5)

### Models

In [32]:
# Grid search to tune hyperparameters
grid = {
    "C": [0.25, 0.5, 0.75, 1],
    "penalty": ["l1", "l2", 'elasticnet'],
    'l1_ratio': [0.25, 0.5, 0.75]
}
grid_logreg = LogisticRegression(class_weight='balanced', multi_class='ovr', n_jobs=-1)
logreg_cv = GridSearchCV(grid_logreg, grid, cv=5, scoring='f1_macro')
logreg_cv.fit(X_train,y_train)
print(logreg_cv.best_params_)

{'C': 1, 'l1_ratio': 0.25, 'penalty': 'l2'}


In [33]:
multi_logreg = LogisticRegression(
    class_weight='balanced', 
    multi_class='ovr', 
    n_jobs=-1,
    max_iter=1000,
    C=1,
    penalty='l2',
)
multi_logreg.fit(X_train, y_train)
y_pred = multi_logreg.predict(X_train)

In [34]:

print(classification_report(y_train, y_pred, target_names=le.classes_.tolist()+['neutral']))

                              precision    recall  f1-score   support

Harmful_Traditional_practice       0.53      0.91      0.67       188
           Physical_violence       0.98      0.99      0.99      5946
           economic_violence       0.69      1.00      0.82       217
          emotional_violence       0.85      0.99      0.91       651
             sexual_violence       1.00      0.98      0.99     32648
                     neutral       0.97      0.99      0.98     11117

                    accuracy                           0.98     50767
                   macro avg       0.84      0.98      0.89     50767
                weighted avg       0.99      0.98      0.98     50767



In [36]:
multilabel_confusion_matrix(y_train, y_pred)

array([[[50427,   152],
        [   17,   171]],

       [[44713,   108],
        [   35,  5911]],

       [[50453,    97],
        [    0,   217]],

       [[50004,   112],
        [    9,   642]],

       [[18103,    16],
        [  616, 32032]],

       [[39313,   337],
        [  145, 10972]]])

In [54]:
tfidf_vectorizer.vocabulary_

{'got': 94,
 'rape': 179,
 'last': 124,
 'night': 159,
 'guy': 96,
 'work': 262,
 'actual': 2,
 'hous': 109,
 'much': 152,
 'sexual': 197,
 'wasnt': 247,
 'even': 66,
 'like': 131,
 'want': 246,
 'thought': 225,
 'word': 261,
 'mean': 146,
 'sex': 196,
 'told': 229,
 'saw': 191,
 'talk': 217,
 'men': 147,
 'jail': 117,
 'say': 192,
 'abus': 0,
 'year': 269,
 'age': 3,
 'one': 164,
 'believ': 23,
 'friend': 84,
 'caus': 37,
 'would': 263,
 'found': 83,
 'wrong': 265,
 'person': 168,
 'hard': 99,
 'come': 42,
 'better': 25,
 'tell': 218,
 'said': 190,
 'never': 156,
 'chang': 38,
 'white': 253,
 'lie': 129,
 'ye': 267,
 'women': 259,
 'also': 6,
 'yet': 270,
 'that': 220,
 'man': 142,
 'someth': 205,
 'happen': 98,
 'husband': 111,
 'beat': 21,
 'wife': 255,
 'court': 46,
 'sure': 215,
 'yr': 273,
 'old': 163,
 'girl': 88,
 'god': 91,
 'find': 79,
 'wont': 260,
 'back': 18,
 'time': 227,
 'peopl': 167,
 'ago': 4,
 'tw': 235,
 'sorri': 207,
 'hear': 103,
 'yeah': 268,
 'threaten': 226,
 '

In [85]:
multi_logreg.coef_[5].argsort()[-5:]

array([ 90, 262, 157, 228, 135])

In [90]:
vocab = list(tfidf_vectorizer.vocabulary_)
vocab[135]

'turn'