<a href="https://colab.research.google.com/github/jinuElsa/project/blob/main/projectfinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Libraries

In [4]:
import pandas as pd
import numpy as np
import sys
from sklearn.pipeline import Pipeline, FeatureUnion 
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import confusion_matrix, classification_report

ModuleNotFoundError: ignored

In [5]:
# Load the three source files; all of them are tab-separated despite the .csv extension
tsvOptions = {'sep': '\t'}
# index_col=0 makes the first column of the comments file the frame's index
comments = pd.read_csv('attack_annotated_comments.csv', index_col=0, **tsvOptions)
annotations = pd.read_csv('attack_annotations.csv', **tsvOptions)
demographics = pd.read_csv('attack_worker_demographics.csv', **tsvOptions)

In [6]:
# Join the demographic data to the annotations data, then expand the categorical
# demographic columns into 0/1 indicator columns and drop the columns that are
# irrelevant or have already been expanded.
annotWithDemo = pd.merge(annotations, demographics, on='worker_id', how='left').drop(
    columns=['quoting_attack', 'recipient_attack', 'third_party_attack',
             'other_attack', 'worker_id'])

# Expand one categorical column at a time; each pass returns a new frame, so
# annotWithDemo itself keeps its original categorical columns.
boolCols = annotWithDemo
for demoCol in ['gender', 'age_group', 'education']:
    boolCols = boolCols.join(boolCols[demoCol].str.get_dummies()).drop(columns=demoCol)

# Collapse the education levels into two buckets. str.get_dummies only creates a
# column for values that actually occur in the data, so only the levels that are
# present are summed; indexing a missing level directly would raise KeyError.
noDegreeLevels = [level for level in ['none', 'some', 'hs']
                  if level in boolCols.columns]
collegeDegreeLevels = [level for level in ['bachelors', 'doctorate', 'masters', 'professional']
                       if level in boolCols.columns]
boolCols['no_degree'] = boolCols[noDegreeLevels].sum(axis=1)
boolCols['college_degree'] = boolCols[collegeDegreeLevels].sum(axis=1)

# The per-level indicator columns are now redundant with the two buckets
boolCols = boolCols.drop(columns=noDegreeLevels + collegeDegreeLevels)

KeyError: ignored

In [None]:
# Keep only the annotation rows that flagged an attack, then total every column
# per rev_id for both frames. The "attack only" totals serve as the numerator when
# computing the share of each demographic column that labeled a review an attack.
flaggedAsAttack = boolCols['attack'] > 0
boolColsAttackOnly = boolCols.loc[flaggedAsAttack]
boolColsGrouped = boolCols.groupby('rev_id', as_index=False).sum()
boolColsAttackOnlyGrouped = boolColsAttackOnly.groupby('rev_id', as_index=False).sum()

In [None]:
# Turn the demographic columns into percentages and compute the overall share of
# annotators who marked each comment as an attack, to aid in classifying comments
allRev = boolColsGrouped[['rev_id']]
# The left merge keeps one row per rev_id, so reviews without any attack
# annotation stay in the frame (with NaN counts)
allRevAttackOnlyGrouped = allRev.merge(boolColsAttackOnlyGrouped, on='rev_id', how='left')

# Numerator: attack-only totals; denominator: totals over all annotations
attackVotes = allRevAttackOnlyGrouped.loc[:, 'english_first_language':]
allVotes = boolColsGrouped.loc[:, 'english_first_language':]
demo = attackVotes / allVotes

# Overall attack share = summed attack flags / number of annotations per rev_id
totalAnnotators = boolCols.groupby('rev_id', as_index=False).count()['attack']
attack = (boolColsGrouped['attack'] / totalAnnotators).to_frame('pctAttack')

In [None]:
# Find the max demographic percentage that advocated for attack in each row and
# build the attack frame (rev_id, pctAttack, demoMax, attack) in a single step.
# DataFrame.insert raises ValueError when the column already exists, so building
# the frame this way keeps the cell re-runnable without re-running earlier cells.
demoMax = demo.loc[:, 'english_first_language':].max(axis=1)
pctAttack = attack['pctAttack']

# A comment is labeled an attack when at least half of all annotators, or more
# than half of any single demographic column, flagged it -- unless fewer than a
# quarter of all annotators did, which always overrides the label to False.
isAttack = ((pctAttack >= .5) | (demoMax > .5)) & ~(pctAttack < .25)

attack = pd.DataFrame({
    'rev_id': boolColsGrouped['rev_id'],
    'pctAttack': pctAttack,
    'demoMax': demoMax,
    'attack': isAttack,
})
labels = attack.drop(columns=['demoMax', 'pctAttack'])

In [None]:
# Create the labels data frame by dropping the helper columns from the attack
# frame, then merge the labels into the comments dataframe so every comment
# carries its label.
labels = attack.drop(columns=['demoMax', 'pctAttack'])

# Drop a label column left over from a previous run of this cell first; otherwise
# the merge would suffix the duplicates (attack_x / attack_y) and comments.attack
# would no longer exist. On a fresh run there is nothing to drop.
comments = pd.merge(comments.drop(columns='attack', errors='ignore'), labels,
                    on='rev_id', how='left')

 Cleaning the data

In [None]:
# Replace the NEWLINE_TOKEN and TAB_TOKEN placeholders with a single space.
# The vectorized .str accessor does a literal (regex=False) replacement and leaves
# missing comments as NaN instead of failing on them like a row-wise lambda would.
comments['comment'] = (
    comments['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
)

In [None]:
# Split the comments into train/test sets and encode the labels
X_train, X_test, y_train, y_test = train_test_split(
    comments.comment, comments.attack, test_size=.33, random_state=42)

# Fit the encoder on the training labels only and reuse that mapping for the test
# labels. Refitting on y_test could assign different integer codes to the same
# class, which would silently corrupt the evaluation; transform() instead raises
# if the test set contains a label that was never seen during training.
encode = LabelEncoder()
y_train = encode.fit_transform(y_train)
y_test = encode.transform(y_test)

In [None]:
# Parameter grid for the search, defined on its own so values are easy to tweak.
# Keys follow the pipeline step names: features__word / features__char / clf.
parameterGrid = {
    # word-level TF-IDF vectorizer
    'features__word__max_features': [10000],
    'features__word__ngram_range': [(1, 2)],
    'features__word__lowercase': [True],
    'features__word__stop_words': ['english'],
    'features__word__strip_accents': ['unicode'],

    # character-level TF-IDF vectorizer
    'features__char__max_features': [25000],
    'features__char__ngram_range': [(2, 3)],
    'features__char__lowercase': [True],
    'features__char__strip_accents': ['unicode'],

    # classifier
    'clf__loss': ['modified_huber'],
    'clf__alpha': [.0001],
    'clf__learning_rate': ['optimal'],
    'clf__eta0': [.001],
}

In [None]:
# Setup classifier
# random_state pins the classifier's random number generation so repeated runs
# give the same model; it uses the same seed as the train/test split.
clf = SGDClassifier(verbose = 51, random_state=42) #Verbosity over 50 prints the entire log as it is fitted

# Two TF-IDF views of each comment (word-level and character-level) are
# concatenated into a single feature matrix by the FeatureUnion.
wVector = TfidfVectorizer(analyzer='word')
cVector = TfidfVectorizer(analyzer='char')
fUnion = FeatureUnion([("word", wVector), ("char", cVector)])

# The step names ('features', 'clf') and the union names ('word', 'char') must
# match the prefixes used in the parameterGrid keys.
pipe = Pipeline([
    ('features', fUnion),
    ('clf', clf)
])

# 3-fold cross-validated search over parameterGrid, scored by F1
grid_search = GridSearchCV(pipe, param_grid=parameterGrid, n_jobs=6, pre_dispatch=4,
                            verbose=51,cv=3, scoring='f1')

In [None]:
# Run the grid search on the training split
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Classification report for the held-out test split, using the best estimator
# found by the grid search
best_model = grid_search.best_estimator_
y_valid_pred = best_model.predict(X_test)
met = classification_report(y_true=y_test, y_pred=y_valid_pred)
print(met)

In [None]:
# Confusion Matrix: rows (Y-axis) are the true labels, columns (X-axis) are the
# labels predicted by the model, i.e. conf_mat[i, j] counts samples whose true
# class is i and whose predicted class is j.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_valid_pred)
print(conf_mat)

In [None]:
# Print the value the best estimator ended up with for every parameter that was
# part of the grid search (adapted from lecture code)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameterGrid):
    chosen_value = best_parameters[param_name]
    print("\t%s: %r" % (param_name, chosen_value))
sys.stdout.flush()