In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the two tab-separated inputs: the comment table (its first column becomes the index)
# and the per-annotator judgements.
comments = pd.read_csv(
    'attack_annotated_comments.tsv',
    sep='\t',
    index_col=0,
)
annotations = pd.read_csv(
    'attack_annotations.tsv',
    sep='\t',
)

In [3]:
# A revision is labelled an attack when strictly more than half of its annotations say so
# (mean of the 'attack' column per rev_id above 0.5).
labels = annotations.groupby('rev_id')['attack'].mean().gt(0.5)

In [4]:
# Attach the aggregated label to each comment; the Series is aligned on the frame's index.
comments['attack'] = labels.reindex(comments.index)

In [5]:
# Swap the NEWLINE_TOKEN / TAB_TOKEN placeholders for single spaces, handling both in one
# pass over each comment string.
comments['comment'] = comments['comment'].map(
    lambda text: text.replace("NEWLINE_TOKEN", " ").replace("TAB_TOKEN", " ")
)
# Disabled normalisation steps (whitespace collapsing, punctuation stripping), kept for reference:
# comments['comment'] = comments['comment'].str.replace('\s\s+', ' ')
# comments['comment'] = comments['comment'].str.replace('[^\w\s.]', '')

In [None]:
# Binary-encode the three metadata columns and derive a combined 'composite' score.
#
# NOTE: the columns are overwritten in place, so this cell is not idempotent -- reload
# `comments` before running it a second time, otherwise every flag collapses to 0.

# Compare the *string form* of `logged_in`: when read_csv parses a True/False column as
# booleans, a direct comparison with the string 'False' never matches and the flag would be
# 0 for every row. Going through str handles both boolean and string-typed columns.
comments['logged_in'] = comments['logged_in'].astype(str).eq('False').astype(int)  # 1 = not logged in
comments['ns'] = comments['ns'].eq('user').astype(int)            # 1 = 'user' namespace
comments['sample'] = comments['sample'].eq('blocked').astype(int)  # 1 = 'blocked' sample

encode_elements = ['logged_in', 'ns', 'sample']
# Number of flags set for the row, shifted by one so the score starts at 1.
comments['composite'] = comments[encode_elements].sum(axis=1) + 1

In [6]:
# Integer-encode the metadata columns in place. A single LabelEncoder is re-fitted for each
# column, so after the loop `enc` only remembers the classes of the last column.
enc = LabelEncoder()
encode_elements = ['logged_in', 'ns', 'sample']
for elem in encode_elements:
    comments[elem] = enc.fit_transform(comments[elem])

In [None]:
# --- Tunable settings used by the cells below ---

# CountVectorizer settings
ngram = 1              # upper bound of the n-gram range, i.e. ngram_range = (1, ngram)
max_features = 10000   # vocabulary size cap
min_df = 1             # minimum document frequency (integer => absolute count)
max_df = 1.0           # maximum document frequency (float => proportion of documents)
lowercase = True       # lowercase text before tokenising

# TfidfTransformer setting
transformer_norm = 'l2'

# Decision-tree split criterion
criterion = 'gini'

In [None]:
# Feature-extraction-only pipeline (no estimator): n-gram counts followed by TF-IDF weighting,
# both driven by the settings from the configuration cell.
comment_pipe = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            max_features=max_features,
            ngram_range=(1, ngram),
            max_df=max_df,
            min_df=min_df,
            lowercase=lowercase,
        ),
    ),
    (
        'tfidf',
        TfidfTransformer(norm=transformer_norm),
    ),
])

In [None]:
# Decision tree for the metadata-only baseline. The split criterion comes from the
# configuration cell so that changing `criterion` there actually takes effect
# (previously the setting was defined but never passed to the estimator).
clf = DecisionTreeClassifier(criterion=criterion)

In [None]:
# Baseline: fit the tree on the single `logged_in` feature and report how it scores on the
# very same rows it was trained on (no hold-out split here).
login_features = comments[['logged_in']]
clf.fit(login_features, comments['attack'])
met = metrics.classification_report(
    comments['attack'],
    clf.predict(login_features),
)
print(met)

In [None]:
# Text classifier: n-gram counts -> TF-IDF -> decision tree. This rebinds `clf`, replacing the
# bare tree used for the metadata baseline. The tree's split criterion is taken from the
# configuration cell so that the `criterion` setting is honoured instead of silently ignored.
clf = Pipeline([
    ('vect', CountVectorizer(max_features = max_features, ngram_range = (1,ngram), max_df = max_df, min_df = min_df,
    lowercase = lowercase)),
    ('tfidf', TfidfTransformer(norm = transformer_norm)),
    ('clf', DecisionTreeClassifier(criterion = criterion)),
])

In [None]:
# 3-fold cross-validation of the text pipeline. KFold is used without shuffling, so the folds
# follow the row order of `comments`. Each fold's classification report is appended to
# results.tsv.
kf = KFold(n_splits=3)
for train, test in kf.split(comments):
    # Slice each partition once instead of re-indexing the frame for every use.
    train_rows = comments.iloc[train]
    test_rows = comments.iloc[test]

    clf.fit(train_rows['comment'], train_rows['attack'])

    # Predict once per fold and reuse the result for both metrics; running the pipeline's
    # predict twice on the same rows only doubles the cost.
    predicted = clf.predict(test_rows['comment'])
    met = metrics.classification_report(test_rows['attack'], predicted, output_dict=True)
    # Kept in `confusion` for interactive inspection; it is not written to results.tsv.
    confusion = metrics.confusion_matrix(test_rows['attack'], predicted)

    df = pd.DataFrame(met).transpose()
    with open('results.tsv', 'a') as f:
        # Emit the header row only when the file is still empty.
        df.to_csv(f, mode='a', header=f.tell()==0, sep='\t')