In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn import *
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Data Processing
# params: String file
# return: DataFrame df
def import_data(file):
    df = pd.read_csv(file).drop_duplicates()
    # randomly sample 3000 hate speech
    hate = df.query('Class == "hate"').sample(n=3000).copy() 
    # randomly sample 3000 non-hate speech
    non_hate = df.query('Class == "none"').sample(n=3000).copy()
    # merge hate and non-hate
    df = pd.merge(hate, non_hate, how='outer').copy()

    return df

# Data Set Information Reporting
# params: DataFrame df
def data_stat(df):
    length = len(df)
    h_length = len(df.query('Class == "hate"'))
    n_length = len(df.query('Class == "none"'))
    print("\nNumber of samples:", length,"\nNumber of Hate Samples:", h_length, "\nNumber of non-Hate Samples:", n_length)

# Randomized Re-Annotation Sampling
# params: Dataframe df, float fn, float fp
# return: DataFrame rdf
def rand_annot_samp(df, fn, fp):

    rdf = df.copy()

    fn_tweets = df.query('Class == "hate"').sample(frac=fn).Tweets.tolist()
    fp_tweets = df.query('Class == "none"').sample(frac=fp).Tweets.tolist()

    for t in fn_tweets:
        rdf.loc[df["Tweets"] == t, "Class"] = "none"
    for t in fp_tweets:
        rdf.loc[df["Tweets"] == t, "Class"] = "hate"

    return rdf

''' TO-DO '''
# Intelligent Re-Annotation Sampling
# params: Dataframe df, float fn, float fp
# return: DataFrame idf
def inte_annot_samp(df, tweets):

    W_NORM = np.linalg.norm(clf.coef_)

    # iterate and calculate distance from tweet to model hyperplane
    # sort tweets by ascending order i.e. closest to hyperplane have higher priority
    # sample fp % of fp tweets and re-annotate
    # sample fn % of fn tweets and re-annotate
    # return updated df

    idf = df.copy()

    dist = []

    # generating distance list
    for tweet in tweets.iteritems():
        x = vectorizer.fit_transform([tweet[1]])
        x = x.toarray()

        zeros = [[0] for l in range(0,5000 - len(x[0]))]
        
        x = np.append(x,zeros)
        x = np.reshape(x,(1,-1))

        dist.append((clf.decision_function(x) / W_NORM)[0])

    # creating a new dict with tweets and their distance
    # fn_dict = {"Tweets" : fn_tweets.tolist(), "Distance" : fn_dist}
    # fp_dict = {"Tweets" : fp_tweets.tolist(), "Distance" : fp_dist}
    dict = {"Tweets" : tweets.tolist(), "Distance" : dist}

    # creating a new df with tweets and their distance
    # fn_df = pd.DataFrame.from_dict(fn_dict)
    # fp_df = pd.DataFrame.from_dict(fp_dict)
    new_df = pd.DataFrame.from_dict(dict)

    # sorting by distance in ascending order
    # fn_df.sort_values(by="Distance", inplace=True)
    # fp_df.sort_values(by="Distance", inplace=True)
    new_df.sort_values(by="Distance", inplace=True)
    new_df.head()
    # extracting sorted tweet order
    # fn_tweets = fn_df.Tweets.tolist()
    # fp_tweets = fp_df.Tweets.tolist()
    t = new_df.Tweets.tolist()

    # re-annotating
    # for i in range(0, FN_AMT):
    #     idf.loc[df["Tweets"] == fn_tweets[i], "Class"] = "none"
    # for i in range(0, FP_AMT):
    #     idf.loc[df["Tweets"] == fp_tweets[i], "Class"] = "hate"

    # while distance < 0
    # re-annotate tweets
    # print i as number of re-annotations

    final = pd.merge(idf, new_df, how="outer")
    final.sort_values(by="Distance", inplace=True)
    
    i = 0
    for index, row in final.iterrows():
        tweet = row["Tweets"]
        dist = row["Distance"]

        if (dist >= 0):
            break

        if (row["Class"] == "hate"):
            print(dist, row["Class"])
            idf.loc[idf["Tweets"] == tweet, "Class"] = "none"
        if (row["Class"] == "none"):
            print(dist, row["Class"])
            idf.loc[idf["Tweets"] == tweet, "Class"] = "hate"
        i += 1
        
    print("\nTotal Amount of Re-annotations:", i)
    return idf

In [3]:
# Data Processing
df = import_data("dataset.csv")

# Data Set Info
data_stat(df)

# Data Split
X_train, X_test, Y_train, Y_test = train_test_split(df.Tweets, df.Class, test_size=0.2)

vectorizer = CountVectorizer(analyzer = "word",tokenizer = None,preprocessor = None,stop_words = None,max_features = 5000)

train_data_features = vectorizer.fit_transform(X_train)
train_data_features = train_data_features.toarray()

test_data_features = vectorizer.transform(X_test)
test_data_features = test_data_features.toarray()

# Classifier
clf=svm.SVC(kernel='linear', C=1.0)

print ("\nTraining Baseline SVM")
clf.fit(train_data_features,Y_train)

print ("\nTesting Baseline SVM")
predicted=clf.predict(test_data_features)

accuracy=np.mean(predicted==Y_test)

# TO-DO: Format Accuracy Str
print ("\nBaseline Accuracy: ",accuracy) 

t_dict = {"Tweets" : X_test, "Actual" : Y_test, "Predicted" : predicted}
df2 = pd.DataFrame(t_dict).reset_index()

fn_tweets = df2.query('Actual == "hate" and Actual != Predicted').Tweets.copy()
fp_tweets = df2.query('Actual == "none" and Actual != Predicted').Tweets.copy()

fn = len(fn_tweets) / len(df2)
fp = len(fp_tweets) / len(df2)



tweets = fn_tweets.append(fp_tweets) 
tweets = tweets.tolist()


Number of samples: 6000 
Number of Hate Samples: 3000 
Number of non-Hate Samples: 3000

Training Baseline SVM

Testing Baseline SVM

Baseline Accuracy:  0.8183333333333334


  tweets = fn_tweets.append(fp_tweets)


In [9]:

print(int(fp * 3000) + )

544


In [11]:

W_NORM = np.linalg.norm(clf.coef_)
dist = []
idf = df.copy()

for t in tweets:
    x = vectorizer.fit_transform([t])
    x = x.toarray()

    zeros = [[0] for l in range(0,5000 - len(x[0]))]
    
    x = np.append(x,zeros)
    x = np.reshape(x,(1,-1))

    dist.append((clf.decision_function(x) / W_NORM)[0])

dict = {"Tweets" : tweets, "Distance" : dist}

new_df = pd.DataFrame.from_dict(dict)

final = pd.merge(df, new_df)

final.sort_values(by="Distance", inplace=True)

p_to_n = 0
n_to_p = 0
for index, row in final.iterrows():
    tweet = row["Tweets"]
    dist = row["Distance"]

    if (row["Class"] == "hate" and p_to_n != int(fn * 3000)):
        print(tweet, dist, idf.loc[df["Tweets"] == tweet, "Class"])
        idf.loc[idf["Tweets"] == tweet, "Class"] = "none"
        print(tweet, dist, idf.loc[df["Tweets"] == tweet, "Class"])
        p_to_n += 1
    elif (row["Class"] == "none" and n_to_p != int(fp * 3000)):
        print(tweet, dist, idf.loc[df["Tweets"] == tweet, "Class"])
        idf.loc[idf["Tweets"] == tweet, "Class"] = "hate"
        print(tweet, dist, idf.loc[df["Tweets"] == tweet, "Class"])
        n_to_p += 1
   
    if n_to_p == int(fp * 3000) and p_to_n == int(fn * 3000):
       break
        
print("\nTotal Amount of Re-annotations:", int(fp * 3000) + int(fn * 3000))

No one even likes you Kat &amp; Andre. You won't last long...#mkr -0.01934744600296756 712    hate
Name: Class, dtype: object
No one even likes you Kat &amp; Andre. You won't last long...#mkr -0.01934744600296756 712    none
Name: Class, dtype: object
@UseUrTongue You need serious help, Gags.  I hope you find it. http://t.co/W1u9n2sjMh -0.01934744600296756 1027    hate
Name: Class, dtype: object
@UseUrTongue You need serious help, Gags.  I hope you find it. http://t.co/W1u9n2sjMh -0.01934744600296756 1027    none
Name: Class, dtype: object
#mkr No No No No No No -0.003256843470238287 4413    none
Name: Class, dtype: object
#mkr No No No No No No -0.003256843470238287 4413    hate
Name: Class, dtype: object
I'm slowly finding people that I trust, but it's all so hard because I care *so much*. 0.00111988960130944 5989    none
Name: Class, dtype: object
I'm slowly finding people that I trust, but it's all so hard because I care *so much*. 0.00111988960130944 5989    hate
Name: Class, dtyp

In [12]:

data_stat(idf)

# Data Split
i_X_train, i_X_test, i_Y_train, i_Y_test = train_test_split(idf.Tweets, idf.Class, test_size=0.2)

i_vectorizer = CountVectorizer(analyzer = "word",tokenizer = None,preprocessor = None,stop_words = None,max_features = 5000)

i_train_data_features = i_vectorizer.fit_transform(i_X_train)
i_train_data_features = i_train_data_features.toarray()

i_test_data_features = i_vectorizer.transform(i_X_test)
i_test_data_features = i_test_data_features.toarray()

# Classifier
i_clf = svm.SVC(kernel='linear', C=1.0)

print("\nTraining Intelligent Sampling SVM")
i_clf.fit(i_train_data_features,i_Y_train)

print("\nTesting Intelligent Sampling SVM")
i_predicted = i_clf.predict(i_test_data_features)

i_accuracy = np.mean(i_predicted==i_Y_test)
# TO-DO: Format Accuracy Str
print ("\nIntelligent Accuracy: ",i_accuracy)  



Number of samples: 6000 
Number of Hate Samples: 2959 
Number of non-Hate Samples: 3041

Training Intelligent Sampling SVM

Testing Intelligent Sampling SVM

Intelligent Accuracy:  0.8841666666666667
