In [1]:
import datetime
import difflib

import numpy as np
import pandas as pd
from io import StringIO
from textblob import TextBlob

In [2]:
# Columns of the training dump grouped by the dtype pandas should parse them as.
text_columns = [
    "banned_by",
    "link_id",
    "author",
    "body",
    "quarantine",
    "recent_comments",
]
flag_columns = [
    "no_follow",
    "gilded",
    "author_verified",
    "over_18",
    "is_submitter",
    "is_bot",
    "is_troll",
]
numeric_columns = [
    "author_comment_karma",
    "author_link_karma",
    "num_comments",
    "created_utc",
    "score",
    "downs",
    "num_reports",
    "controversiality",
    "ups",
]
column_dtypes = {
    **dict.fromkeys(text_columns, str),
    **dict.fromkeys(flag_columns, bool),
    **dict.fromkeys(numeric_columns, np.float64),
}

# Load the raw training dump with the explicit dtypes above.
with open("/home/jason/bot_model/training-dump.csv") as dump_file:
    df = pd.read_csv(dump_file, sep=",", dtype=column_dtypes)

In [3]:
# delete columns that have missing data or won't have meaningful values in real-time data
columns = [
    "banned_by",
    "downs",
    "quarantine",
    "num_reports",
    "num_comments",
    "score",
    "ups",
    "controversiality",
    "gilded",
]
df.drop(columns, inplace=True, axis=1)

# drop duplicates (note: the surviving rows keep their original index labels)
df.drop_duplicates(inplace=True)

# format columns
# created_utc arrives as epoch seconds; .values bypasses index alignment
df["created_utc"] = pd.to_datetime(df["created_utc"].values, unit="s")
# only the first 200 characters of the body are kept; missing bodies become ""
df["body"] = df["body"].str.slice(stop=200).fillna("")

# add our new stats columns, all initialised to zero.
#
# A typed scalar is broadcast to every row. Assigning a freshly built
# pd.Series(np.zeros(len(df.index))) instead would carry a 0..n-1 RangeIndex,
# and because drop_duplicates() leaves gaps in df's index, pandas' index
# alignment would silently put NaN into every row whose label is >= len(df)
# (and upcast the integer columns to float).
integer_stats = [
    "recent_num_comments",
    "recent_num_last_30_days",
]
float_stats = [
    "recent_avg_no_follow",
    "recent_avg_gilded",
    "recent_avg_responses",
    "recent_percent_neg_score",
    "recent_avg_score",
    "recent_min_score",
    "recent_avg_controversiality",
    "recent_avg_ups",
    "recent_avg_diff_ratio",
    "recent_max_diff_ratio",
    "recent_avg_sentiment_polarity",
    "recent_min_sentiment_polarity",
]
for stat_name in integer_stats:
    df[stat_name] = np.int64(0)
for stat_name in float_stats:
    df[stat_name] = np.float64(0.0)

In [4]:
# Count num of bots and trolls
bots = df['is_bot']
trolls = df['is_troll']
normies = df[(df.is_bot == False) & (df.is_troll == False)]
print("Number of bot comments: ", bots.sum())
print("Number of troll comments:", trolls.sum())
print("Number of normal comments:", len(normies))

# Distinct authors per class (np.unique flattens the one-column frames)
bot_authors = df[df['is_bot'] == True][['author']]
troll_authors = df[df['is_troll'] == True][['author']]
print("Number of bot authors: ", len(np.unique(bot_authors)))
print("Number of troll authors:", len(np.unique(troll_authors)))

# Num of users
users = df['author'].values
num_of_users = np.unique(users)
print("Number of total authors: ", len(num_of_users))

# Set fractions between the user classes: keep every troll comment and a
# random sample of bot comments twice that size.
print("\nFixing ratios between classes")
data = df[df['is_troll']]
# Fixed seed so that re-running the notebook selects the same bot comments.
SAMPLE_SEED = 42
# sample() draws without replacement, so this raises if there are fewer than
# 2 * len(data) bot comments.
bot_sample = df[df['is_bot']].sample(n=len(data) * 2, random_state=SAMPLE_SEED)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([data, bot_sample])

Number of bot comments:  220534
Number of troll comments: 6562
Number of normal comments: 0
Number of bot authors:  343
Number of troll authors: 153
Number of total authors:  496

Fixing ratios between classes


In [5]:
# Running count of rows processed by calc_stats(); that function increments it
# through `global num` and prints it every 1000 rows as a progress indicator.
num = 0


def diff_ratio(_a, _b, autojunk=False):
    """Return the difflib similarity ratio between two sequences.

    Parameters
    ----------
    _a, _b : sequences of hashable items (strings in this notebook)
        The two sequences to compare.
    autojunk : bool, default False
        Forwarded to ``difflib.SequenceMatcher``. It is disabled by default
        because the automatic junk heuristic switches on once the second
        sequence has 200 or more items and then ignores every item that makes
        up more than 1% of it. Comment bodies here are truncated to exactly
        200 characters, so with the heuristic on almost every character of a
        full-length body is treated as junk and near-duplicate comments score
        close to 0. Pass ``autojunk=True`` to get difflib's default behaviour.
        NOTE(review): any code that computes this feature at prediction time
        must use the same setting.

    Returns
    -------
    float
        ``SequenceMatcher.ratio()``: 1.0 for identical sequences, 0.0 when
        nothing matches.
    """
    return difflib.SequenceMatcher(a=_a, b=_b, autojunk=autojunk).ratio()


def last_30(a, b):
    """Tell whether epoch time ``b`` falls after the point 30 days before ``a``.

    ``a`` is a datetime-like reference moment and ``b`` is a timestamp in
    seconds since the epoch. The comparison is strict, so a ``b`` exactly
    30 days before ``a`` yields False. There is no upper bound: any ``b``
    later than ``a`` also yields True.
    """
    window_start = a - datetime.timedelta(days=30)
    comment_time = pd.to_datetime(b, unit="s")
    return comment_time > window_start


# dtypes for the JSON-encoded comment history held in "recent_comments".
# Built once here rather than on every calc_stats() call.
_RECENT_COMMENTS_DTYPES = {
    "banned_by": str,
    "no_follow": bool,
    "link_id": str,
    "gilded": np.float64,
    "author": str,
    "author_verified": bool,
    "author_comment_karma": np.float64,
    "author_link_karma": np.float64,
    "num_comments": np.float64,
    "created_utc": np.float64,
    "score": np.float64,
    "over_18": bool,
    "body": str,
    "downs": np.float64,
    "is_submitter": bool,
    "num_reports": np.float64,
    "controversiality": np.float64,
    "quarantine": bool,
    "ups": np.float64,
}


def calc_stats(comment):
    """Fill in the ``recent_*`` feature columns for one comment row.

    Parameters
    ----------
    comment : pd.Series
        One row of the training frame, as passed by ``df.apply(..., axis=1)``.
        The fields read are ``recent_comments`` (a JSON string holding the
        comment history), ``created_utc`` and ``body``.

    Returns
    -------
    pd.Series
        The same row object with the statistics written into it. When the
        parsed history is empty only ``recent_num_comments`` is updated and
        the other ``recent_*`` fields keep whatever value they already had.

    Side effects
    ------------
    Increments the module-level ``num`` counter and prints it every 1000 rows.
    """
    # track progress
    global num
    num += 1
    if num % 1000 == 0:
        print(num)

    recent_comments = pd.read_json(
        StringIO(comment["recent_comments"]),
        dtype=_RECENT_COMMENTS_DTYPES,
    )
    comment["recent_num_comments"] = len(recent_comments)

    # Nothing to aggregate: leave the remaining stats untouched.
    if len(recent_comments) == 0:
        return comment

    created = comment["created_utc"]
    comment["recent_num_last_30_days"] = (
        recent_comments["created_utc"].apply(lambda ts: last_30(created, ts)).sum()
    )
    comment["recent_avg_no_follow"] = recent_comments["no_follow"].mean()
    comment["recent_avg_gilded"] = recent_comments["gilded"].mean()
    comment["recent_avg_responses"] = recent_comments["num_comments"].mean()

    history_scores = recent_comments["score"]
    # Vectorised comparison; NaN scores count as "not negative".
    comment["recent_percent_neg_score"] = (history_scores < 0).mean() * 100
    comment["recent_avg_score"] = history_scores.mean()
    comment["recent_min_score"] = history_scores.min()
    comment["recent_avg_controversiality"] = recent_comments[
        "controversiality"
    ].mean()
    comment["recent_avg_ups"] = recent_comments["ups"].mean()

    # Text similarity between this comment and each earlier one, on the first
    # 200 characters of each historical body.
    own_body = comment["body"]
    diff = (
        recent_comments["body"]
        .str.slice(stop=200)
        .fillna("")
        .apply(lambda text: diff_ratio(own_body, text))
    )
    comment["recent_avg_diff_ratio"] = diff.mean()
    comment["recent_max_diff_ratio"] = diff.max()

    # Sentiment over the untruncated history plus the current comment.
    # fillna("") mirrors the diff path above so a missing body cannot reach
    # TextBlob, and pd.concat replaces Series.append (removed in pandas 2.0).
    all_bodies = pd.concat(
        [recent_comments["body"].fillna(""), pd.Series(own_body)]
    )
    scores = all_bodies.apply(lambda text: TextBlob(text).sentiment.polarity)
    comment["recent_avg_sentiment_polarity"] = scores.mean()
    comment["recent_min_sentiment_polarity"] = scores.min()

    return comment


# Run calc_stats on every row (axis=1) to fill in the recent_* feature columns;
# the returned rows are collected into a new frame.
new_df = df.apply(calc_stats, axis=1)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000


In [8]:
# Keep only the rows whose minimum sentiment polarity is not NaN.
has_polarity = new_df["recent_min_sentiment_polarity"].notna()
new_df = new_df[has_polarity]

In [9]:
def setTarget(x):
    """Map a row's ``is_bot`` / ``is_troll`` flags to a single label.

    Returns "bot" when ``x.is_bot`` is truthy (checked first), otherwise
    "troll" when ``x.is_troll`` is truthy, otherwise "normal".
    """
    if x.is_bot:
        return "bot"
    return "troll" if x.is_troll else "normal"


# Collapse the two boolean flags into a single training label per row.
new_df["target"] = new_df.apply(setTarget, axis=1)

# The is_bot and is_troll columns are now redundant with "target".
columns = ["is_bot", "is_troll"]
new_df.drop(columns=columns, inplace=True)

# The raw recent_comments JSON is dropped to save space.
columns = ["recent_comments"]
new_df.drop(columns=columns, inplace=True)

new_df.to_csv(
    "/home/jason/bot_model/my_clean_data_training.csv",
    sep=",",
    index=False,
)
print("The data cleaning finished correctly!!!")

The data cleaning finished correctly!!!
