In [3]:
import os
import re
import glob
import pickle
import string

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

%matplotlib inline

In [4]:
pd.set_option('display.max_rows', 10)

In [5]:
# input path
SUPPORT_PATH = "../data/supports"

In [6]:
def read_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(path, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

In [None]:
# Load the two parsed tweet dumps.
# NOTE(review): the second frame is called d_6725 but is read from
# parsed_7003.pkl -- confirm which count the name should reflect.
d_199 = read_pickle(os.path.join(SUPPORT_PATH, "parsed_199.pkl"))
d_6725 = read_pickle(os.path.join(SUPPORT_PATH, "parsed_7003.pkl"))

# Stack both dumps, keep the first occurrence of each tweet id and
# renumber the rows from 0.
d_tweets = (
    pd.concat([d_199, d_6725], axis=0, sort=False)
    .drop_duplicates(subset="id_tweet")
    .reset_index(drop=True)
)

In [None]:
d_label = pd.read_csv("../data/account_labeled/project_12_labels_Thu_Oct_15_2020.csv")

In [None]:
d_label.head()

In [None]:
d_label.shape

In [None]:
d_tweets.head()

In [None]:
d_tweets.shape

In [None]:
d_tweets[d_tweets.screen_name.isin(['urmila__011'])]

In [None]:
# Parse the label export: the first whitespace-separated token of `Text` is
# taken as the profile URL, and the last path segment of that URL as the
# username. Only the username and its label are kept.
d_label = (
    d_label
    .rename(columns={'Label': "label"})
    .assign(url_profile=lambda frame: frame.Text.apply(lambda x: x.split()[0].strip()))
    .assign(username=lambda frame: frame.url_profile.apply(lambda x: x.split('/')[-1]))
)[["username", "label"]]

In [None]:
# 3258 accounts are labeled, the rest are excluded before parsing
d_tweets[d_tweets.screen_name.isin(d_label.username)].screen_name.unique().shape

In [None]:
# One row per account: gather all of its tweets into a list.
d_dataset = d_tweets.groupby('screen_name')['full_text'].apply(list).reset_index()

# Attach the manual label (left join): accounts without a label get NaN.
d_dataset = d_dataset.join(d_label.set_index('username'), on='screen_name')

# Drop accounts labeled 'inactive'. NaN labels compare unequal to 'inactive'
# and are therefore kept. reset_index returns a new frame (so later column
# assignments do not write into a slice of the joined frame) and restores a
# 0..n-1 index so that positional-looking lookups such as .loc[0, ...] still
# work after rows were removed.
d_dataset = d_dataset[d_dataset.label != 'inactive'].reset_index(drop=True)

In [None]:
# Number of tweets collected for each account.
d_dataset['num_tweets'] = d_dataset['full_text'].map(len)

In [None]:
# Represent each account by at most its first 10 tweets, joined with spaces.
d_dataset['text_used'] = [" ".join(tweets[:10]) for tweets in d_dataset.full_text]

In [None]:
def text_cleansing(title):
    """Normalize raw tweet text into a lowercase, space-separated token string.

    Steps, in order:

    1. Every ``http://`` / ``https://`` URL becomes the placeholder ``<LINK>``
       and every ``@mention`` becomes ``<USERNAME>``. This is done first so
       that the camelCase splitting below cannot break a URL or a handle
       apart and leak fragments of it into the text.
    2. In the remaining text, a space is inserted at each lower-to-upper case
       boundary (``#HelloWorld`` -> ``#Hello World``) and the text is
       lowercased. The placeholders stay uppercase.
    3. Punctuation is replaced by spaces (``<`` and ``>`` are deliberately
       not in the punctuation list so the placeholders survive), any other
       character that is not a word character or ``<``/``>`` is replaced by
       a space, runs of whitespace are collapsed, and the ends are stripped.

    Parameters
    ----------
    title : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    punctuation = '!"#$%&\'()*+,-./:;=?@[\\]^_`{|}~'
    table = str.maketrans(punctuation, ' ' * len(punctuation))  # map punctuation to space

    # The capturing group makes re.split return the matches at odd positions:
    # [text, match, text, match, ..., text].
    pieces = re.split(r"(https?://\S+|@\w+)", title, flags=re.IGNORECASE)

    cleaned = []
    for position, piece in enumerate(pieces):
        if position % 2:
            # URL or mention: swap in a placeholder, padded with spaces so it
            # never sticks to neighbouring text (extra spaces collapse below).
            cleaned.append(' <USERNAME> ' if piece.startswith('@') else ' <LINK> ')
        else:
            # Ordinary text: split camelCase (hashtags), then lowercase.
            piece = re.sub(r"([a-z])([A-Z])", r"\1 \2", piece)
            cleaned.append(piece.lower())
    title = "".join(cleaned)

    # remove punctuation
    title = title.translate(table)
    # keep only word characters and the <> of the placeholders
    title = re.sub(r"[^\w<>]", ' ', title)
    # collapse runs of whitespace
    title = re.sub(r"\s+", ' ', title)
    # trim leading/trailing whitespace
    return title.strip()

In [None]:
# Clean the concatenated tweets of every account.
d_dataset['preprocessed_text'] = d_dataset['text_used'].map(text_cleansing)

In [None]:
# Show the raw text of the first account. Positional access is used because
# row label 0 may no longer exist after rows were filtered out of d_dataset.
d_dataset['text_used'].iloc[0]

In [None]:
# Show the cleaned text of the first account. Positional access is used
# because row label 0 may no longer exist after rows were filtered out.
d_dataset['preprocessed_text'].iloc[0]

In [None]:
# Partition strictly on whether a label is present, so the two frames are
# exact complements of each other. A bare dropna() would also discard rows
# with a null in any other column. Copies are taken so later column
# assignments do not write into slices of d_dataset.
has_label = d_dataset.label.notna()
d_train = d_dataset[has_label].copy()
d_test = d_dataset[~has_label].copy()

In [None]:
# Keep only the columns needed for modelling. The explicit copy makes
# d_train an independent frame, so adding columns to it afterwards does not
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
d_train = d_train[["screen_name", "full_text", "text_used", "preprocessed_text", "label"]].copy()

In [None]:
# Binary target: buzzer -> 1, non-buzzer -> 0. Any other label value maps to NaN.
label_to_int = {'buzzer': 1, 'non-buzzer': 0}
d_train['label_encoded'] = d_train['label'].map(label_to_int)

In [None]:
d_train.head()

In [None]:
d_train.label_encoded.value_counts().plot(kind='pie', autopct='%.2f')

### Experimenting with simple algorithms

In [None]:
tfidf = TfidfVectorizer()

In [None]:
text_tfidf = tfidf.fit_transform(d_train.preprocessed_text)

In [None]:
text_tfidf.shape

In [None]:
# Split the cleaned text first so the vectorizer never sees held-out documents.
# The row assignment depends only on the number of samples and the seed, so
# y_train / y_test are the same as when splitting a pre-computed matrix.
text_train, text_test, y_train, y_test = train_test_split(
    d_train.preprocessed_text, d_train.label_encoded, test_size=0.2, random_state=123
)

# Fit TF-IDF (vocabulary and IDF weights) on the training split only, then
# apply that fitted transform to the test split. Fitting on all labeled data
# before splitting leaks test-set statistics into the training features.
# Note: this re-fits `tfidf`, so its vocabulary now reflects the training
# split and no longer matches the columns of the earlier full matrix.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
def scoring(y_test, y_pred):
    """Print accuracy, precision, recall and F1 of ``y_pred`` against ``y_test``.

    Each metric is computed with the scikit-learn function of the same name
    using its default arguments, and printed on one line with two decimals.
    Nothing is returned.
    """
    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
    acc, pre, rec, f1 = (metric(y_test, y_pred) for metric in metric_functions)

    print(f"accuracy: {acc:.2f} | precision: {pre:.2f} | recall: {rec:.2f} | f score: {f1:.2f}")

In [None]:
# Baseline: Bernoulli naive Bayes with default settings.
model = BernoulliNB().fit(X_train, y_train)  # fit returns the estimator itself
y_pred = model.predict(X_test)
scoring(y_test, y_pred)

In [None]:
# Linear-kernel support vector classifier.
# NOTE(review): gamma is documented as the coefficient for the rbf/poly/sigmoid
# kernels, so it presumably has no effect with kernel='linear' -- confirm.
model = SVC(kernel='linear', gamma='scale').fit(X_train, y_train)
y_pred = model.predict(X_test)
scoring(y_test, y_pred)

In [None]:
# Random forest with 100 trees. The seed is fixed (same value as the
# train/test split) so the reported scores are reproducible across runs.
model = RandomForestClassifier(n_estimators=100, random_state=123)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
scoring(y_test, y_pred)

In [None]:
# Gradient boosting with 200 stages. The seed is fixed (same value as the
# train/test split) so the reported scores are reproducible across runs.
model = GradientBoostingClassifier(n_estimators=200, random_state=123)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
scoring(y_test, y_pred)

In [None]:
# AdaBoost with 400 estimators. The seed is fixed (same value as the
# train/test split) so the reported scores are reproducible across runs.
model = AdaBoostClassifier(n_estimators=400, random_state=123)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
scoring(y_test, y_pred)