In [1]:
import json
import pandas as pd
import glob
import re
import spacy

from tqdm import tqdm
tqdm.pandas(desc="my bar!")

from gensim.matutils import jaccard
from strsimpy.levenshtein import Levenshtein

  from pandas import Panel


## Tokenization and feature extraction

In [2]:
def tokenizer(text):
    """Break an underscore / camel-case string into a flat list of word tokens.

    The text is cut on underscores (empty pieces are dropped) and a single
    leading ``@`` is removed from each piece. Every piece is then split
    before each capital-followed-by-lowercase pair and before each run of
    two or more capitals.

    Example:
        ``tokenizer("@JohnDoe_NYC")`` returns ``['John', 'Doe', 'NYC']``.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: Tokens in the order they appear in ``text``.
    """
    word_start = re.compile(r"([A-Z][a-z])")
    caps_run = re.compile(r"([A-Z]{2,})")
    with_space = r" \1"

    tokens = []
    for piece in text.split('_'):
        # Consecutive, leading, or trailing underscores yield empty pieces.
        if not piece:
            continue
        if piece.startswith('@'):
            piece = piece[1:]
        # Mark word boundaries with spaces, then let split() cut on them.
        spaced = word_start.sub(with_space, piece)
        spaced = caps_run.sub(with_space, spaced)
        tokens.extend(spaced.split())

    return tokens

In [3]:
def jaccard_sim_feature(df, spwd):
    """Per row, find the smallest Jaccard value between any token and any lexicon word.

    Note: ``gensim.matutils.jaccard`` is documented as a *distance*, so despite
    this function's name a lower value means a closer match. With plain strings
    it presumably compares them as sets of characters -- TODO confirm against
    the installed gensim version.

    Args:
        df: DataFrame with a ``token`` column holding a list of strings per row.
        spwd: DataFrame with a ``social political`` column of lexicon words.

    Returns:
        list: One entry per row of ``df``: the minimum of
        ``jaccard(token, word)`` over all token/word pairs, or ``nan`` when
        the row has no tokens or the lexicon is empty (previously this case
        raised ``ValueError`` from ``min()`` on an empty sequence).
    """
    # The lexicon does not change between rows; materialise it once instead of
    # re-reading the pandas Series for every token.
    lexicon = spwd['social political'].tolist()
    no_match = float('nan')

    out = []
    # Iterate the column directly (cheaper than iterrows) and pass the length
    # so the progress bar can show a percentage.
    for tokens in tqdm(df['token'], total=len(df)):
        # The minimum of per-token minima equals the minimum over all pairs.
        best = min(
            (jaccard(user_token, word) for user_token in tokens for word in lexicon),
            default=no_match,
        )
        out.append(best)
    return out

In [4]:
def levenshtein_feature(df, spwd):
    """Per row, find the smallest Levenshtein distance between any token and any lexicon word.

    Args:
        df: DataFrame with a ``token`` column holding a list of strings per row.
        spwd: DataFrame with a ``social political`` column of lexicon words.

    Returns:
        list: One entry per row of ``df``: the minimum of
        ``Levenshtein().distance(token, word)`` over all token/word pairs, or
        ``nan`` when the row has no tokens or the lexicon is empty (previously
        this case raised ``ValueError`` from ``min()`` on an empty sequence).
    """
    levenshtein = Levenshtein()
    # The lexicon does not change between rows; materialise it once instead of
    # re-reading the pandas Series for every token.
    lexicon = spwd['social political'].tolist()
    no_match = float('nan')

    out = []
    # Iterate the column directly (cheaper than iterrows) and pass the length
    # so the progress bar can show a percentage.
    for tokens in tqdm(df['token'], total=len(df)):
        # The minimum of per-token minima equals the minimum over all pairs.
        best = min(
            (levenshtein.distance(user_token, word) for user_token in tokens for word in lexicon),
            default=no_match,
        )
        out.append(best)
    return out

In [5]:
def create_features(dataset_path, spwd_path):
    """Load the dataset and lexicon CSVs and attach the derived feature columns.

    Columns added to the dataset, in this order:
        token            -- ``tokenizer`` applied to ``text``
        char_count       -- ``len`` of ``text``
        n_token          -- number of tokens
        jaccard_sim      -- output of ``jaccard_sim_feature``
        levenshtein_dist -- output of ``levenshtein_feature``

    ``progress_apply`` is used for the first three columns, so ``tqdm.pandas``
    must have been registered beforehand.

    Args:
        dataset_path: Path to a CSV file containing a ``text`` column.
        spwd_path: Path to a CSV file containing a ``social political`` column.

    Returns:
        pandas.DataFrame: The dataset with the feature columns added.
    """
    frame = pd.read_csv(dataset_path)
    lexicon = pd.read_csv(spwd_path, usecols=['social political'])

    # Cheap per-row features, each with its own progress bar.
    raw_text = frame['text']
    frame['token'] = raw_text.progress_apply(tokenizer)
    frame['char_count'] = raw_text.progress_apply(len)
    frame['n_token'] = frame['token'].progress_apply(len)

    # Lexicon-based features: every token is compared with every lexicon word.
    jaccard_values = jaccard_sim_feature(frame, lexicon)
    frame['jaccard_sim'] = jaccard_values
    levenshtein_values = levenshtein_feature(frame, lexicon)
    frame['levenshtein_dist'] = levenshtein_values

    return frame

In [6]:
# Input files, given relative to the notebook's working directory.
dataset_path = '../data/dataset/short_text_dataset.csv'  # read by create_features; must contain a 'text' column
spwd_path = '../data/SPWD.csv'  # lexicon CSV; only its 'social political' column is loaded

In [7]:
# Build the feature table. In the recorded run (12,201 rows) the Jaccard pass
# took about 1 minute and the Levenshtein pass about 6 minutes.
features = create_features(dataset_path, spwd_path)

my bar!: 100%|██████████| 12201/12201 [00:00<00:00, 100888.94it/s]
my bar!: 100%|██████████| 12201/12201 [00:00<00:00, 1003838.90it/s]
my bar!: 100%|██████████| 12201/12201 [00:00<00:00, 964068.86it/s]
12201it [00:58, 206.95it/s]
12201it [06:10, 32.96it/s]


In [14]:
# Save the feature table as CSV; index=False leaves the row index out of the file.
# NOTE(review): this cell's execution count (14) does not follow the previous
# cell (7) -- re-run from a fresh kernel to confirm the saved file matches the
# code above.
features.to_csv('../data/dataset/short_text_feat.csv', index=False)