# Install libraries

In [None]:
!pip install pandas
!pip install openpyxl
!pip install gensim
!pip install SPARQLWrapper
!pip install xlrd==1.2.0
!pip install nltk
!pip install python-crfsuite

# Import libraries

In [1]:
import sys
import re
import pandas as pd
import numpy as np
from gensim.models import FastText
from gensim.models import KeyedVectors
from SPARQLWrapper import SPARQLWrapper, JSON
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tag import CRFTagger
from collections import Counter
import pickle

# Indonesian stopwords from the NLTK corpus, held as a set; later cells use it
# for set difference/intersection and for membership tests.
stopword_set = set(stopwords.words('indonesian'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Read data

In [2]:
# sample data

path = '../example-data/'

# Profile-level attributes (one row per account).
df_prof = pd.read_excel(path + 'df_structured.xlsx')
df_prof.info()
display(df_prof.head())

# Tweets: drop repeated tweet texts, then order by creation time, newest first.
df_tweet = (
    pd.read_csv(path + 'ridwankamil.csv')
    .drop_duplicates(subset=['tweet'])
    .sort_values(by='created_at', ascending=False)
)
df_tweet.info()
df_tweet.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 14 columns):
profile_background_color        1 non-null int64
profile_link_color              1 non-null object
profile_sidebar_border_color    1 non-null object
profile_sidebar_fill_color      1 non-null object
profile_text_color              1 non-null object
bio                             1 non-null object
name                            1 non-null object
username                        1 non-null object
followers                       1 non-null int64
friends_count                   1 non-null int64
listed_count                    1 non-null int64
favourites_count                1 non-null int64
statuses_count                  1 non-null int64
protected                       1 non-null bool
dtypes: bool(1), int64(6), object(7)
memory usage: 185.0+ bytes


Unnamed: 0,profile_background_color,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,bio,name,username,followers,friends_count,listed_count,favourites_count,statuses_count,protected
0,352726,D02B55,829D5E,99CC33,3E4415,Governor of West Java. Broadcaster of Daily Ha...,Ridwan Kamil,ridwankamil,4974848,2842,1547,22037,43924,False


<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 36 columns):
id                 60 non-null int64
conversation_id    60 non-null int64
created_at         60 non-null object
date               60 non-null object
time               60 non-null object
timezone           60 non-null int64
user_id            60 non-null int64
username           60 non-null object
name               60 non-null object
place              0 non-null float64
tweet              60 non-null object
language           60 non-null object
mentions           60 non-null object
urls               60 non-null object
photos             60 non-null object
replies_count      60 non-null int64
retweets_count     60 non-null int64
likes_count        60 non-null int64
hashtags           60 non-null object
cashtags           60 non-null object
link               60 non-null object
retweet            60 non-null bool
quote_url          0 non-null float64
video              60 non-null i

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1545938039058604032,1545937935606116352,2022-07-10 09:08:45 WITA,2022-07-10,09:08:45,800,80323736,ridwankamil,Ridwan Kamil,,...,,,,,,[],,,,
1,1545937935606116352,1545937935606116352,2022-07-10 09:08:20 WITA,2022-07-10,09:08:20,800,80323736,ridwankamil,Ridwan Kamil,,...,,,,,,[],,,,
2,1545371884703072256,1545371884703072256,2022-07-08 19:39:03 WITA,2022-07-08,19:39:03,800,80323736,ridwankamil,Ridwan Kamil,,...,,,,,,[],,,,
3,1544966777628925952,1544966764001640449,2022-07-07 16:49:18 WITA,2022-07-07,16:49:18,800,80323736,ridwankamil,Ridwan Kamil,,...,,,,,,[],,,,
4,1544966767956869120,1544966764001640449,2022-07-07 16:49:16 WITA,2022-07-07,16:49:16,800,80323736,ridwankamil,Ridwan Kamil,,...,,,,,,[],,,,


In [3]:
# vectorizer

path = '../vectorizer/'


def _load_pickle(file_path):
    """Unpickle and return the object stored at ``file_path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (or fails).
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(file_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


name_feat_ia_vectorizer = _load_pickle(path + 'name_feat_ia_vec.pickle')
username_feat_char_vectorizer = _load_pickle(path + 'username_feat_char_vec.pickle')
tweet_feat_bow_stop_vectorizer = _load_pickle(path + 'tweet_feat_bow_stop_vec.pickle')

In [4]:
# crf tagger

path = '../misc/'
ct = CRFTagger()
ct.set_model_file(path + 'all_indo_man_tag_corpus_model.crf.tagger')

# emoticon

# One emoticon per line. The `with` block guarantees the file is closed even if
# reading fails.
with open(path + 'EMOTICON.txt', 'r') as f_open:
    # Blank lines (e.g. from a trailing newline) are skipped: counting the empty
    # string with str.count('') returns len(text) + 1, which would inflate the
    # emoticon feature for every tweet.
    emoticons = [line for line in f_open.read().split('\n') if line]

# Functions

In [5]:
def get_wikidata_dict(endpoint_url, query):
    """Send ``query`` to the SPARQL endpoint and return the converted JSON response.

    Args:
        endpoint_url: URL of the SPARQL endpoint.
        query: SPARQL query text.

    Returns:
        Whatever ``convert()`` yields for a query executed with the JSON
        return format.
    """
    # Identify the client with the running Python major.minor version.
    major, minor = sys.version_info[0], sys.version_info[1]
    client = SPARQLWrapper(endpoint_url, agent='WDQS-example Python/%s.%s' % (major, minor))
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

def get_name_tokens(df, gender):
    """Return the distinct lower-cased name tokens of all rows with the given gender.

    Args:
        df: DataFrame with a ``gender`` column and a ``nama`` (name) column.
        gender: Value of ``gender`` to select rows by.

    Returns:
        A list of unique tokens (order is unspecified, as it comes from a set).
    """
    # Accumulate straight into a set: the previous `list = list + tokens`
    # re-copied the whole list for every name, which is quadratic overall.
    unique_tokens = set()
    for full_name in df.loc[df.gender == gender, 'nama']:
        unique_tokens.update(tok.lower() for tok in nltk.word_tokenize(full_name))
    return list(unique_tokens)

def get_vec(vectorizer, arr_text):
    """Transform ``arr_text`` with ``vectorizer`` and return the result of its ``toarray()``."""
    transformed = vectorizer.transform(arr_text)
    return transformed.toarray()

# Feature Extraction

## Color

In [6]:
# extraction

color_cols = [
    'profile_background_color',
    'profile_link_color',
    'profile_sidebar_border_color',
    'profile_sidebar_fill_color',
    'profile_text_color'
]


def _color_to_bins(value):
    """Convert one 6-digit hex colour into three channel bins in the range 0-7.

    Colours made only of digits are parsed by pandas as numbers, which drops
    leading zeros (e.g. ``000000`` becomes ``0``); the value is therefore
    zero-padded back to six characters before it is split into R, G and B.
    """
    # A numeric column that contains missing cells is read as float; turn an
    # integral float such as 352726.0 back into 352726 before formatting.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    hex_str = str(value).zfill(6)
    # Each channel is 0-255; integer division by 32 buckets it into 8 bins.
    return [int(hex_str[pos:pos + 2], 16) // 32 for pos in (0, 2, 4)]


# One row per profile: the three bins of every colour column, in color_cols order.
color_feat = np.array([
    [channel for col in color_cols for channel in _color_to_bins(row[col])]
    for _, row in df_prof.iterrows()
])
print('All color:', color_feat)

All color: [[1 1 1 6 1 2 4 4 2 4 6 1 1 2 0]]


## Name

In [7]:
# get name dictionary

endpoint_url = 'https://query.wikidata.org/sparql'

# Humans (P31 = Q5) with country of citizenship Q252, together with their
# sex/gender (P21); labels are requested in Indonesian, falling back to English.
query = '''SELECT ?item ?itemLabel ?sexLabel
WHERE {
  ?item wdt:P31 wd:Q5 .
  ?item wdt:P27 wd:Q252 .
  ?item wdt:P21 ?sex .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "id, en". }
}'''

results = get_wikidata_dict(endpoint_url, query)

# Collect every binding first and build the frame once: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame each time.
records = [
    {'gender': binding['sexLabel']['value'], 'nama': binding['itemLabel']['value']}
    for binding in results['results']['bindings']
]
# Explicit columns keep the frame usable even when the query returns no rows.
df_wikidata = pd.DataFrame(records, columns=['gender', 'nama']).drop_duplicates()
display(df_wikidata['gender'].value_counts(dropna=False))
display(df_wikidata.head(1))

tokens_pria = get_name_tokens(df_wikidata, 'laki-laki')
print(len(tokens_pria))
display(tokens_pria[0:5])

tokens_wanita = get_name_tokens(df_wikidata, 'perempuan')
print(len(tokens_wanita))
display(tokens_wanita[0:5])

laki-laki                24742
perempuan                12391
perempuan transgender        7
Name: gender, dtype: int64

Unnamed: 0,gender,nama
0,laki-laki,Triyatno


21164


['ghozali', 'assegaf', 'najib', 'katoppo', 'mochamad']

13245


['jumah', 'fitrika', 'aifan', 'assegaf', 'umami']

In [8]:
# extraction

arr_name = []
dict_name = []

# Set copies of the name dictionaries: membership tests no longer scan the
# full token lists for every token of every profile name.
male_tokens = set(tokens_pria)
female_tokens = set(tokens_wanita)

# 'dini' and 'hari' are excluded from the stopword check
# (presumably because they also occur as personal names -- TODO confirm).
name_stopwords = stopword_set - set(['dini', 'hari'])

for d in df_prof['name']:
    # Cells that are not text (NaN, numbers) cannot be a usable name; treating
    # them as an empty string routes them to the "invalid name" branch below.
    d = d.lower() if isinstance(d, str) else ''
    tokens = nltk.word_tokenize(d)

    contains_stopword = not name_stopwords.isdisjoint(tokens)
    # A name is usable when only letters remain after removing separators/punctuation.
    is_alphabetic = re.sub(r'[\.\s\'\_\(\)\-\|\/\@]', '', d).isalpha()

    if is_alphabetic and not contains_stopword:
        arr_name.append(d)

        count_pria = 0
        count_wanita = 0

        for tok in tokens:
            # A token found in both dictionaries is counted as male only.
            if tok in male_tokens:
                count_pria += 1
            elif tok in female_tokens:
                count_wanita += 1
        dict_name.append([count_pria, count_wanita])

    else:
        arr_name.append('')
        dict_name.append([0, 0])

print(np.array(arr_name) , np.array(dict_name))

['ridwan kamil'] [[2 0]]


In [9]:
# vectorization

# Vectorize the cleaned names and densify the result.
ia = name_feat_ia_vectorizer.transform(arr_name).toarray()
print(ia)

# concatenate with name dict feature
# (the male/female dictionary counts become the last two columns)
ia_with_dict = np.concatenate((ia, np.asarray(dict_name)), axis=1)
print(ia_with_dict)

[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 2 0]]


## Username

In [10]:
# vectorization

# Strip every run of digits from the usernames before vectorizing them.
# str() guards against usernames made only of digits, which pandas parses as
# numbers when reading the spreadsheet and which re.sub would reject.
digit_run = re.compile(r'\d+')
arr_ids = np.array([digit_run.sub('', str(d)) for d in df_prof['username']])
username_feat_char = get_vec(username_feat_char_vectorizer, arr_ids)

print(username_feat_char)

[[0 0 0 ... 0 0 0]]


## Network

In [15]:
# Fraction of the collected tweets that were retweeted at least once.
was_retweeted = df_tweet['retweets_count'] > 0
retweets_count = len(df_tweet[was_retweeted]) / len(df_tweet)

# Profile counters followed by the retweet ratio, flattened into a single row.
profile_counts = [
    df_prof['followers'],
    df_prof['friends_count'],
    df_prof['listed_count'],
    df_prof['favourites_count'],
]
network_feat_abl_likes = [np.hstack(profile_counts + [retweets_count])]
print(network_feat_abl_likes)

[array([4.974848e+06, 2.842000e+03, 1.547000e+03, 2.203700e+04,
       1.000000e+00])]


## Behavior

In [11]:
# extraction

n_tweets = df_tweet.shape[0]

# Fractions of tweets that carry at least one hashtag / photo / a video.
hashtags = len(df_tweet[df_tweet['hashtags'] != '[]']) / n_tweets
photos = len(df_tweet[df_tweet['photos'] != '[]']) / n_tweets
video = len(df_tweet[df_tweet['video'] == 1]) / n_tweets

# Mean number of distinct tweets per calendar date that has tweets.
tweets_per_date = df_tweet.groupby('date')['tweet'].nunique()
days = tweets_per_date.mean()

behavior_feat_abl_mention = [np.hstack((hashtags, photos, video, days))]
print(behavior_feat_abl_mention)

[array([0.41666667, 0.41666667, 0.83333333, 2.85714286])]


## Socio

In [28]:
# extraction

# Running totals over all tweets.
word_count_sum = 0
char_count_sum = 0
repeated_alphabets_sum = 0
ellipses_sum = 0
exclamation_sum = 0
upper_words_sum = 0
capitalized_words_sum = 0
emoticon_sum = 0
adjective_sum = 0
noun_sum = 0

# Patterns: the same letter three times in a row, two dots in a row,
# two exclamation marks in a row.
repeated_letter_re = re.compile(r'([A-Za-z])\1\1')
double_dot_re = re.compile(r'([.])\1')
double_bang_re = re.compile(r'([!])\1')

for tweet_text in df_tweet['tweet']:
    words = nltk.word_tokenize(tweet_text)

    # Length statistics.
    word_count_sum += len(words)
    char_count_sum += len(tweet_text)

    # Character-pattern statistics.
    repeated_alphabets_sum += len(repeated_letter_re.findall(tweet_text))
    ellipses_sum += len(double_dot_re.findall(tweet_text))
    exclamation_sum += len(double_bang_re.findall(tweet_text))

    # Casing statistics per token.
    upper_words_sum += sum(1 for word in words if word.isupper())
    capitalized_words_sum += sum(1 for word in words if word.istitle())

    # Occurrences of every known emoticon in the raw tweet text.
    emoticon_sum += sum(tweet_text.count(emo) for emo in emoticons)

    # POS-tag the tweet as one sentence and count adjective / noun tags.
    pos_count = Counter(
        label
        for tagged_sentence in ct.tag_sents([words])
        for _, label in tagged_sentence
    )
    adjective_sum += pos_count['JJ']
    noun_sum += pos_count['NN']

socio_feat_abl_verb_count = [np.hstack((
    word_count_sum,
    char_count_sum,
    repeated_alphabets_sum,
    ellipses_sum,
    exclamation_sum,
    upper_words_sum,
    capitalized_words_sum,
    emoticon_sum,
    adjective_sum,
    noun_sum,
))]
print(socio_feat_abl_verb_count)

[array([ 2503, 15250,     0,     0,     0,    39,   437,     0,    68,
         596])]


## Tweet

In [13]:
# extraction

# All tweets are merged into one document, tokenized, lower-cased and
# stripped of stopwords; the result is a single-element list of text.
joined_tweets = ' '.join(df_tweet['tweet'])
lowered_tokens = (tok.lower() for tok in nltk.word_tokenize(joined_tweets))
tokens_stop = [tok for tok in lowered_tokens if tok not in stopword_set]

tweet_feat_bow_stop = [' '.join(tokens_stop)]

print(tweet_feat_bow_stop)

["semoga qurban pengorbanan diterima allah swt pahala menyelamatkan kelak . haturkan mohon maaf lahir bathin . hatur nuhun . https : //t.co/bf9obneb2p alhamdulillah , proses badal haji menghajikan nama eril tunaikan . niat proses wajib rukun sunnahnya . semoga haji mabrur . selamat idul adha 1443 h , haturkan umat islam merayakan . mohon maaf lahir batin . https : //t.co/cnk7ka0xqy suasana indonesia tanah suci . mengecek kondisi jamaah , serasa kampung , mekkah berjumpa dirindukan melebihi istri the one and only : bala-bala . # jabarjuara # indonesiajuara https : //t.co/i9vec88cjl insya allah , pulang tanah air predikat hajjah menyempurnakan rukun islam ke-5 nya . bersyukurlah keberkahan . bersabarlah sat kemusibahan . berikhtiarlah mengejar tujuan berdoalah perlindungan . https : //t.co/5s37tpnuo6 untungnya , islam kemudahan rukhsah berhalangan . tim haji jawa barat mengantisipasi hal-hal , personel pengganti winingsih syariat badal haji dihajikan . menyemangati winingsih , salah jema

In [14]:
# vectorization

# Transform the stopword-filtered document with the loaded vectorizer and densify it.
tweet_bow_matrix = tweet_feat_bow_stop_vectorizer.transform(tweet_feat_bow_stop)
tweet_feat_bow_stop_vec = tweet_bow_matrix.toarray()

print(tweet_feat_bow_stop_vec)

[[2 1 0 ... 0 0 0]]


# Main

In [30]:
path = '../model/'

# Load the trained classifier; the `with` block closes the file handle that
# `pickle.load(open(...))` used to leave open.
# NOTE: unpickling can execute arbitrary code; only load trusted model files.
with open(path + 'Ablation bio_Gradient Boosting_train_test_split.sav', 'rb') as model_file:
    model = pickle.load(model_file)

# Column-wise concatenation of every feature group, in this fixed order.
X = np.hstack((color_feat, ia_with_dict, username_feat_char, network_feat_abl_likes, behavior_feat_abl_mention, socio_feat_abl_verb_count, tweet_feat_bow_stop_vec))
model.predict(X)

array(['pria'], dtype='<U6')