# 03. Feature Engineering

In [1]:
import pickle
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

In [2]:
data_path = '../02_text_preprocessing/02_pickle/02_clean_data.pickle'
with open(data_path, 'rb') as data_path:
    data = pickle.load(data_path)

data.head()

Unnamed: 0,clean_tickets,OPD
0,mohon informasi kk e ktp daftar dukcapil kk da...,DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA M...
1,warga lurah karangbesuki kerja bekas juli istr...,DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA M...
2,selamat sore warga kel pisang candi kec sukun ...,DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA M...
3,mohon informasi blanko e ktp pakai surat teran...,DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA M...
4,rubah akte salah tulis anak tulis anak minggu ...,DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA M...


## Label encoding

In [3]:
category_codes = {
    'DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA MALANG': 0,
    'DINAS PEKERJAAN UMUM DAN PENATAAN RUANG KOTA MALANG': 1,
    'DINAS LINGKUNGAN HIDUP KOTA MALANG': 2,
    'DINAS PENDIDIKAN KOTA MALANG': 3,
    'DINAS PERUMAHAN DAN KAWASAN PERMUKIMAN KOTA MALANG': 4,
    'SATUAN POLISI PAMONG PRAJA KOTA MALANG': 5,
    'DINAS PERHUBUNGAN KOTA MALANG': 6
}

data['OPD_label'] = LabelEncoder().fit_transform(data['OPD'])
data.head()

Unnamed: 0,clean_tickets,OPD,OPD_label
0,mohon informasi kk e ktp daftar dukcapil kk da...,DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA M...,0
1,warga lurah karangbesuki kerja bekas juli istr...,DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA M...,0
2,selamat sore warga kel pisang candi kec sukun ...,DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA M...,0
3,mohon informasi blanko e ktp pakai surat teran...,DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA M...,0
4,rubah akte salah tulis anak tulis anak minggu ...,DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA M...,0


## Split train-test data

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data['clean_tickets'],
                                                    data['OPD_label'], 
                                                    test_size=0.25, 
                                                    random_state=8)

## Text representation: TF-IDF

In [5]:
# TF-IDF parameter
ngram_range = (1,2)
min_df = 1
max_df = 1.0
max_features = 300

tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(150, 300)
(50, 300)


In [6]:
for Product, category_id in sorted(category_codes.items()):
    features_chi2 = chi2(features_train, labels_train == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

# 'DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KOTA MALANG' category:
  . Most correlated unigrams:
. layan
. akta
. urus
. dispenduk
. ektp
  . Most correlated bigrams:
. dispenduk kota
. akta lahir

# 'DINAS LINGKUNGAN HIDUP KOTA MALANG' category:
  . Most correlated unigrams:
. jalan
. kondisi
. rusak
. aspal
. lubang
  . Most correlated bigrams:
. rusak parah
. lubang jalan

# 'DINAS PEKERJAAN UMUM DAN PENATAAN RUANG KOTA MALANG' category:
  . Most correlated unigrams:
. limbah
. sengat
. buang
. bau
. sampah
  . Most correlated bigrams:
. bakar sampah
. bau sengat

# 'DINAS PENDIDIKAN KOTA MALANG' category:
  . Most correlated unigrams:
. sma
. smp
. ppdb
. siswa
. sekolah
  . Most correlated bigrams:
. orang tua
. terima siswa

# 'DINAS PERHUBUNGAN KOTA MALANG' category:
  . Most correlated unigrams:
. bangun
. ptp
. izin
. petas
. razia
  . Most correlated bigrams:
. jam malam
. tolong razia

# 'DINAS PERUMAHAN DAN KAWASAN PERMUKIMAN KOTA MALANG' category:
  . Most correlated unigra

In [7]:
with open('03_pickle/03_data.pickle', 'wb') as output:
    pickle.dump(data, output)

with open('03_pickle/03_X_train.pickle', 'wb') as output:
    pickle.dump(X_train, output)

with open('03_pickle/03_X_test.pickle', 'wb') as output:
    pickle.dump(X_test, output)

with open('03_pickle/03_y_train.pickle', 'wb') as output:
    pickle.dump(y_train, output)

with open('03_pickle/03_y_test.pickle', 'wb') as output:
    pickle.dump(y_test, output)

with open('03_pickle/03_features_train.pickle', 'wb') as output:
    pickle.dump(features_train, output)

with open('03_pickle/03_labels_train.pickle', 'wb') as output:
    pickle.dump(labels_train, output)

with open('03_pickle/03_features_test.pickle', 'wb') as output:
    pickle.dump(features_test, output)

with open('03_pickle/03_labels_test.pickle', 'wb') as output:
    pickle.dump(labels_test, output)

with open('03_pickle/03_tfidf.pickle', 'wb') as output:
    pickle.dump(tfidf, output)